diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,200034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6199825009939095, + "eval_steps": 500, + "global_step": 200000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.3426266074180604, + "epoch": 3.099912504969547e-05, + "grad_norm": 15.919363975524902, + "learning_rate": 4.359197907585005e-09, + "loss": 0.904, + "mean_token_accuracy": 0.7392466813325882, + "num_tokens": 11974.0, + "step": 10 + }, + { + "entropy": 2.357245463132858, + "epoch": 6.199825009939094e-05, + "grad_norm": 17.161699295043945, + "learning_rate": 9.202751138235009e-09, + "loss": 0.9299, + "mean_token_accuracy": 0.7459014117717743, + "num_tokens": 23117.0, + "step": 20 + }, + { + "entropy": 2.2194840207695963, + "epoch": 9.299737514908641e-05, + "grad_norm": 5.266523361206055, + "learning_rate": 1.4046304368885014e-08, + "loss": 0.7805, + "mean_token_accuracy": 0.7642201945185662, + "num_tokens": 36481.0, + "step": 30 + }, + { + "entropy": 2.316211926937103, + "epoch": 0.00012399650019878189, + "grad_norm": 17.800691604614258, + "learning_rate": 1.8889857599535022e-08, + "loss": 0.901, + "mean_token_accuracy": 0.756186394393444, + "num_tokens": 48700.0, + "step": 40 + }, + { + "entropy": 2.3188992157578467, + "epoch": 0.00015499562524847737, + "grad_norm": 16.84282875061035, + "learning_rate": 2.3733410830185025e-08, + "loss": 0.8694, + "mean_token_accuracy": 0.7459902837872505, + "num_tokens": 60860.0, + "step": 50 + }, + { + "entropy": 2.334391838312149, + "epoch": 0.00018599475029817283, + "grad_norm": 17.42536735534668, + "learning_rate": 2.857696406083503e-08, + "loss": 0.9114, + "mean_token_accuracy": 0.744971951842308, + "num_tokens": 72100.0, + "step": 60 + }, + { + "entropy": 2.3774717271327974, + "epoch": 0.00021699387534786831, + "grad_norm": 13.821733474731445, + "learning_rate": 3.3420517291485034e-08, + "loss": 0.9343, + "mean_token_accuracy": 0.7515697747468948, + "num_tokens": 83144.0, + "step": 70 + }, + { + "entropy": 2.2282222718000413, + "epoch": 0.00024799300039756377, + "grad_norm": 16.813045501708984, + "learning_rate": 3.8264070522135044e-08, + "loss": 0.7473, + "mean_token_accuracy": 0.7650388479232788, + "num_tokens": 95878.0, + "step": 80 + }, + { + "entropy": 2.308245059847832, + "epoch": 0.00027899212544725923, + "grad_norm": 9.254444122314453, + "learning_rate": 4.310762375278505e-08, + "loss": 0.894, + "mean_token_accuracy": 0.7462251231074333, + "num_tokens": 107556.0, + "step": 90 + }, + { + "entropy": 2.1781405553221704, + "epoch": 0.00030999125049695474, + "grad_norm": 8.745959281921387, + "learning_rate": 4.795117698343506e-08, + "loss": 0.7802, + "mean_token_accuracy": 0.7630234107375145, + "num_tokens": 121234.0, + "step": 100 + }, + { + "entropy": 2.2521888092160225, + "epoch": 0.0003409903755466502, + "grad_norm": 9.263279914855957, + "learning_rate": 5.2794730214085053e-08, + "loss": 0.8459, + "mean_token_accuracy": 0.7538067430257798, + "num_tokens": 133561.0, + "step": 110 + }, + { + "entropy": 2.208202359080315, + "epoch": 0.00037198950059634566, + "grad_norm": 14.294042587280273, + "learning_rate": 5.763828344473506e-08, + "loss": 0.7828, + "mean_token_accuracy": 0.7738959670066834, + "num_tokens": 146427.0, + "step": 120 + }, + { + "entropy": 2.3306238532066343, + "epoch": 0.0004029886256460411, + "grad_norm": 18.315011978149414, + "learning_rate": 6.248183667538507e-08, + "loss": 0.935, + "mean_token_accuracy": 0.7471725791692734, + "num_tokens": 157320.0, + "step": 130 + }, + { + "entropy": 2.1904593795537948, + "epoch": 0.00043398775069573663, + "grad_norm": 9.24004077911377, + "learning_rate": 6.732538990603508e-08, + "loss": 0.7454, + "mean_token_accuracy": 0.7584904819726944, + "num_tokens": 170758.0, + "step": 140 + }, + { + "entropy": 2.2158657625317573, + "epoch": 0.0004649868757454321, + "grad_norm": 15.915936470031738, + "learning_rate": 7.216894313668508e-08, + "loss": 0.8056, + "mean_token_accuracy": 0.7633484750986099, + "num_tokens": 183462.0, + "step": 150 + }, + { + "entropy": 2.1613963916897774, + "epoch": 0.0004959860007951275, + "grad_norm": 16.494924545288086, + "learning_rate": 7.701249636733508e-08, + "loss": 0.7496, + "mean_token_accuracy": 0.7714725866913795, + "num_tokens": 196484.0, + "step": 160 + }, + { + "entropy": 2.2984951809048653, + "epoch": 0.0005269851258448231, + "grad_norm": 14.924416542053223, + "learning_rate": 8.18560495979851e-08, + "loss": 0.8421, + "mean_token_accuracy": 0.749389934539795, + "num_tokens": 208243.0, + "step": 170 + }, + { + "entropy": 2.2832500755786898, + "epoch": 0.0005579842508945185, + "grad_norm": 15.250982284545898, + "learning_rate": 8.66996028286351e-08, + "loss": 0.8231, + "mean_token_accuracy": 0.7653956174850464, + "num_tokens": 219830.0, + "step": 180 + }, + { + "entropy": 2.3017154157161714, + "epoch": 0.000588983375944214, + "grad_norm": 9.919282913208008, + "learning_rate": 9.154315605928509e-08, + "loss": 0.8631, + "mean_token_accuracy": 0.7567846044898033, + "num_tokens": 231756.0, + "step": 190 + }, + { + "entropy": 2.2674851924180985, + "epoch": 0.0006199825009939095, + "grad_norm": 15.856183052062988, + "learning_rate": 9.63867092899351e-08, + "loss": 0.8038, + "mean_token_accuracy": 0.7690823331475258, + "num_tokens": 244149.0, + "step": 200 + }, + { + "entropy": 2.2884296044707297, + "epoch": 0.0006509816260436049, + "grad_norm": 15.011128425598145, + "learning_rate": 1.0123026252058511e-07, + "loss": 0.8224, + "mean_token_accuracy": 0.7501513630151748, + "num_tokens": 256276.0, + "step": 210 + }, + { + "entropy": 2.3350342929363253, + "epoch": 0.0006819807510933004, + "grad_norm": 17.258758544921875, + "learning_rate": 1.0607381575123511e-07, + "loss": 0.904, + "mean_token_accuracy": 0.7543755397200584, + "num_tokens": 267504.0, + "step": 220 + }, + { + "entropy": 2.2519437611103057, + "epoch": 0.0007129798761429959, + "grad_norm": 13.8502197265625, + "learning_rate": 1.1091736898188512e-07, + "loss": 0.8167, + "mean_token_accuracy": 0.7757848709821701, + "num_tokens": 278715.0, + "step": 230 + }, + { + "entropy": 2.2798040971159934, + "epoch": 0.0007439790011926913, + "grad_norm": 18.540184020996094, + "learning_rate": 1.1576092221253512e-07, + "loss": 0.9195, + "mean_token_accuracy": 0.7466915145516395, + "num_tokens": 290352.0, + "step": 240 + }, + { + "entropy": 2.1401579171419143, + "epoch": 0.0007749781262423868, + "grad_norm": 13.905881881713867, + "learning_rate": 1.2060447544318512e-07, + "loss": 0.7208, + "mean_token_accuracy": 0.7657153263688088, + "num_tokens": 304153.0, + "step": 250 + }, + { + "entropy": 2.27926591783762, + "epoch": 0.0008059772512920822, + "grad_norm": 15.406006813049316, + "learning_rate": 1.2544802867383514e-07, + "loss": 0.8348, + "mean_token_accuracy": 0.7559728816151619, + "num_tokens": 316234.0, + "step": 260 + }, + { + "entropy": 2.285950776934624, + "epoch": 0.0008369763763417777, + "grad_norm": 7.619900703430176, + "learning_rate": 1.3029158190448516e-07, + "loss": 0.7548, + "mean_token_accuracy": 0.7609011575579643, + "num_tokens": 329019.0, + "step": 270 + }, + { + "entropy": 2.331369215250015, + "epoch": 0.0008679755013914733, + "grad_norm": 21.633079528808594, + "learning_rate": 1.3513513513513515e-07, + "loss": 0.8983, + "mean_token_accuracy": 0.7470938101410866, + "num_tokens": 340699.0, + "step": 280 + }, + { + "entropy": 2.3679069608449934, + "epoch": 0.0008989746264411687, + "grad_norm": 15.301081657409668, + "learning_rate": 1.3997868836578514e-07, + "loss": 0.8797, + "mean_token_accuracy": 0.7614053919911384, + "num_tokens": 351539.0, + "step": 290 + }, + { + "entropy": 2.349914161860943, + "epoch": 0.0009299737514908642, + "grad_norm": 13.113327026367188, + "learning_rate": 1.4482224159643515e-07, + "loss": 0.8534, + "mean_token_accuracy": 0.7527757495641708, + "num_tokens": 362660.0, + "step": 300 + }, + { + "entropy": 2.274288383126259, + "epoch": 0.0009609728765405597, + "grad_norm": 16.123380661010742, + "learning_rate": 1.4966579482708517e-07, + "loss": 0.8189, + "mean_token_accuracy": 0.7668241068720818, + "num_tokens": 374530.0, + "step": 310 + }, + { + "entropy": 2.2432350262999536, + "epoch": 0.000991972001590255, + "grad_norm": 9.562804222106934, + "learning_rate": 1.5450934805773516e-07, + "loss": 0.8106, + "mean_token_accuracy": 0.7698193415999413, + "num_tokens": 387392.0, + "step": 320 + }, + { + "entropy": 2.311431211233139, + "epoch": 0.0010229711266399506, + "grad_norm": 16.730819702148438, + "learning_rate": 1.593529012883852e-07, + "loss": 0.8178, + "mean_token_accuracy": 0.7644891291856766, + "num_tokens": 399180.0, + "step": 330 + }, + { + "entropy": 2.349755284190178, + "epoch": 0.0010539702516896461, + "grad_norm": 16.725875854492188, + "learning_rate": 1.641964545190352e-07, + "loss": 0.8588, + "mean_token_accuracy": 0.7604990780353547, + "num_tokens": 410702.0, + "step": 340 + }, + { + "entropy": 2.279722733795643, + "epoch": 0.0010849693767393416, + "grad_norm": 14.276802062988281, + "learning_rate": 1.6904000774968518e-07, + "loss": 0.7989, + "mean_token_accuracy": 0.7695920899510383, + "num_tokens": 422591.0, + "step": 350 + }, + { + "entropy": 2.1071667522192, + "epoch": 0.001115968501789037, + "grad_norm": 12.381896018981934, + "learning_rate": 1.7388356098033517e-07, + "loss": 0.6475, + "mean_token_accuracy": 0.7926038712263107, + "num_tokens": 436846.0, + "step": 360 + }, + { + "entropy": 2.3276678711175918, + "epoch": 0.0011469676268387324, + "grad_norm": 17.90573501586914, + "learning_rate": 1.7872711421098519e-07, + "loss": 0.8036, + "mean_token_accuracy": 0.7576885908842087, + "num_tokens": 448403.0, + "step": 370 + }, + { + "entropy": 2.2606518104672433, + "epoch": 0.001177966751888428, + "grad_norm": 12.439233779907227, + "learning_rate": 1.8357066744163518e-07, + "loss": 0.7721, + "mean_token_accuracy": 0.7698314875364304, + "num_tokens": 460244.0, + "step": 380 + }, + { + "entropy": 2.338596761226654, + "epoch": 0.0012089658769381235, + "grad_norm": 14.934176445007324, + "learning_rate": 1.884142206722852e-07, + "loss": 0.8393, + "mean_token_accuracy": 0.7656854018568993, + "num_tokens": 470678.0, + "step": 390 + }, + { + "entropy": 2.332528692483902, + "epoch": 0.001239965001987819, + "grad_norm": 16.811922073364258, + "learning_rate": 1.932577739029352e-07, + "loss": 0.8758, + "mean_token_accuracy": 0.7543434649705887, + "num_tokens": 482035.0, + "step": 400 + }, + { + "entropy": 2.282673454284668, + "epoch": 0.0012709641270375143, + "grad_norm": 15.464518547058105, + "learning_rate": 1.981013271335852e-07, + "loss": 0.8126, + "mean_token_accuracy": 0.7647529050707818, + "num_tokens": 494000.0, + "step": 410 + }, + { + "entropy": 2.306594988703728, + "epoch": 0.0013019632520872098, + "grad_norm": 16.271461486816406, + "learning_rate": 2.0294488036423522e-07, + "loss": 0.8339, + "mean_token_accuracy": 0.755289240181446, + "num_tokens": 505996.0, + "step": 420 + }, + { + "entropy": 2.3733946830034256, + "epoch": 0.0013329623771369053, + "grad_norm": 16.00725746154785, + "learning_rate": 2.0778843359488523e-07, + "loss": 0.8955, + "mean_token_accuracy": 0.7515278026461601, + "num_tokens": 517285.0, + "step": 430 + }, + { + "entropy": 2.2198241636157037, + "epoch": 0.0013639615021866008, + "grad_norm": 15.578631401062012, + "learning_rate": 2.1263198682553522e-07, + "loss": 0.746, + "mean_token_accuracy": 0.7685888081789016, + "num_tokens": 529581.0, + "step": 440 + }, + { + "entropy": 2.352111691981554, + "epoch": 0.0013949606272362963, + "grad_norm": 14.224849700927734, + "learning_rate": 2.1747554005618524e-07, + "loss": 0.859, + "mean_token_accuracy": 0.7565658301115036, + "num_tokens": 541039.0, + "step": 450 + }, + { + "entropy": 2.17586030960083, + "epoch": 0.0014259597522859918, + "grad_norm": 15.60697078704834, + "learning_rate": 2.2231909328683525e-07, + "loss": 0.7372, + "mean_token_accuracy": 0.775518947839737, + "num_tokens": 554099.0, + "step": 460 + }, + { + "entropy": 2.3119050413370132, + "epoch": 0.0014569588773356871, + "grad_norm": 14.367731094360352, + "learning_rate": 2.2716264651748524e-07, + "loss": 0.8381, + "mean_token_accuracy": 0.7697366833686828, + "num_tokens": 564893.0, + "step": 470 + }, + { + "entropy": 2.2108234629034995, + "epoch": 0.0014879580023853826, + "grad_norm": 17.166481018066406, + "learning_rate": 2.3200619974813526e-07, + "loss": 0.7081, + "mean_token_accuracy": 0.7743594750761986, + "num_tokens": 577998.0, + "step": 480 + }, + { + "entropy": 2.2766464233398436, + "epoch": 0.0015189571274350781, + "grad_norm": 16.3574275970459, + "learning_rate": 2.3684975297878528e-07, + "loss": 0.7699, + "mean_token_accuracy": 0.7645501315593719, + "num_tokens": 589715.0, + "step": 490 + }, + { + "entropy": 2.264049357175827, + "epoch": 0.0015499562524847737, + "grad_norm": 6.618349075317383, + "learning_rate": 2.4169330620943527e-07, + "loss": 0.78, + "mean_token_accuracy": 0.7658553898334504, + "num_tokens": 602659.0, + "step": 500 + }, + { + "entropy": 2.268606995046139, + "epoch": 0.0015809553775344692, + "grad_norm": 15.35413646697998, + "learning_rate": 2.4653685944008526e-07, + "loss": 0.8016, + "mean_token_accuracy": 0.7688265934586525, + "num_tokens": 615422.0, + "step": 510 + }, + { + "entropy": 2.181258149445057, + "epoch": 0.0016119545025841645, + "grad_norm": 16.496660232543945, + "learning_rate": 2.513804126707353e-07, + "loss": 0.6568, + "mean_token_accuracy": 0.7991862148046494, + "num_tokens": 627480.0, + "step": 520 + }, + { + "entropy": 2.207347111403942, + "epoch": 0.00164295362763386, + "grad_norm": 15.786961555480957, + "learning_rate": 2.562239659013853e-07, + "loss": 0.7347, + "mean_token_accuracy": 0.7753358393907547, + "num_tokens": 640090.0, + "step": 530 + }, + { + "entropy": 2.328162580728531, + "epoch": 0.0016739527526835555, + "grad_norm": 16.40909767150879, + "learning_rate": 2.610675191320353e-07, + "loss": 0.8802, + "mean_token_accuracy": 0.7577004536986351, + "num_tokens": 651238.0, + "step": 540 + }, + { + "entropy": 2.285073181986809, + "epoch": 0.001704951877733251, + "grad_norm": 15.64692211151123, + "learning_rate": 2.659110723626853e-07, + "loss": 0.808, + "mean_token_accuracy": 0.7738980025053024, + "num_tokens": 663483.0, + "step": 550 + }, + { + "entropy": 2.221916152536869, + "epoch": 0.0017359510027829465, + "grad_norm": 16.929515838623047, + "learning_rate": 2.7075462559333526e-07, + "loss": 0.7659, + "mean_token_accuracy": 0.7734002724289895, + "num_tokens": 676094.0, + "step": 560 + }, + { + "entropy": 2.207345911860466, + "epoch": 0.001766950127832642, + "grad_norm": 16.074033737182617, + "learning_rate": 2.755981788239853e-07, + "loss": 0.6923, + "mean_token_accuracy": 0.7804467469453812, + "num_tokens": 688999.0, + "step": 570 + }, + { + "entropy": 2.3080645158886908, + "epoch": 0.0017979492528823373, + "grad_norm": 14.152597427368164, + "learning_rate": 2.804417320546353e-07, + "loss": 0.8044, + "mean_token_accuracy": 0.7673657357692718, + "num_tokens": 700787.0, + "step": 580 + }, + { + "entropy": 2.3365137010812758, + "epoch": 0.0018289483779320328, + "grad_norm": 8.505473136901855, + "learning_rate": 2.852852852852853e-07, + "loss": 0.8693, + "mean_token_accuracy": 0.7561125203967094, + "num_tokens": 712382.0, + "step": 590 + }, + { + "entropy": 2.26595401763916, + "epoch": 0.0018599475029817283, + "grad_norm": 15.482884407043457, + "learning_rate": 2.901288385159353e-07, + "loss": 0.7499, + "mean_token_accuracy": 0.7823521599173546, + "num_tokens": 724324.0, + "step": 600 + }, + { + "entropy": 2.2956106424331666, + "epoch": 0.0018909466280314239, + "grad_norm": 17.781259536743164, + "learning_rate": 2.949723917465853e-07, + "loss": 0.8367, + "mean_token_accuracy": 0.7637320205569267, + "num_tokens": 736090.0, + "step": 610 + }, + { + "entropy": 2.380499321222305, + "epoch": 0.0019219457530811194, + "grad_norm": 13.911691665649414, + "learning_rate": 2.998159449772353e-07, + "loss": 0.8561, + "mean_token_accuracy": 0.7514171093702317, + "num_tokens": 746693.0, + "step": 620 + }, + { + "entropy": 2.3080674216151236, + "epoch": 0.0019529448781308147, + "grad_norm": 18.39996337890625, + "learning_rate": 3.0465949820788535e-07, + "loss": 0.7974, + "mean_token_accuracy": 0.7719434529542923, + "num_tokens": 758380.0, + "step": 630 + }, + { + "entropy": 2.33802735209465, + "epoch": 0.00198394400318051, + "grad_norm": 14.702959060668945, + "learning_rate": 3.0950305143853534e-07, + "loss": 0.8612, + "mean_token_accuracy": 0.749081814289093, + "num_tokens": 770107.0, + "step": 640 + }, + { + "entropy": 2.3233169302344323, + "epoch": 0.002014943128230206, + "grad_norm": 15.600394248962402, + "learning_rate": 3.143466046691854e-07, + "loss": 0.8358, + "mean_token_accuracy": 0.7564076974987983, + "num_tokens": 781888.0, + "step": 650 + }, + { + "entropy": 2.3429936945438383, + "epoch": 0.002045942253279901, + "grad_norm": 15.831826210021973, + "learning_rate": 3.1919015789983537e-07, + "loss": 0.8509, + "mean_token_accuracy": 0.7663177505135537, + "num_tokens": 792407.0, + "step": 660 + }, + { + "entropy": 2.2723167344927786, + "epoch": 0.0020769413783295965, + "grad_norm": 14.536561012268066, + "learning_rate": 3.240337111304853e-07, + "loss": 0.7932, + "mean_token_accuracy": 0.7709016263484955, + "num_tokens": 804473.0, + "step": 670 + }, + { + "entropy": 2.317927873134613, + "epoch": 0.0021079405033792922, + "grad_norm": 15.074233055114746, + "learning_rate": 3.2887726436113535e-07, + "loss": 0.7876, + "mean_token_accuracy": 0.7756809115409851, + "num_tokens": 815615.0, + "step": 680 + }, + { + "entropy": 2.279827579855919, + "epoch": 0.0021389396284289875, + "grad_norm": 15.745168685913086, + "learning_rate": 3.3372081759178534e-07, + "loss": 0.8729, + "mean_token_accuracy": 0.7602526545524597, + "num_tokens": 827866.0, + "step": 690 + }, + { + "entropy": 2.2185587152838706, + "epoch": 0.0021699387534786833, + "grad_norm": 13.561483383178711, + "learning_rate": 3.385643708224354e-07, + "loss": 0.748, + "mean_token_accuracy": 0.779901297390461, + "num_tokens": 839797.0, + "step": 700 + }, + { + "entropy": 2.234003722667694, + "epoch": 0.0022009378785283785, + "grad_norm": 13.80466079711914, + "learning_rate": 3.4340792405308537e-07, + "loss": 0.7714, + "mean_token_accuracy": 0.7791284009814262, + "num_tokens": 851460.0, + "step": 710 + }, + { + "entropy": 2.242032915353775, + "epoch": 0.002231937003578074, + "grad_norm": 8.220836639404297, + "learning_rate": 3.482514772837354e-07, + "loss": 0.7225, + "mean_token_accuracy": 0.7788921162486077, + "num_tokens": 863582.0, + "step": 720 + }, + { + "entropy": 2.2308412209153174, + "epoch": 0.0022629361286277696, + "grad_norm": 15.163178443908691, + "learning_rate": 3.5309503051438535e-07, + "loss": 0.7775, + "mean_token_accuracy": 0.771105645596981, + "num_tokens": 875961.0, + "step": 730 + }, + { + "entropy": 2.1884254679083823, + "epoch": 0.002293935253677465, + "grad_norm": 7.352316856384277, + "learning_rate": 3.579385837450354e-07, + "loss": 0.7359, + "mean_token_accuracy": 0.7716657683253288, + "num_tokens": 887838.0, + "step": 740 + }, + { + "entropy": 2.2045844838023188, + "epoch": 0.0023249343787271606, + "grad_norm": 14.715704917907715, + "learning_rate": 3.627821369756854e-07, + "loss": 0.7344, + "mean_token_accuracy": 0.7731250256299973, + "num_tokens": 900117.0, + "step": 750 + }, + { + "entropy": 2.261673641204834, + "epoch": 0.002355933503776856, + "grad_norm": 14.88746166229248, + "learning_rate": 3.676256902063354e-07, + "loss": 0.7627, + "mean_token_accuracy": 0.7688748240470886, + "num_tokens": 912552.0, + "step": 760 + }, + { + "entropy": 2.318584197759628, + "epoch": 0.002386932628826551, + "grad_norm": 15.337506294250488, + "learning_rate": 3.7246924343698536e-07, + "loss": 0.8543, + "mean_token_accuracy": 0.7685193121433258, + "num_tokens": 923351.0, + "step": 770 + }, + { + "entropy": 2.198628255724907, + "epoch": 0.002417931753876247, + "grad_norm": 16.160091400146484, + "learning_rate": 3.773127966676354e-07, + "loss": 0.7398, + "mean_token_accuracy": 0.7804590031504631, + "num_tokens": 935777.0, + "step": 780 + }, + { + "entropy": 2.291598491370678, + "epoch": 0.002448930878925942, + "grad_norm": 7.9013800621032715, + "learning_rate": 3.821563498982854e-07, + "loss": 0.7758, + "mean_token_accuracy": 0.7750082641839982, + "num_tokens": 947492.0, + "step": 790 + }, + { + "entropy": 2.3038537070155143, + "epoch": 0.002479930003975638, + "grad_norm": 16.498620986938477, + "learning_rate": 3.8699990312893544e-07, + "loss": 0.7678, + "mean_token_accuracy": 0.7833232745528221, + "num_tokens": 958770.0, + "step": 800 + }, + { + "entropy": 2.2682218730449675, + "epoch": 0.0025109291290253332, + "grad_norm": 15.202181816101074, + "learning_rate": 3.9184345635958543e-07, + "loss": 0.8518, + "mean_token_accuracy": 0.7557914197444916, + "num_tokens": 970777.0, + "step": 810 + }, + { + "entropy": 2.199041871726513, + "epoch": 0.0025419282540750285, + "grad_norm": 13.799867630004883, + "learning_rate": 3.9668700959023547e-07, + "loss": 0.7567, + "mean_token_accuracy": 0.7677057057619094, + "num_tokens": 982601.0, + "step": 820 + }, + { + "entropy": 2.2466016352176665, + "epoch": 0.0025729273791247243, + "grad_norm": 14.757229804992676, + "learning_rate": 4.015305628208854e-07, + "loss": 0.776, + "mean_token_accuracy": 0.7803550854325294, + "num_tokens": 994451.0, + "step": 830 + }, + { + "entropy": 2.304102659225464, + "epoch": 0.0026039265041744196, + "grad_norm": 16.70046615600586, + "learning_rate": 4.0637411605153545e-07, + "loss": 0.8395, + "mean_token_accuracy": 0.7707262724637985, + "num_tokens": 1005368.0, + "step": 840 + }, + { + "entropy": 2.2025340750813482, + "epoch": 0.0026349256292241153, + "grad_norm": 13.70840835571289, + "learning_rate": 4.1121766928218544e-07, + "loss": 0.722, + "mean_token_accuracy": 0.7827607110142708, + "num_tokens": 1017690.0, + "step": 850 + }, + { + "entropy": 2.1886146038770677, + "epoch": 0.0026659247542738106, + "grad_norm": 13.6995849609375, + "learning_rate": 4.160612225128355e-07, + "loss": 0.6548, + "mean_token_accuracy": 0.7894467368721962, + "num_tokens": 1030737.0, + "step": 860 + }, + { + "entropy": 2.119298684597015, + "epoch": 0.002696923879323506, + "grad_norm": 16.397750854492188, + "learning_rate": 4.2090477574348547e-07, + "loss": 0.6458, + "mean_token_accuracy": 0.7952458307147026, + "num_tokens": 1044784.0, + "step": 870 + }, + { + "entropy": 2.243351861834526, + "epoch": 0.0027279230043732016, + "grad_norm": 14.533041954040527, + "learning_rate": 4.257483289741354e-07, + "loss": 0.7294, + "mean_token_accuracy": 0.7760630249977112, + "num_tokens": 1056942.0, + "step": 880 + }, + { + "entropy": 2.3225993037223818, + "epoch": 0.002758922129422897, + "grad_norm": 16.506406784057617, + "learning_rate": 4.3059188220478545e-07, + "loss": 0.8633, + "mean_token_accuracy": 0.7579209730029106, + "num_tokens": 1067838.0, + "step": 890 + }, + { + "entropy": 2.0741579085588455, + "epoch": 0.0027899212544725926, + "grad_norm": 11.532833099365234, + "learning_rate": 4.3543543543543544e-07, + "loss": 0.6632, + "mean_token_accuracy": 0.7945702284574508, + "num_tokens": 1082957.0, + "step": 900 + }, + { + "entropy": 2.2643200919032096, + "epoch": 0.002820920379522288, + "grad_norm": 15.212760925292969, + "learning_rate": 4.402789886660855e-07, + "loss": 0.8228, + "mean_token_accuracy": 0.76753601282835, + "num_tokens": 1093997.0, + "step": 910 + }, + { + "entropy": 2.2407288014888764, + "epoch": 0.0028519195045719837, + "grad_norm": 16.18030548095703, + "learning_rate": 4.451225418967355e-07, + "loss": 0.8187, + "mean_token_accuracy": 0.7693384140729904, + "num_tokens": 1106072.0, + "step": 920 + }, + { + "entropy": 2.219868388772011, + "epoch": 0.002882918629621679, + "grad_norm": 16.22972297668457, + "learning_rate": 4.499660951273855e-07, + "loss": 0.7272, + "mean_token_accuracy": 0.7875460609793663, + "num_tokens": 1118504.0, + "step": 930 + }, + { + "entropy": 2.300278900563717, + "epoch": 0.0029139177546713742, + "grad_norm": 16.145498275756836, + "learning_rate": 4.5480964835803545e-07, + "loss": 0.7712, + "mean_token_accuracy": 0.7815862789750099, + "num_tokens": 1129720.0, + "step": 940 + }, + { + "entropy": 2.3173892736434936, + "epoch": 0.00294491687972107, + "grad_norm": 14.980684280395508, + "learning_rate": 4.596532015886855e-07, + "loss": 0.9265, + "mean_token_accuracy": 0.7435617208480835, + "num_tokens": 1140306.0, + "step": 950 + }, + { + "entropy": 2.172358636558056, + "epoch": 0.0029759160047707653, + "grad_norm": 14.839875221252441, + "learning_rate": 4.644967548193355e-07, + "loss": 0.7082, + "mean_token_accuracy": 0.7885784819722176, + "num_tokens": 1152921.0, + "step": 960 + }, + { + "entropy": 2.2328238427639007, + "epoch": 0.003006915129820461, + "grad_norm": 13.23880386352539, + "learning_rate": 4.6934030804998553e-07, + "loss": 0.7795, + "mean_token_accuracy": 0.7753711074590683, + "num_tokens": 1164657.0, + "step": 970 + }, + { + "entropy": 2.114644192159176, + "epoch": 0.0030379142548701563, + "grad_norm": 11.22083854675293, + "learning_rate": 4.741838612806355e-07, + "loss": 0.6744, + "mean_token_accuracy": 0.7775777190923691, + "num_tokens": 1177945.0, + "step": 980 + }, + { + "entropy": 2.1031323984265327, + "epoch": 0.0030689133799198516, + "grad_norm": 4.399860382080078, + "learning_rate": 4.790274145112855e-07, + "loss": 0.668, + "mean_token_accuracy": 0.7964637473225593, + "num_tokens": 1190989.0, + "step": 990 + }, + { + "entropy": 2.2336785733699798, + "epoch": 0.0030999125049695473, + "grad_norm": 15.71125602722168, + "learning_rate": 4.838709677419355e-07, + "loss": 0.772, + "mean_token_accuracy": 0.7712054967880249, + "num_tokens": 1201872.0, + "step": 1000 + }, + { + "entropy": 2.2714061349630357, + "epoch": 0.0031309116300192426, + "grad_norm": 15.550898551940918, + "learning_rate": 4.887145209725856e-07, + "loss": 0.795, + "mean_token_accuracy": 0.7752420023083687, + "num_tokens": 1213279.0, + "step": 1010 + }, + { + "entropy": 2.111206144094467, + "epoch": 0.0031619107550689383, + "grad_norm": 14.369123458862305, + "learning_rate": 4.935580742032355e-07, + "loss": 0.6963, + "mean_token_accuracy": 0.7931261286139488, + "num_tokens": 1226500.0, + "step": 1020 + }, + { + "entropy": 2.17624341994524, + "epoch": 0.0031929098801186336, + "grad_norm": 18.43202781677246, + "learning_rate": 4.984016274338856e-07, + "loss": 0.7139, + "mean_token_accuracy": 0.7850997045636177, + "num_tokens": 1238540.0, + "step": 1030 + }, + { + "entropy": 2.209955517947674, + "epoch": 0.003223909005168329, + "grad_norm": 16.401142120361328, + "learning_rate": 5.032451806645356e-07, + "loss": 0.7126, + "mean_token_accuracy": 0.7926564201712608, + "num_tokens": 1250753.0, + "step": 1040 + }, + { + "entropy": 2.267963781952858, + "epoch": 0.0032549081302180247, + "grad_norm": 15.670921325683594, + "learning_rate": 5.080887338951856e-07, + "loss": 0.8273, + "mean_token_accuracy": 0.7623932763934136, + "num_tokens": 1262037.0, + "step": 1050 + }, + { + "entropy": 1.9811606004834175, + "epoch": 0.00328590725526772, + "grad_norm": 14.190478324890137, + "learning_rate": 5.129322871258355e-07, + "loss": 0.5722, + "mean_token_accuracy": 0.7981273889541626, + "num_tokens": 1276993.0, + "step": 1060 + }, + { + "entropy": 2.1613825380802156, + "epoch": 0.0033169063803174157, + "grad_norm": 7.023738861083984, + "learning_rate": 5.177758403564856e-07, + "loss": 0.6904, + "mean_token_accuracy": 0.7866356909275055, + "num_tokens": 1289051.0, + "step": 1070 + }, + { + "entropy": 2.25160670876503, + "epoch": 0.003347905505367111, + "grad_norm": 14.642976760864258, + "learning_rate": 5.226193935871355e-07, + "loss": 0.8145, + "mean_token_accuracy": 0.7689608931541443, + "num_tokens": 1300620.0, + "step": 1080 + }, + { + "entropy": 2.2016865596175195, + "epoch": 0.0033789046304168063, + "grad_norm": 15.589518547058105, + "learning_rate": 5.274629468177855e-07, + "loss": 0.7511, + "mean_token_accuracy": 0.7763894140720368, + "num_tokens": 1312252.0, + "step": 1090 + }, + { + "entropy": 2.2270815491676332, + "epoch": 0.003409903755466502, + "grad_norm": 14.04956340789795, + "learning_rate": 5.323065000484356e-07, + "loss": 0.7445, + "mean_token_accuracy": 0.7833787277340889, + "num_tokens": 1323950.0, + "step": 1100 + }, + { + "entropy": 2.1960760056972504, + "epoch": 0.0034409028805161973, + "grad_norm": 17.72970199584961, + "learning_rate": 5.371500532790856e-07, + "loss": 0.7735, + "mean_token_accuracy": 0.7613553330302238, + "num_tokens": 1336895.0, + "step": 1110 + }, + { + "entropy": 2.0831907019019127, + "epoch": 0.003471902005565893, + "grad_norm": 10.339810371398926, + "learning_rate": 5.419936065097356e-07, + "loss": 0.6841, + "mean_token_accuracy": 0.7895815879106521, + "num_tokens": 1350813.0, + "step": 1120 + }, + { + "entropy": 2.1665103510022163, + "epoch": 0.0035029011306155883, + "grad_norm": 14.524060249328613, + "learning_rate": 5.468371597403856e-07, + "loss": 0.7598, + "mean_token_accuracy": 0.7754980430006981, + "num_tokens": 1362111.0, + "step": 1130 + }, + { + "entropy": 2.217361180484295, + "epoch": 0.003533900255665284, + "grad_norm": 14.30985164642334, + "learning_rate": 5.516807129710356e-07, + "loss": 0.7568, + "mean_token_accuracy": 0.782513102889061, + "num_tokens": 1373317.0, + "step": 1140 + }, + { + "entropy": 2.1852347582578657, + "epoch": 0.0035648993807149794, + "grad_norm": 16.35944938659668, + "learning_rate": 5.565242662016856e-07, + "loss": 0.7744, + "mean_token_accuracy": 0.7764955341815949, + "num_tokens": 1384737.0, + "step": 1150 + }, + { + "entropy": 2.2084068596363067, + "epoch": 0.0035958985057646746, + "grad_norm": 18.96138572692871, + "learning_rate": 5.613678194323357e-07, + "loss": 0.8437, + "mean_token_accuracy": 0.7699245184659957, + "num_tokens": 1396007.0, + "step": 1160 + }, + { + "entropy": 2.1020758375525475, + "epoch": 0.0036268976308143704, + "grad_norm": 7.519232749938965, + "learning_rate": 5.662113726629856e-07, + "loss": 0.6637, + "mean_token_accuracy": 0.7951419726014137, + "num_tokens": 1408380.0, + "step": 1170 + }, + { + "entropy": 2.157642234861851, + "epoch": 0.0036578967558640657, + "grad_norm": 15.029388427734375, + "learning_rate": 5.710549258936356e-07, + "loss": 0.7325, + "mean_token_accuracy": 0.785622188448906, + "num_tokens": 1420661.0, + "step": 1180 + }, + { + "entropy": 2.037250469624996, + "epoch": 0.0036888958809137614, + "grad_norm": 15.331890106201172, + "learning_rate": 5.758984791242856e-07, + "loss": 0.6885, + "mean_token_accuracy": 0.7827024027705193, + "num_tokens": 1434538.0, + "step": 1190 + }, + { + "entropy": 2.085458919405937, + "epoch": 0.0037198950059634567, + "grad_norm": 17.225831985473633, + "learning_rate": 5.807420323549356e-07, + "loss": 0.6795, + "mean_token_accuracy": 0.7806023344397545, + "num_tokens": 1448017.0, + "step": 1200 + }, + { + "entropy": 2.104422791302204, + "epoch": 0.003750894131013152, + "grad_norm": 16.510398864746094, + "learning_rate": 5.855855855855856e-07, + "loss": 0.7358, + "mean_token_accuracy": 0.7740836933255195, + "num_tokens": 1460208.0, + "step": 1210 + }, + { + "entropy": 2.1881393998861314, + "epoch": 0.0037818932560628477, + "grad_norm": 16.095951080322266, + "learning_rate": 5.904291388162357e-07, + "loss": 0.762, + "mean_token_accuracy": 0.7820860460400582, + "num_tokens": 1471627.0, + "step": 1220 + }, + { + "entropy": 2.092408487200737, + "epoch": 0.003812892381112543, + "grad_norm": 16.695289611816406, + "learning_rate": 5.952726920468856e-07, + "loss": 0.7157, + "mean_token_accuracy": 0.787506853044033, + "num_tokens": 1483004.0, + "step": 1230 + }, + { + "entropy": 2.0120772644877434, + "epoch": 0.0038438915061622387, + "grad_norm": 15.932051658630371, + "learning_rate": 6.001162452775357e-07, + "loss": 0.6887, + "mean_token_accuracy": 0.7973495230078698, + "num_tokens": 1496481.0, + "step": 1240 + }, + { + "entropy": 2.160550518333912, + "epoch": 0.003874890631211934, + "grad_norm": 15.765700340270996, + "learning_rate": 6.049597985081857e-07, + "loss": 0.7399, + "mean_token_accuracy": 0.7820422008633614, + "num_tokens": 1508377.0, + "step": 1250 + }, + { + "entropy": 2.1905842304229735, + "epoch": 0.0039058897562616293, + "grad_norm": 16.464012145996094, + "learning_rate": 6.098033517388357e-07, + "loss": 0.8825, + "mean_token_accuracy": 0.7656736433506012, + "num_tokens": 1519413.0, + "step": 1260 + }, + { + "entropy": 2.1988802313804627, + "epoch": 0.003936888881311325, + "grad_norm": 13.513689041137695, + "learning_rate": 6.146469049694856e-07, + "loss": 0.8144, + "mean_token_accuracy": 0.7779660537838936, + "num_tokens": 1530217.0, + "step": 1270 + }, + { + "entropy": 2.183208304643631, + "epoch": 0.00396788800636102, + "grad_norm": 7.471315383911133, + "learning_rate": 6.194904582001357e-07, + "loss": 0.7551, + "mean_token_accuracy": 0.7802443280816078, + "num_tokens": 1542024.0, + "step": 1280 + }, + { + "entropy": 2.0785870283842085, + "epoch": 0.003998887131410716, + "grad_norm": 17.728321075439453, + "learning_rate": 6.243340114307856e-07, + "loss": 0.6849, + "mean_token_accuracy": 0.7881320580840111, + "num_tokens": 1555038.0, + "step": 1290 + }, + { + "entropy": 2.0753221943974496, + "epoch": 0.004029886256460412, + "grad_norm": 13.686918258666992, + "learning_rate": 6.291775646614356e-07, + "loss": 0.6741, + "mean_token_accuracy": 0.7906556352972984, + "num_tokens": 1567185.0, + "step": 1300 + }, + { + "entropy": 2.083100973069668, + "epoch": 0.004060885381510107, + "grad_norm": 7.613674163818359, + "learning_rate": 6.340211178920856e-07, + "loss": 0.6265, + "mean_token_accuracy": 0.7916320934891701, + "num_tokens": 1580688.0, + "step": 1310 + }, + { + "entropy": 2.227837671339512, + "epoch": 0.004091884506559802, + "grad_norm": 17.518901824951172, + "learning_rate": 6.388646711227357e-07, + "loss": 0.7923, + "mean_token_accuracy": 0.7738375559449195, + "num_tokens": 1592038.0, + "step": 1320 + }, + { + "entropy": 2.1594600453972816, + "epoch": 0.004122883631609498, + "grad_norm": 15.655396461486816, + "learning_rate": 6.437082243533857e-07, + "loss": 0.7167, + "mean_token_accuracy": 0.7852950558066368, + "num_tokens": 1603983.0, + "step": 1330 + }, + { + "entropy": 2.0584792092442514, + "epoch": 0.004153882756659193, + "grad_norm": 12.117267608642578, + "learning_rate": 6.485517775840357e-07, + "loss": 0.6728, + "mean_token_accuracy": 0.7903219699859619, + "num_tokens": 1616249.0, + "step": 1340 + }, + { + "entropy": 2.088547757267952, + "epoch": 0.004184881881708889, + "grad_norm": 16.858585357666016, + "learning_rate": 6.533953308146857e-07, + "loss": 0.7391, + "mean_token_accuracy": 0.778919804096222, + "num_tokens": 1628549.0, + "step": 1350 + }, + { + "entropy": 2.1754756391048433, + "epoch": 0.0042158810067585845, + "grad_norm": 15.385615348815918, + "learning_rate": 6.582388840453358e-07, + "loss": 0.7799, + "mean_token_accuracy": 0.7731549352407455, + "num_tokens": 1639876.0, + "step": 1360 + }, + { + "entropy": 2.127278658747673, + "epoch": 0.004246880131808279, + "grad_norm": 8.056036949157715, + "learning_rate": 6.630824372759858e-07, + "loss": 0.7034, + "mean_token_accuracy": 0.7827352419495582, + "num_tokens": 1652097.0, + "step": 1370 + }, + { + "entropy": 2.0123574048280717, + "epoch": 0.004277879256857975, + "grad_norm": 6.172573566436768, + "learning_rate": 6.679259905066357e-07, + "loss": 0.6069, + "mean_token_accuracy": 0.7994264915585518, + "num_tokens": 1665043.0, + "step": 1380 + }, + { + "entropy": 2.096929004788399, + "epoch": 0.004308878381907671, + "grad_norm": 8.49801254272461, + "learning_rate": 6.727695437372856e-07, + "loss": 0.7136, + "mean_token_accuracy": 0.7863008975982666, + "num_tokens": 1677725.0, + "step": 1390 + }, + { + "entropy": 2.0824546411633493, + "epoch": 0.0043398775069573665, + "grad_norm": 14.584894180297852, + "learning_rate": 6.776130969679358e-07, + "loss": 0.7036, + "mean_token_accuracy": 0.7845976158976555, + "num_tokens": 1690896.0, + "step": 1400 + }, + { + "entropy": 2.13474909812212, + "epoch": 0.004370876632007061, + "grad_norm": 13.656248092651367, + "learning_rate": 6.824566501985857e-07, + "loss": 0.732, + "mean_token_accuracy": 0.7771820738911629, + "num_tokens": 1702660.0, + "step": 1410 + }, + { + "entropy": 2.0404009863734247, + "epoch": 0.004401875757056757, + "grad_norm": 13.051438331604004, + "learning_rate": 6.873002034292357e-07, + "loss": 0.6294, + "mean_token_accuracy": 0.7985868468880654, + "num_tokens": 1714725.0, + "step": 1420 + }, + { + "entropy": 2.0748074553906917, + "epoch": 0.004432874882106453, + "grad_norm": 13.570876121520996, + "learning_rate": 6.921437566598857e-07, + "loss": 0.6639, + "mean_token_accuracy": 0.7878038331866264, + "num_tokens": 1728017.0, + "step": 1430 + }, + { + "entropy": 2.072512838244438, + "epoch": 0.004463874007156148, + "grad_norm": 7.851343154907227, + "learning_rate": 6.969873098905358e-07, + "loss": 0.7054, + "mean_token_accuracy": 0.7887277230620384, + "num_tokens": 1740469.0, + "step": 1440 + }, + { + "entropy": 2.15796595364809, + "epoch": 0.004494873132205843, + "grad_norm": 13.530000686645508, + "learning_rate": 7.018308631211858e-07, + "loss": 0.7452, + "mean_token_accuracy": 0.7796575650572777, + "num_tokens": 1751979.0, + "step": 1450 + }, + { + "entropy": 2.0756077498197554, + "epoch": 0.004525872257255539, + "grad_norm": 14.499431610107422, + "learning_rate": 7.066744163518358e-07, + "loss": 0.7363, + "mean_token_accuracy": 0.7842569574713707, + "num_tokens": 1764493.0, + "step": 1460 + }, + { + "entropy": 1.988140694797039, + "epoch": 0.004556871382305234, + "grad_norm": 6.652562618255615, + "learning_rate": 7.115179695824858e-07, + "loss": 0.6319, + "mean_token_accuracy": 0.7997438430786132, + "num_tokens": 1777296.0, + "step": 1470 + }, + { + "entropy": 2.0755784660577774, + "epoch": 0.00458787050735493, + "grad_norm": 13.8985013961792, + "learning_rate": 7.163615228131359e-07, + "loss": 0.7402, + "mean_token_accuracy": 0.7709438994526863, + "num_tokens": 1789171.0, + "step": 1480 + }, + { + "entropy": 2.0258029848337173, + "epoch": 0.0046188696324046255, + "grad_norm": 7.679947376251221, + "learning_rate": 7.212050760437858e-07, + "loss": 0.6942, + "mean_token_accuracy": 0.785995414853096, + "num_tokens": 1801926.0, + "step": 1490 + }, + { + "entropy": 2.0404342114925385, + "epoch": 0.004649868757454321, + "grad_norm": 6.81480598449707, + "learning_rate": 7.260486292744357e-07, + "loss": 0.6672, + "mean_token_accuracy": 0.7987658172845841, + "num_tokens": 1814939.0, + "step": 1500 + }, + { + "entropy": 2.071673668920994, + "epoch": 0.004680867882504016, + "grad_norm": 15.106314659118652, + "learning_rate": 7.308921825050857e-07, + "loss": 0.6894, + "mean_token_accuracy": 0.7804762050509453, + "num_tokens": 1827340.0, + "step": 1510 + }, + { + "entropy": 2.0692993998527527, + "epoch": 0.004711867007553712, + "grad_norm": 15.853338241577148, + "learning_rate": 7.357357357357357e-07, + "loss": 0.7129, + "mean_token_accuracy": 0.7840728506445884, + "num_tokens": 1839510.0, + "step": 1520 + }, + { + "entropy": 2.135234770178795, + "epoch": 0.0047428661326034075, + "grad_norm": 6.517123699188232, + "learning_rate": 7.405792889663858e-07, + "loss": 0.8329, + "mean_token_accuracy": 0.7573605760931968, + "num_tokens": 1851013.0, + "step": 1530 + }, + { + "entropy": 2.036342756450176, + "epoch": 0.004773865257653102, + "grad_norm": 13.540575981140137, + "learning_rate": 7.454228421970358e-07, + "loss": 0.6977, + "mean_token_accuracy": 0.7853451654314995, + "num_tokens": 1863818.0, + "step": 1540 + }, + { + "entropy": 2.087058800458908, + "epoch": 0.004804864382702798, + "grad_norm": 8.103325843811035, + "learning_rate": 7.502663954276858e-07, + "loss": 0.7266, + "mean_token_accuracy": 0.7897397369146347, + "num_tokens": 1875751.0, + "step": 1550 + }, + { + "entropy": 2.091054494678974, + "epoch": 0.004835863507752494, + "grad_norm": 16.008771896362305, + "learning_rate": 7.551099486583358e-07, + "loss": 0.7302, + "mean_token_accuracy": 0.7836149573326111, + "num_tokens": 1887588.0, + "step": 1560 + }, + { + "entropy": 2.0431090027093886, + "epoch": 0.0048668626328021896, + "grad_norm": 8.24061393737793, + "learning_rate": 7.599535018889859e-07, + "loss": 0.6644, + "mean_token_accuracy": 0.8029666885733604, + "num_tokens": 1899870.0, + "step": 1570 + }, + { + "entropy": 1.8285046428442002, + "epoch": 0.004897861757851884, + "grad_norm": 7.983487606048584, + "learning_rate": 7.647970551196359e-07, + "loss": 0.5009, + "mean_token_accuracy": 0.8131875425577164, + "num_tokens": 1915664.0, + "step": 1580 + }, + { + "entropy": 2.086928182840347, + "epoch": 0.00492886088290158, + "grad_norm": 13.69112777709961, + "learning_rate": 7.696406083502859e-07, + "loss": 0.6791, + "mean_token_accuracy": 0.7944577828049659, + "num_tokens": 1927081.0, + "step": 1590 + }, + { + "entropy": 2.0402188524603844, + "epoch": 0.004959860007951276, + "grad_norm": 15.185161590576172, + "learning_rate": 7.744841615809357e-07, + "loss": 0.7159, + "mean_token_accuracy": 0.7936604365706443, + "num_tokens": 1940073.0, + "step": 1600 + }, + { + "entropy": 2.149085035920143, + "epoch": 0.004990859133000971, + "grad_norm": 16.92563247680664, + "learning_rate": 7.793277148115859e-07, + "loss": 0.7675, + "mean_token_accuracy": 0.7764922738075256, + "num_tokens": 1951448.0, + "step": 1610 + }, + { + "entropy": 2.099419781565666, + "epoch": 0.0050218582580506665, + "grad_norm": 15.367118835449219, + "learning_rate": 7.841712680422358e-07, + "loss": 0.7569, + "mean_token_accuracy": 0.7802120968699455, + "num_tokens": 1962645.0, + "step": 1620 + }, + { + "entropy": 2.0213639751076697, + "epoch": 0.005052857383100362, + "grad_norm": 13.852250099182129, + "learning_rate": 7.890148212728858e-07, + "loss": 0.6431, + "mean_token_accuracy": 0.7958111792802811, + "num_tokens": 1974941.0, + "step": 1630 + }, + { + "entropy": 2.1066105023026465, + "epoch": 0.005083856508150057, + "grad_norm": 15.502884864807129, + "learning_rate": 7.938583745035358e-07, + "loss": 0.6782, + "mean_token_accuracy": 0.7936943307518959, + "num_tokens": 1987185.0, + "step": 1640 + }, + { + "entropy": 2.0182456478476523, + "epoch": 0.005114855633199753, + "grad_norm": 7.306271076202393, + "learning_rate": 7.987019277341859e-07, + "loss": 0.5772, + "mean_token_accuracy": 0.8059568896889686, + "num_tokens": 2000487.0, + "step": 1650 + }, + { + "entropy": 2.147824813425541, + "epoch": 0.0051458547582494485, + "grad_norm": 18.491527557373047, + "learning_rate": 8.035454809648359e-07, + "loss": 0.8096, + "mean_token_accuracy": 0.7748027428984642, + "num_tokens": 2011914.0, + "step": 1660 + }, + { + "entropy": 1.9622011199593543, + "epoch": 0.005176853883299144, + "grad_norm": 13.780349731445312, + "learning_rate": 8.083890341954859e-07, + "loss": 0.6272, + "mean_token_accuracy": 0.8053760439157486, + "num_tokens": 2025757.0, + "step": 1670 + }, + { + "entropy": 2.098005874454975, + "epoch": 0.005207853008348839, + "grad_norm": 14.81534194946289, + "learning_rate": 8.132325874261359e-07, + "loss": 0.7484, + "mean_token_accuracy": 0.7781560063362122, + "num_tokens": 2037711.0, + "step": 1680 + }, + { + "entropy": 2.052612027525902, + "epoch": 0.005238852133398535, + "grad_norm": 8.133528709411621, + "learning_rate": 8.18076140656786e-07, + "loss": 0.6822, + "mean_token_accuracy": 0.79173034876585, + "num_tokens": 2049738.0, + "step": 1690 + }, + { + "entropy": 1.9735546708106995, + "epoch": 0.005269851258448231, + "grad_norm": 16.37797737121582, + "learning_rate": 8.22919693887436e-07, + "loss": 0.6594, + "mean_token_accuracy": 0.7950135871767998, + "num_tokens": 2062316.0, + "step": 1700 + }, + { + "entropy": 2.029072532057762, + "epoch": 0.0053008503834979254, + "grad_norm": 5.160717010498047, + "learning_rate": 8.277632471180859e-07, + "loss": 0.6559, + "mean_token_accuracy": 0.7948900654911994, + "num_tokens": 2075028.0, + "step": 1710 + }, + { + "entropy": 2.0763241961598395, + "epoch": 0.005331849508547621, + "grad_norm": 15.46190357208252, + "learning_rate": 8.326068003487358e-07, + "loss": 0.684, + "mean_token_accuracy": 0.7929447039961814, + "num_tokens": 2086702.0, + "step": 1720 + }, + { + "entropy": 2.141001485288143, + "epoch": 0.005362848633597317, + "grad_norm": 13.897162437438965, + "learning_rate": 8.374503535793858e-07, + "loss": 0.7299, + "mean_token_accuracy": 0.7951222896575928, + "num_tokens": 2097706.0, + "step": 1730 + }, + { + "entropy": 2.0640996396541595, + "epoch": 0.005393847758647012, + "grad_norm": 13.737348556518555, + "learning_rate": 8.422939068100359e-07, + "loss": 0.7175, + "mean_token_accuracy": 0.7896956667304039, + "num_tokens": 2108391.0, + "step": 1740 + }, + { + "entropy": 1.9356930390000344, + "epoch": 0.0054248468836967075, + "grad_norm": 8.433479309082031, + "learning_rate": 8.471374600406859e-07, + "loss": 0.631, + "mean_token_accuracy": 0.7932530760765075, + "num_tokens": 2120471.0, + "step": 1750 + }, + { + "entropy": 2.0442377462983132, + "epoch": 0.005455846008746403, + "grad_norm": 18.229915618896484, + "learning_rate": 8.519810132713359e-07, + "loss": 0.7221, + "mean_token_accuracy": 0.7952359080314636, + "num_tokens": 2132167.0, + "step": 1760 + }, + { + "entropy": 2.04462625682354, + "epoch": 0.005486845133796099, + "grad_norm": 14.919930458068848, + "learning_rate": 8.568245665019859e-07, + "loss": 0.699, + "mean_token_accuracy": 0.7886338919401169, + "num_tokens": 2143973.0, + "step": 1770 + }, + { + "entropy": 2.0158157765865328, + "epoch": 0.005517844258845794, + "grad_norm": 19.1096248626709, + "learning_rate": 8.61668119732636e-07, + "loss": 0.7034, + "mean_token_accuracy": 0.7857451155781746, + "num_tokens": 2156439.0, + "step": 1780 + }, + { + "entropy": 2.1503632470965384, + "epoch": 0.0055488433838954895, + "grad_norm": 17.376968383789062, + "learning_rate": 8.66511672963286e-07, + "loss": 0.742, + "mean_token_accuracy": 0.7804848656058312, + "num_tokens": 2167869.0, + "step": 1790 + }, + { + "entropy": 2.0076410725712774, + "epoch": 0.005579842508945185, + "grad_norm": 18.342008590698242, + "learning_rate": 8.71355226193936e-07, + "loss": 0.642, + "mean_token_accuracy": 0.8009541600942611, + "num_tokens": 2180188.0, + "step": 1800 + }, + { + "entropy": 2.0049708664417265, + "epoch": 0.00561084163399488, + "grad_norm": 8.827497482299805, + "learning_rate": 8.761987794245858e-07, + "loss": 0.6828, + "mean_token_accuracy": 0.7910264268517494, + "num_tokens": 2192579.0, + "step": 1810 + }, + { + "entropy": 2.085582806169987, + "epoch": 0.005641840759044576, + "grad_norm": 17.707990646362305, + "learning_rate": 8.81042332655236e-07, + "loss": 0.7866, + "mean_token_accuracy": 0.7726237401366234, + "num_tokens": 2204679.0, + "step": 1820 + }, + { + "entropy": 2.0235436514019964, + "epoch": 0.005672839884094272, + "grad_norm": 9.473557472229004, + "learning_rate": 8.858858858858859e-07, + "loss": 0.7397, + "mean_token_accuracy": 0.7853599205613137, + "num_tokens": 2216727.0, + "step": 1830 + }, + { + "entropy": 2.0955518752336504, + "epoch": 0.005703839009143967, + "grad_norm": 14.616801261901855, + "learning_rate": 8.907294391165359e-07, + "loss": 0.7601, + "mean_token_accuracy": 0.7755588620901108, + "num_tokens": 2228738.0, + "step": 1840 + }, + { + "entropy": 2.0462396532297134, + "epoch": 0.005734838134193662, + "grad_norm": 12.282236099243164, + "learning_rate": 8.955729923471859e-07, + "loss": 0.7061, + "mean_token_accuracy": 0.7882364228367805, + "num_tokens": 2240796.0, + "step": 1850 + }, + { + "entropy": 2.0632716536521913, + "epoch": 0.005765837259243358, + "grad_norm": 18.294025421142578, + "learning_rate": 9.00416545577836e-07, + "loss": 0.72, + "mean_token_accuracy": 0.7841200634837151, + "num_tokens": 2252408.0, + "step": 1860 + }, + { + "entropy": 2.0464861541986465, + "epoch": 0.005796836384293054, + "grad_norm": 13.904800415039062, + "learning_rate": 9.05260098808486e-07, + "loss": 0.7355, + "mean_token_accuracy": 0.7857467010617256, + "num_tokens": 2263031.0, + "step": 1870 + }, + { + "entropy": 1.9634292259812356, + "epoch": 0.0058278355093427485, + "grad_norm": 7.740158557891846, + "learning_rate": 9.10103652039136e-07, + "loss": 0.6566, + "mean_token_accuracy": 0.7917238727211953, + "num_tokens": 2275474.0, + "step": 1880 + }, + { + "entropy": 1.934257398545742, + "epoch": 0.005858834634392444, + "grad_norm": 7.317460060119629, + "learning_rate": 9.14947205269786e-07, + "loss": 0.6318, + "mean_token_accuracy": 0.8091923862695694, + "num_tokens": 2287535.0, + "step": 1890 + }, + { + "entropy": 2.0293245404958724, + "epoch": 0.00588983375944214, + "grad_norm": 13.60265064239502, + "learning_rate": 9.197907585004361e-07, + "loss": 0.6919, + "mean_token_accuracy": 0.7935610696673393, + "num_tokens": 2299006.0, + "step": 1900 + }, + { + "entropy": 1.9936009973287583, + "epoch": 0.005920832884491835, + "grad_norm": 7.678701400756836, + "learning_rate": 9.246343117310861e-07, + "loss": 0.6424, + "mean_token_accuracy": 0.7983625203371048, + "num_tokens": 2310660.0, + "step": 1910 + }, + { + "entropy": 1.910171502828598, + "epoch": 0.0059518320095415305, + "grad_norm": 4.376688480377197, + "learning_rate": 9.29477864961736e-07, + "loss": 0.6749, + "mean_token_accuracy": 0.791431900858879, + "num_tokens": 2323660.0, + "step": 1920 + }, + { + "entropy": 1.9754113674163818, + "epoch": 0.005982831134591226, + "grad_norm": 16.712688446044922, + "learning_rate": 9.343214181923859e-07, + "loss": 0.6853, + "mean_token_accuracy": 0.7904626414179802, + "num_tokens": 2335479.0, + "step": 1930 + }, + { + "entropy": 1.9469898149371148, + "epoch": 0.006013830259640922, + "grad_norm": 7.1344733238220215, + "learning_rate": 9.391649714230359e-07, + "loss": 0.6714, + "mean_token_accuracy": 0.7852405294775963, + "num_tokens": 2347294.0, + "step": 1940 + }, + { + "entropy": 2.031823492050171, + "epoch": 0.006044829384690617, + "grad_norm": 14.974668502807617, + "learning_rate": 9.44008524653686e-07, + "loss": 0.7696, + "mean_token_accuracy": 0.7771394848823547, + "num_tokens": 2358885.0, + "step": 1950 + }, + { + "entropy": 2.001576192677021, + "epoch": 0.006075828509740313, + "grad_norm": 14.197503089904785, + "learning_rate": 9.48852077884336e-07, + "loss": 0.7037, + "mean_token_accuracy": 0.7951447427272796, + "num_tokens": 2370132.0, + "step": 1960 + }, + { + "entropy": 1.9794816732406617, + "epoch": 0.006106827634790008, + "grad_norm": 14.473783493041992, + "learning_rate": 9.53695631114986e-07, + "loss": 0.7147, + "mean_token_accuracy": 0.7915472134947776, + "num_tokens": 2381547.0, + "step": 1970 + }, + { + "entropy": 2.093907243013382, + "epoch": 0.006137826759839703, + "grad_norm": 14.241476058959961, + "learning_rate": 9.585391843456359e-07, + "loss": 0.8013, + "mean_token_accuracy": 0.7786402046680451, + "num_tokens": 2392334.0, + "step": 1980 + }, + { + "entropy": 1.9560458779335022, + "epoch": 0.006168825884889399, + "grad_norm": 13.721399307250977, + "learning_rate": 9.633827375762862e-07, + "loss": 0.6542, + "mean_token_accuracy": 0.7954897537827492, + "num_tokens": 2405128.0, + "step": 1990 + }, + { + "entropy": 2.0026605874300003, + "epoch": 0.006199825009939095, + "grad_norm": 16.695531845092773, + "learning_rate": 9.68226290806936e-07, + "loss": 0.7113, + "mean_token_accuracy": 0.7941533356904984, + "num_tokens": 2416699.0, + "step": 2000 + }, + { + "entropy": 1.9149562925100327, + "epoch": 0.00623082413498879, + "grad_norm": 13.72096061706543, + "learning_rate": 9.73069844037586e-07, + "loss": 0.6126, + "mean_token_accuracy": 0.7917631477117538, + "num_tokens": 2429162.0, + "step": 2010 + }, + { + "entropy": 1.981172299385071, + "epoch": 0.006261823260038485, + "grad_norm": 15.749645233154297, + "learning_rate": 9.77913397268236e-07, + "loss": 0.7357, + "mean_token_accuracy": 0.7887249678373337, + "num_tokens": 2440565.0, + "step": 2020 + }, + { + "entropy": 1.9732341334223746, + "epoch": 0.006292822385088181, + "grad_norm": 16.25255012512207, + "learning_rate": 9.827569504988861e-07, + "loss": 0.6904, + "mean_token_accuracy": 0.7926795452833175, + "num_tokens": 2452318.0, + "step": 2030 + }, + { + "entropy": 1.9686682283878327, + "epoch": 0.006323821510137877, + "grad_norm": 7.215946197509766, + "learning_rate": 9.87600503729536e-07, + "loss": 0.6178, + "mean_token_accuracy": 0.7987834200263023, + "num_tokens": 2464480.0, + "step": 2040 + }, + { + "entropy": 2.0191121250391006, + "epoch": 0.0063548206351875716, + "grad_norm": 7.287489891052246, + "learning_rate": 9.924440569601861e-07, + "loss": 0.751, + "mean_token_accuracy": 0.7695384725928307, + "num_tokens": 2477286.0, + "step": 2050 + }, + { + "entropy": 1.9891038194298745, + "epoch": 0.006385819760237267, + "grad_norm": 15.255844116210938, + "learning_rate": 9.97287610190836e-07, + "loss": 0.6402, + "mean_token_accuracy": 0.7988035991787911, + "num_tokens": 2489470.0, + "step": 2060 + }, + { + "entropy": 2.0310388937592507, + "epoch": 0.006416818885286963, + "grad_norm": 18.825220108032227, + "learning_rate": 1.0021311634214861e-06, + "loss": 0.6939, + "mean_token_accuracy": 0.790933045744896, + "num_tokens": 2500898.0, + "step": 2070 + }, + { + "entropy": 1.9929038733243942, + "epoch": 0.006447818010336658, + "grad_norm": 12.69325065612793, + "learning_rate": 1.0069747166521362e-06, + "loss": 0.7442, + "mean_token_accuracy": 0.7899109661579132, + "num_tokens": 2511757.0, + "step": 2080 + }, + { + "entropy": 1.956763118505478, + "epoch": 0.006478817135386354, + "grad_norm": 11.434943199157715, + "learning_rate": 1.011818269882786e-06, + "loss": 0.7163, + "mean_token_accuracy": 0.7898407876491547, + "num_tokens": 2522888.0, + "step": 2090 + }, + { + "entropy": 1.97008508592844, + "epoch": 0.006509816260436049, + "grad_norm": 15.685938835144043, + "learning_rate": 1.016661823113436e-06, + "loss": 0.7396, + "mean_token_accuracy": 0.785623998939991, + "num_tokens": 2534453.0, + "step": 2100 + }, + { + "entropy": 1.9547904863953591, + "epoch": 0.006540815385485745, + "grad_norm": 13.821385383605957, + "learning_rate": 1.021505376344086e-06, + "loss": 0.7402, + "mean_token_accuracy": 0.785305617749691, + "num_tokens": 2546617.0, + "step": 2110 + }, + { + "entropy": 1.8934344440698623, + "epoch": 0.00657181451053544, + "grad_norm": 13.396963119506836, + "learning_rate": 1.0263489295747362e-06, + "loss": 0.627, + "mean_token_accuracy": 0.8024709656834602, + "num_tokens": 2559429.0, + "step": 2120 + }, + { + "entropy": 1.9755502566695213, + "epoch": 0.006602813635585136, + "grad_norm": 13.538336753845215, + "learning_rate": 1.031192482805386e-06, + "loss": 0.7059, + "mean_token_accuracy": 0.7905629277229309, + "num_tokens": 2570593.0, + "step": 2130 + }, + { + "entropy": 1.949264821410179, + "epoch": 0.006633812760634831, + "grad_norm": 14.021260261535645, + "learning_rate": 1.0360360360360361e-06, + "loss": 0.6666, + "mean_token_accuracy": 0.7925188347697258, + "num_tokens": 2582602.0, + "step": 2140 + }, + { + "entropy": 1.9639265209436416, + "epoch": 0.006664811885684526, + "grad_norm": 14.326904296875, + "learning_rate": 1.040879589266686e-06, + "loss": 0.7264, + "mean_token_accuracy": 0.7851147443056107, + "num_tokens": 2594982.0, + "step": 2150 + }, + { + "entropy": 1.908056390285492, + "epoch": 0.006695811010734222, + "grad_norm": 15.651636123657227, + "learning_rate": 1.0457231424973361e-06, + "loss": 0.6616, + "mean_token_accuracy": 0.7910144224762916, + "num_tokens": 2607900.0, + "step": 2160 + }, + { + "entropy": 2.064947435259819, + "epoch": 0.006726810135783918, + "grad_norm": 13.445158004760742, + "learning_rate": 1.0505666957279862e-06, + "loss": 0.7854, + "mean_token_accuracy": 0.7805202215909958, + "num_tokens": 2618173.0, + "step": 2170 + }, + { + "entropy": 1.9829859480261802, + "epoch": 0.0067578092608336126, + "grad_norm": 17.02464485168457, + "learning_rate": 1.055410248958636e-06, + "loss": 0.7521, + "mean_token_accuracy": 0.7822321712970733, + "num_tokens": 2629832.0, + "step": 2180 + }, + { + "entropy": 1.9977169573307036, + "epoch": 0.006788808385883308, + "grad_norm": 16.560504913330078, + "learning_rate": 1.060253802189286e-06, + "loss": 0.6964, + "mean_token_accuracy": 0.793058268725872, + "num_tokens": 2641302.0, + "step": 2190 + }, + { + "entropy": 2.0350762784481047, + "epoch": 0.006819807510933004, + "grad_norm": 16.779306411743164, + "learning_rate": 1.0650973554199363e-06, + "loss": 0.754, + "mean_token_accuracy": 0.7842149198055267, + "num_tokens": 2651948.0, + "step": 2200 + }, + { + "entropy": 1.9923300370573997, + "epoch": 0.0068508066359827, + "grad_norm": 16.477792739868164, + "learning_rate": 1.0699409086505862e-06, + "loss": 0.7457, + "mean_token_accuracy": 0.7842650130391121, + "num_tokens": 2663131.0, + "step": 2210 + }, + { + "entropy": 1.7988253235816956, + "epoch": 0.006881805761032395, + "grad_norm": 6.730621814727783, + "learning_rate": 1.074784461881236e-06, + "loss": 0.5973, + "mean_token_accuracy": 0.80888991355896, + "num_tokens": 2676405.0, + "step": 2220 + }, + { + "entropy": 1.9884469777345657, + "epoch": 0.00691280488608209, + "grad_norm": 15.052291870117188, + "learning_rate": 1.0796280151118862e-06, + "loss": 0.6849, + "mean_token_accuracy": 0.7967025130987168, + "num_tokens": 2688102.0, + "step": 2230 + }, + { + "entropy": 2.003503252565861, + "epoch": 0.006943804011131786, + "grad_norm": 7.750051498413086, + "learning_rate": 1.0844715683425363e-06, + "loss": 0.7153, + "mean_token_accuracy": 0.7897115662693978, + "num_tokens": 2699848.0, + "step": 2240 + }, + { + "entropy": 1.8995970517396927, + "epoch": 0.006974803136181481, + "grad_norm": 7.199272632598877, + "learning_rate": 1.0893151215731861e-06, + "loss": 0.5577, + "mean_token_accuracy": 0.8024276942014694, + "num_tokens": 2713648.0, + "step": 2250 + }, + { + "entropy": 2.017641696333885, + "epoch": 0.007005802261231177, + "grad_norm": 15.489433288574219, + "learning_rate": 1.0941586748038362e-06, + "loss": 0.7434, + "mean_token_accuracy": 0.7756871730089188, + "num_tokens": 2725047.0, + "step": 2260 + }, + { + "entropy": 1.895441135764122, + "epoch": 0.007036801386280872, + "grad_norm": 17.170063018798828, + "learning_rate": 1.0990022280344861e-06, + "loss": 0.6771, + "mean_token_accuracy": 0.7931119576096535, + "num_tokens": 2737777.0, + "step": 2270 + }, + { + "entropy": 1.8989380843937398, + "epoch": 0.007067800511330568, + "grad_norm": 13.440524101257324, + "learning_rate": 1.1038457812651362e-06, + "loss": 0.6491, + "mean_token_accuracy": 0.806152519583702, + "num_tokens": 2749458.0, + "step": 2280 + }, + { + "entropy": 1.9610642537474632, + "epoch": 0.007098799636380263, + "grad_norm": 17.207420349121094, + "learning_rate": 1.1086893344957863e-06, + "loss": 0.7137, + "mean_token_accuracy": 0.7841380387544632, + "num_tokens": 2760376.0, + "step": 2290 + }, + { + "entropy": 1.9233600437641143, + "epoch": 0.007129798761429959, + "grad_norm": 14.002983093261719, + "learning_rate": 1.1135328877264362e-06, + "loss": 0.6691, + "mean_token_accuracy": 0.7973776832222939, + "num_tokens": 2771954.0, + "step": 2300 + }, + { + "entropy": 1.9587767377495766, + "epoch": 0.0071607978864796544, + "grad_norm": 14.966231346130371, + "learning_rate": 1.118376440957086e-06, + "loss": 0.6645, + "mean_token_accuracy": 0.7941723063588142, + "num_tokens": 2784375.0, + "step": 2310 + }, + { + "entropy": 1.994602382183075, + "epoch": 0.007191797011529349, + "grad_norm": 15.076648712158203, + "learning_rate": 1.1232199941877364e-06, + "loss": 0.7767, + "mean_token_accuracy": 0.7866624847054482, + "num_tokens": 2795409.0, + "step": 2320 + }, + { + "entropy": 1.871568574011326, + "epoch": 0.007222796136579045, + "grad_norm": 15.472980499267578, + "learning_rate": 1.1280635474183863e-06, + "loss": 0.6379, + "mean_token_accuracy": 0.7967082962393761, + "num_tokens": 2807885.0, + "step": 2330 + }, + { + "entropy": 1.9470938667654991, + "epoch": 0.007253795261628741, + "grad_norm": 8.13938045501709, + "learning_rate": 1.1329071006490362e-06, + "loss": 0.6471, + "mean_token_accuracy": 0.796507653594017, + "num_tokens": 2820252.0, + "step": 2340 + }, + { + "entropy": 1.92869935631752, + "epoch": 0.007284794386678436, + "grad_norm": 15.049967765808105, + "learning_rate": 1.1377506538796862e-06, + "loss": 0.705, + "mean_token_accuracy": 0.7862442404031753, + "num_tokens": 2832038.0, + "step": 2350 + }, + { + "entropy": 1.914162775874138, + "epoch": 0.007315793511728131, + "grad_norm": 14.12210750579834, + "learning_rate": 1.1425942071103361e-06, + "loss": 0.6729, + "mean_token_accuracy": 0.7895276993513107, + "num_tokens": 2844886.0, + "step": 2360 + }, + { + "entropy": 1.967710091173649, + "epoch": 0.007346792636777827, + "grad_norm": 7.458662033081055, + "learning_rate": 1.1474377603409862e-06, + "loss": 0.7499, + "mean_token_accuracy": 0.7771386727690697, + "num_tokens": 2857547.0, + "step": 2370 + }, + { + "entropy": 1.998754534125328, + "epoch": 0.007377791761827523, + "grad_norm": 14.489635467529297, + "learning_rate": 1.1522813135716363e-06, + "loss": 0.7481, + "mean_token_accuracy": 0.7853496834635735, + "num_tokens": 2868673.0, + "step": 2380 + }, + { + "entropy": 1.9616554602980614, + "epoch": 0.007408790886877218, + "grad_norm": 12.88488483428955, + "learning_rate": 1.1571248668022862e-06, + "loss": 0.7012, + "mean_token_accuracy": 0.7969075560569763, + "num_tokens": 2880536.0, + "step": 2390 + }, + { + "entropy": 2.0036663502454757, + "epoch": 0.007439790011926913, + "grad_norm": 13.311254501342773, + "learning_rate": 1.161968420032936e-06, + "loss": 0.7523, + "mean_token_accuracy": 0.7829051271080971, + "num_tokens": 2891783.0, + "step": 2400 + }, + { + "entropy": 1.9757195591926575, + "epoch": 0.007470789136976609, + "grad_norm": 16.514055252075195, + "learning_rate": 1.1668119732635864e-06, + "loss": 0.716, + "mean_token_accuracy": 0.7831418976187706, + "num_tokens": 2903383.0, + "step": 2410 + }, + { + "entropy": 1.897585642337799, + "epoch": 0.007501788262026304, + "grad_norm": 14.471832275390625, + "learning_rate": 1.1716555264942363e-06, + "loss": 0.6846, + "mean_token_accuracy": 0.7894420132040978, + "num_tokens": 2915336.0, + "step": 2420 + }, + { + "entropy": 2.0040734350681304, + "epoch": 0.007532787387076, + "grad_norm": 15.47757339477539, + "learning_rate": 1.1764990797248862e-06, + "loss": 0.7554, + "mean_token_accuracy": 0.7723087310791016, + "num_tokens": 2926824.0, + "step": 2430 + }, + { + "entropy": 1.9852148294448853, + "epoch": 0.0075637865121256954, + "grad_norm": 14.812381744384766, + "learning_rate": 1.1813426329555363e-06, + "loss": 0.7039, + "mean_token_accuracy": 0.7904437810182572, + "num_tokens": 2938564.0, + "step": 2440 + }, + { + "entropy": 1.9800173118710518, + "epoch": 0.00759478563717539, + "grad_norm": 7.551329612731934, + "learning_rate": 1.1861861861861864e-06, + "loss": 0.7097, + "mean_token_accuracy": 0.7884911954402923, + "num_tokens": 2950233.0, + "step": 2450 + }, + { + "entropy": 1.9487542375922202, + "epoch": 0.007625784762225086, + "grad_norm": 15.525565147399902, + "learning_rate": 1.1910297394168362e-06, + "loss": 0.706, + "mean_token_accuracy": 0.7928291246294975, + "num_tokens": 2962418.0, + "step": 2460 + }, + { + "entropy": 1.9103297770023346, + "epoch": 0.007656783887274782, + "grad_norm": 16.958786010742188, + "learning_rate": 1.1958732926474863e-06, + "loss": 0.6459, + "mean_token_accuracy": 0.7916588813066483, + "num_tokens": 2974375.0, + "step": 2470 + }, + { + "entropy": 1.8603628262877465, + "epoch": 0.0076877830123244775, + "grad_norm": 15.165175437927246, + "learning_rate": 1.2007168458781362e-06, + "loss": 0.6034, + "mean_token_accuracy": 0.8035720109939575, + "num_tokens": 2986722.0, + "step": 2480 + }, + { + "entropy": 1.8906757444143296, + "epoch": 0.007718782137374172, + "grad_norm": 14.799652099609375, + "learning_rate": 1.2055603991087863e-06, + "loss": 0.721, + "mean_token_accuracy": 0.7916418060660362, + "num_tokens": 2998939.0, + "step": 2490 + }, + { + "entropy": 1.8013859555125236, + "epoch": 0.007749781262423868, + "grad_norm": 6.574148654937744, + "learning_rate": 1.2104039523394364e-06, + "loss": 0.591, + "mean_token_accuracy": 0.8007106646895409, + "num_tokens": 3011678.0, + "step": 2500 + }, + { + "entropy": 1.9229096472263336, + "epoch": 0.007780780387473564, + "grad_norm": 15.573455810546875, + "learning_rate": 1.2152475055700863e-06, + "loss": 0.6444, + "mean_token_accuracy": 0.7954755112528801, + "num_tokens": 3024579.0, + "step": 2510 + }, + { + "entropy": 1.938393659889698, + "epoch": 0.007811779512523259, + "grad_norm": 14.614435195922852, + "learning_rate": 1.2200910588007362e-06, + "loss": 0.7364, + "mean_token_accuracy": 0.7778101339936256, + "num_tokens": 3036102.0, + "step": 2520 + }, + { + "entropy": 2.013949643075466, + "epoch": 0.007842778637572954, + "grad_norm": 8.017887115478516, + "learning_rate": 1.2249346120313865e-06, + "loss": 0.7157, + "mean_token_accuracy": 0.7902801647782326, + "num_tokens": 3047786.0, + "step": 2530 + }, + { + "entropy": 1.9791011467576027, + "epoch": 0.00787377776262265, + "grad_norm": 13.628707885742188, + "learning_rate": 1.2297781652620364e-06, + "loss": 0.7443, + "mean_token_accuracy": 0.7846587881445884, + "num_tokens": 3059102.0, + "step": 2540 + }, + { + "entropy": 1.9235740274190902, + "epoch": 0.007904776887672346, + "grad_norm": 14.778630256652832, + "learning_rate": 1.2346217184926863e-06, + "loss": 0.6579, + "mean_token_accuracy": 0.7987881228327751, + "num_tokens": 3071029.0, + "step": 2550 + }, + { + "entropy": 1.873728483915329, + "epoch": 0.00793577601272204, + "grad_norm": 5.178359508514404, + "learning_rate": 1.2394652717233364e-06, + "loss": 0.6452, + "mean_token_accuracy": 0.7905326500535012, + "num_tokens": 3083691.0, + "step": 2560 + }, + { + "entropy": 1.9002502277493476, + "epoch": 0.007966775137771736, + "grad_norm": 6.418622016906738, + "learning_rate": 1.2443088249539862e-06, + "loss": 0.6569, + "mean_token_accuracy": 0.7951910138130188, + "num_tokens": 3096979.0, + "step": 2570 + }, + { + "entropy": 2.094506660103798, + "epoch": 0.007997774262821432, + "grad_norm": 14.790390968322754, + "learning_rate": 1.2491523781846363e-06, + "loss": 0.7683, + "mean_token_accuracy": 0.7839270010590553, + "num_tokens": 3107589.0, + "step": 2580 + }, + { + "entropy": 1.9785408988595008, + "epoch": 0.008028773387871127, + "grad_norm": 13.493193626403809, + "learning_rate": 1.2539959314152864e-06, + "loss": 0.6176, + "mean_token_accuracy": 0.7813256174325943, + "num_tokens": 3120357.0, + "step": 2590 + }, + { + "entropy": 1.8982400611042975, + "epoch": 0.008059772512920824, + "grad_norm": 8.22580623626709, + "learning_rate": 1.2588394846459365e-06, + "loss": 0.533, + "mean_token_accuracy": 0.8012925937771798, + "num_tokens": 3134146.0, + "step": 2600 + }, + { + "entropy": 1.9128680050373077, + "epoch": 0.008090771637970519, + "grad_norm": 12.298065185546875, + "learning_rate": 1.2636830378765864e-06, + "loss": 0.6659, + "mean_token_accuracy": 0.7981240004301071, + "num_tokens": 3146528.0, + "step": 2610 + }, + { + "entropy": 1.8557716280221939, + "epoch": 0.008121770763020213, + "grad_norm": 13.299572944641113, + "learning_rate": 1.2685265911072365e-06, + "loss": 0.5324, + "mean_token_accuracy": 0.811582088470459, + "num_tokens": 3160369.0, + "step": 2620 + }, + { + "entropy": 1.9083554148674011, + "epoch": 0.00815276988806991, + "grad_norm": 12.815226554870605, + "learning_rate": 1.2733701443378862e-06, + "loss": 0.7, + "mean_token_accuracy": 0.7826020732522011, + "num_tokens": 3171895.0, + "step": 2630 + }, + { + "entropy": 1.858922117948532, + "epoch": 0.008183769013119605, + "grad_norm": 7.371999740600586, + "learning_rate": 1.2782136975685363e-06, + "loss": 0.6308, + "mean_token_accuracy": 0.7977426454424859, + "num_tokens": 3184167.0, + "step": 2640 + }, + { + "entropy": 1.9080826759338378, + "epoch": 0.0082147681381693, + "grad_norm": 15.016209602355957, + "learning_rate": 1.2830572507991866e-06, + "loss": 0.6998, + "mean_token_accuracy": 0.7945051088929176, + "num_tokens": 3196310.0, + "step": 2650 + }, + { + "entropy": 2.000731149315834, + "epoch": 0.008245767263218996, + "grad_norm": 7.419742107391357, + "learning_rate": 1.2879008040298362e-06, + "loss": 0.7669, + "mean_token_accuracy": 0.7797714114189148, + "num_tokens": 3207853.0, + "step": 2660 + }, + { + "entropy": 1.9436663463711739, + "epoch": 0.008276766388268691, + "grad_norm": 14.453858375549316, + "learning_rate": 1.2927443572604863e-06, + "loss": 0.7125, + "mean_token_accuracy": 0.7874180287122726, + "num_tokens": 3219704.0, + "step": 2670 + }, + { + "entropy": 1.8370084151625634, + "epoch": 0.008307765513318386, + "grad_norm": 13.891809463500977, + "learning_rate": 1.2975879104911364e-06, + "loss": 0.6604, + "mean_token_accuracy": 0.7957833841443062, + "num_tokens": 3232139.0, + "step": 2680 + }, + { + "entropy": 1.8790874809026719, + "epoch": 0.008338764638368083, + "grad_norm": 6.238895893096924, + "learning_rate": 1.3024314637217863e-06, + "loss": 0.6469, + "mean_token_accuracy": 0.8013461381196976, + "num_tokens": 3244302.0, + "step": 2690 + }, + { + "entropy": 1.8268845871090889, + "epoch": 0.008369763763417777, + "grad_norm": 8.341919898986816, + "learning_rate": 1.3072750169524364e-06, + "loss": 0.5651, + "mean_token_accuracy": 0.8057503595948219, + "num_tokens": 3258280.0, + "step": 2700 + }, + { + "entropy": 1.9424984157085419, + "epoch": 0.008400762888467472, + "grad_norm": 14.885581970214844, + "learning_rate": 1.3121185701830863e-06, + "loss": 0.6694, + "mean_token_accuracy": 0.7825981408357621, + "num_tokens": 3270381.0, + "step": 2710 + }, + { + "entropy": 1.954266221821308, + "epoch": 0.008431762013517169, + "grad_norm": 14.373212814331055, + "learning_rate": 1.3169621234137364e-06, + "loss": 0.6892, + "mean_token_accuracy": 0.7864295184612274, + "num_tokens": 3282087.0, + "step": 2720 + }, + { + "entropy": 1.9162860542535782, + "epoch": 0.008462761138566864, + "grad_norm": 12.995490074157715, + "learning_rate": 1.3218056766443865e-06, + "loss": 0.7248, + "mean_token_accuracy": 0.7878184035420418, + "num_tokens": 3295059.0, + "step": 2730 + }, + { + "entropy": 1.8371574386954308, + "epoch": 0.008493760263616559, + "grad_norm": 14.610986709594727, + "learning_rate": 1.3266492298750364e-06, + "loss": 0.6114, + "mean_token_accuracy": 0.7994052410125733, + "num_tokens": 3307978.0, + "step": 2740 + }, + { + "entropy": 1.850772686302662, + "epoch": 0.008524759388666255, + "grad_norm": 14.59720516204834, + "learning_rate": 1.3314927831056865e-06, + "loss": 0.675, + "mean_token_accuracy": 0.7939030557870865, + "num_tokens": 3320392.0, + "step": 2750 + }, + { + "entropy": 1.8222134336829185, + "epoch": 0.00855575851371595, + "grad_norm": 5.238037109375, + "learning_rate": 1.3363363363363366e-06, + "loss": 0.5953, + "mean_token_accuracy": 0.8046846255660057, + "num_tokens": 3333208.0, + "step": 2760 + }, + { + "entropy": 1.9551292091608048, + "epoch": 0.008586757638765647, + "grad_norm": 16.98749351501465, + "learning_rate": 1.3411798895669865e-06, + "loss": 0.77, + "mean_token_accuracy": 0.7961449101567268, + "num_tokens": 3344004.0, + "step": 2770 + }, + { + "entropy": 1.9047151386737824, + "epoch": 0.008617756763815342, + "grad_norm": 15.543542861938477, + "learning_rate": 1.3460234427976366e-06, + "loss": 0.7398, + "mean_token_accuracy": 0.7901824861764908, + "num_tokens": 3354724.0, + "step": 2780 + }, + { + "entropy": 1.9245003148913384, + "epoch": 0.008648755888865036, + "grad_norm": 12.885390281677246, + "learning_rate": 1.3508669960282864e-06, + "loss": 0.7131, + "mean_token_accuracy": 0.7908341690897942, + "num_tokens": 3366242.0, + "step": 2790 + }, + { + "entropy": 1.873139852285385, + "epoch": 0.008679755013914733, + "grad_norm": 12.843483924865723, + "learning_rate": 1.3557105492589365e-06, + "loss": 0.6774, + "mean_token_accuracy": 0.8066575720906257, + "num_tokens": 3377837.0, + "step": 2800 + }, + { + "entropy": 1.8930615320801736, + "epoch": 0.008710754138964428, + "grad_norm": 10.335051536560059, + "learning_rate": 1.3605541024895866e-06, + "loss": 0.6729, + "mean_token_accuracy": 0.789339256286621, + "num_tokens": 3390578.0, + "step": 2810 + }, + { + "entropy": 1.9254612401127815, + "epoch": 0.008741753264014123, + "grad_norm": 13.433055877685547, + "learning_rate": 1.3653976557202365e-06, + "loss": 0.7126, + "mean_token_accuracy": 0.7813933417201042, + "num_tokens": 3401808.0, + "step": 2820 + }, + { + "entropy": 1.8774896636605263, + "epoch": 0.00877275238906382, + "grad_norm": 15.659320831298828, + "learning_rate": 1.3702412089508866e-06, + "loss": 0.6582, + "mean_token_accuracy": 0.7960716530680656, + "num_tokens": 3412922.0, + "step": 2830 + }, + { + "entropy": 1.8881728678941727, + "epoch": 0.008803751514113514, + "grad_norm": 14.669523239135742, + "learning_rate": 1.3750847621815363e-06, + "loss": 0.6416, + "mean_token_accuracy": 0.8034508541226387, + "num_tokens": 3425285.0, + "step": 2840 + }, + { + "entropy": 1.907693549990654, + "epoch": 0.008834750639163209, + "grad_norm": 17.293590545654297, + "learning_rate": 1.3799283154121864e-06, + "loss": 0.6792, + "mean_token_accuracy": 0.797477675974369, + "num_tokens": 3436697.0, + "step": 2850 + }, + { + "entropy": 1.9830518007278441, + "epoch": 0.008865749764212906, + "grad_norm": 15.576813697814941, + "learning_rate": 1.3847718686428367e-06, + "loss": 0.7526, + "mean_token_accuracy": 0.779937069118023, + "num_tokens": 3447898.0, + "step": 2860 + }, + { + "entropy": 2.013961046934128, + "epoch": 0.0088967488892626, + "grad_norm": 19.190113067626953, + "learning_rate": 1.3896154218734864e-06, + "loss": 0.7515, + "mean_token_accuracy": 0.7880441144108772, + "num_tokens": 3458894.0, + "step": 2870 + }, + { + "entropy": 1.9314459562301636, + "epoch": 0.008927748014312295, + "grad_norm": 12.630243301391602, + "learning_rate": 1.3944589751041364e-06, + "loss": 0.6666, + "mean_token_accuracy": 0.7965318202972412, + "num_tokens": 3471083.0, + "step": 2880 + }, + { + "entropy": 1.9051019206643105, + "epoch": 0.008958747139361992, + "grad_norm": 12.55810546875, + "learning_rate": 1.3993025283347865e-06, + "loss": 0.678, + "mean_token_accuracy": 0.7915308341383934, + "num_tokens": 3483322.0, + "step": 2890 + }, + { + "entropy": 1.889833214879036, + "epoch": 0.008989746264411687, + "grad_norm": 11.794564247131348, + "learning_rate": 1.4041460815654364e-06, + "loss": 0.5963, + "mean_token_accuracy": 0.7985853642225266, + "num_tokens": 3496188.0, + "step": 2900 + }, + { + "entropy": 1.8793418243527413, + "epoch": 0.009020745389461382, + "grad_norm": 6.941933631896973, + "learning_rate": 1.4089896347960865e-06, + "loss": 0.6065, + "mean_token_accuracy": 0.8100902363657951, + "num_tokens": 3508412.0, + "step": 2910 + }, + { + "entropy": 1.7804090306162834, + "epoch": 0.009051744514511078, + "grad_norm": 7.316125392913818, + "learning_rate": 1.4138331880267364e-06, + "loss": 0.5352, + "mean_token_accuracy": 0.8099708408117294, + "num_tokens": 3522455.0, + "step": 2920 + }, + { + "entropy": 1.9128621608018874, + "epoch": 0.009082743639560773, + "grad_norm": 17.679824829101562, + "learning_rate": 1.4186767412573865e-06, + "loss": 0.684, + "mean_token_accuracy": 0.7950308054685593, + "num_tokens": 3534077.0, + "step": 2930 + }, + { + "entropy": 1.951967991888523, + "epoch": 0.009113742764610468, + "grad_norm": 15.615214347839355, + "learning_rate": 1.4235202944880366e-06, + "loss": 0.6822, + "mean_token_accuracy": 0.7895113542675972, + "num_tokens": 3545717.0, + "step": 2940 + }, + { + "entropy": 1.993370993435383, + "epoch": 0.009144741889660165, + "grad_norm": 6.757071018218994, + "learning_rate": 1.4283638477186865e-06, + "loss": 0.7091, + "mean_token_accuracy": 0.7836599707603454, + "num_tokens": 3556645.0, + "step": 2950 + }, + { + "entropy": 2.0306035369634627, + "epoch": 0.00917574101470986, + "grad_norm": 14.662017822265625, + "learning_rate": 1.4332074009493366e-06, + "loss": 0.7833, + "mean_token_accuracy": 0.7853764742612839, + "num_tokens": 3568121.0, + "step": 2960 + }, + { + "entropy": 1.981213203072548, + "epoch": 0.009206740139759556, + "grad_norm": 15.54949951171875, + "learning_rate": 1.4380509541799867e-06, + "loss": 0.7714, + "mean_token_accuracy": 0.7741693139076233, + "num_tokens": 3578617.0, + "step": 2970 + }, + { + "entropy": 1.872342900186777, + "epoch": 0.009237739264809251, + "grad_norm": 17.143173217773438, + "learning_rate": 1.4428945074106366e-06, + "loss": 0.6521, + "mean_token_accuracy": 0.7934080705046653, + "num_tokens": 3590839.0, + "step": 2980 + }, + { + "entropy": 1.9597981229424477, + "epoch": 0.009268738389858946, + "grad_norm": 14.204910278320312, + "learning_rate": 1.4477380606412867e-06, + "loss": 0.7372, + "mean_token_accuracy": 0.7796640574932099, + "num_tokens": 3602197.0, + "step": 2990 + }, + { + "entropy": 1.9134353682398797, + "epoch": 0.009299737514908642, + "grad_norm": 15.377642631530762, + "learning_rate": 1.4525816138719365e-06, + "loss": 0.5693, + "mean_token_accuracy": 0.80574119836092, + "num_tokens": 3614912.0, + "step": 3000 + }, + { + "entropy": 1.978382122516632, + "epoch": 0.009330736639958337, + "grad_norm": 7.089384078979492, + "learning_rate": 1.4574251671025866e-06, + "loss": 0.7043, + "mean_token_accuracy": 0.7872982352972031, + "num_tokens": 3626738.0, + "step": 3010 + }, + { + "entropy": 1.9824548229575156, + "epoch": 0.009361735765008032, + "grad_norm": 7.4606781005859375, + "learning_rate": 1.4622687203332367e-06, + "loss": 0.7379, + "mean_token_accuracy": 0.7830901652574539, + "num_tokens": 3638714.0, + "step": 3020 + }, + { + "entropy": 1.8985080510377883, + "epoch": 0.009392734890057729, + "grad_norm": 13.994378089904785, + "learning_rate": 1.4671122735638866e-06, + "loss": 0.6141, + "mean_token_accuracy": 0.8004487827420235, + "num_tokens": 3651514.0, + "step": 3030 + }, + { + "entropy": 1.9109556749463081, + "epoch": 0.009423734015107424, + "grad_norm": 11.190482139587402, + "learning_rate": 1.4719558267945367e-06, + "loss": 0.6391, + "mean_token_accuracy": 0.8045189529657364, + "num_tokens": 3662697.0, + "step": 3040 + }, + { + "entropy": 1.8760196268558502, + "epoch": 0.009454733140157118, + "grad_norm": 12.307433128356934, + "learning_rate": 1.4767993800251864e-06, + "loss": 0.6314, + "mean_token_accuracy": 0.8026507899165154, + "num_tokens": 3674503.0, + "step": 3050 + }, + { + "entropy": 1.8742030158638954, + "epoch": 0.009485732265206815, + "grad_norm": 7.280045509338379, + "learning_rate": 1.4816429332558365e-06, + "loss": 0.6784, + "mean_token_accuracy": 0.7885152325034142, + "num_tokens": 3686269.0, + "step": 3060 + }, + { + "entropy": 1.7706302866339683, + "epoch": 0.00951673139025651, + "grad_norm": 16.125728607177734, + "learning_rate": 1.4864864864864868e-06, + "loss": 0.6367, + "mean_token_accuracy": 0.8014728352427483, + "num_tokens": 3699713.0, + "step": 3070 + }, + { + "entropy": 1.9473395302891732, + "epoch": 0.009547730515306205, + "grad_norm": 13.948101997375488, + "learning_rate": 1.4913300397171365e-06, + "loss": 0.7302, + "mean_token_accuracy": 0.7855488672852516, + "num_tokens": 3710852.0, + "step": 3080 + }, + { + "entropy": 1.9612311527132988, + "epoch": 0.009578729640355901, + "grad_norm": 13.97789192199707, + "learning_rate": 1.4961735929477866e-06, + "loss": 0.6629, + "mean_token_accuracy": 0.7972382113337517, + "num_tokens": 3722160.0, + "step": 3090 + }, + { + "entropy": 1.960909178853035, + "epoch": 0.009609728765405596, + "grad_norm": 17.231172561645508, + "learning_rate": 1.5010171461784366e-06, + "loss": 0.754, + "mean_token_accuracy": 0.7888594791293144, + "num_tokens": 3732729.0, + "step": 3100 + }, + { + "entropy": 1.8868347622454167, + "epoch": 0.009640727890455291, + "grad_norm": 15.940637588500977, + "learning_rate": 1.5058606994090865e-06, + "loss": 0.5887, + "mean_token_accuracy": 0.8036533623933793, + "num_tokens": 3746194.0, + "step": 3110 + }, + { + "entropy": 2.00607987344265, + "epoch": 0.009671727015504988, + "grad_norm": 16.22367286682129, + "learning_rate": 1.5107042526397366e-06, + "loss": 0.7286, + "mean_token_accuracy": 0.7928143292665482, + "num_tokens": 3756751.0, + "step": 3120 + }, + { + "entropy": 1.9029984012246133, + "epoch": 0.009702726140554683, + "grad_norm": 6.391202926635742, + "learning_rate": 1.5155478058703865e-06, + "loss": 0.6529, + "mean_token_accuracy": 0.7976860404014587, + "num_tokens": 3768774.0, + "step": 3130 + }, + { + "entropy": 1.900516900420189, + "epoch": 0.009733725265604379, + "grad_norm": 8.11662483215332, + "learning_rate": 1.5203913591010366e-06, + "loss": 0.6046, + "mean_token_accuracy": 0.8091256007552147, + "num_tokens": 3780820.0, + "step": 3140 + }, + { + "entropy": 1.9626210525631904, + "epoch": 0.009764724390654074, + "grad_norm": 12.520034790039062, + "learning_rate": 1.5252349123316867e-06, + "loss": 0.7038, + "mean_token_accuracy": 0.7925601005554199, + "num_tokens": 3792530.0, + "step": 3150 + }, + { + "entropy": 1.9671666517853736, + "epoch": 0.009795723515703769, + "grad_norm": 13.528938293457031, + "learning_rate": 1.5300784655623366e-06, + "loss": 0.6599, + "mean_token_accuracy": 0.7918861374258995, + "num_tokens": 3804625.0, + "step": 3160 + }, + { + "entropy": 2.005161625146866, + "epoch": 0.009826722640753465, + "grad_norm": 15.400350570678711, + "learning_rate": 1.5349220187929867e-06, + "loss": 0.7848, + "mean_token_accuracy": 0.7739899069070816, + "num_tokens": 3815444.0, + "step": 3170 + }, + { + "entropy": 1.9359246730804442, + "epoch": 0.00985772176580316, + "grad_norm": 12.780713081359863, + "learning_rate": 1.5397655720236368e-06, + "loss": 0.6371, + "mean_token_accuracy": 0.7938413843512535, + "num_tokens": 3827684.0, + "step": 3180 + }, + { + "entropy": 1.8368206307291985, + "epoch": 0.009888720890852855, + "grad_norm": 14.72050952911377, + "learning_rate": 1.5446091252542867e-06, + "loss": 0.6173, + "mean_token_accuracy": 0.8116529732942581, + "num_tokens": 3840934.0, + "step": 3190 + }, + { + "entropy": 1.9378165647387504, + "epoch": 0.009919720015902552, + "grad_norm": 14.93758487701416, + "learning_rate": 1.5494526784849368e-06, + "loss": 0.7211, + "mean_token_accuracy": 0.7911474660038949, + "num_tokens": 3852242.0, + "step": 3200 + }, + { + "entropy": 1.9542915537953376, + "epoch": 0.009950719140952247, + "grad_norm": 16.133121490478516, + "learning_rate": 1.5542962317155866e-06, + "loss": 0.7008, + "mean_token_accuracy": 0.7842934697866439, + "num_tokens": 3864015.0, + "step": 3210 + }, + { + "entropy": 1.9251963838934898, + "epoch": 0.009981718266001941, + "grad_norm": 15.101007461547852, + "learning_rate": 1.5591397849462367e-06, + "loss": 0.6018, + "mean_token_accuracy": 0.8036449015140533, + "num_tokens": 3876673.0, + "step": 3220 + }, + { + "entropy": 1.9787523686885833, + "epoch": 0.010012717391051638, + "grad_norm": 11.887308120727539, + "learning_rate": 1.5639833381768868e-06, + "loss": 0.7158, + "mean_token_accuracy": 0.79226556122303, + "num_tokens": 3888063.0, + "step": 3230 + }, + { + "entropy": 1.99595515280962, + "epoch": 0.010043716516101333, + "grad_norm": 12.0714693069458, + "learning_rate": 1.5688268914075367e-06, + "loss": 0.7203, + "mean_token_accuracy": 0.7814503669738769, + "num_tokens": 3899339.0, + "step": 3240 + }, + { + "entropy": 1.92583971619606, + "epoch": 0.010074715641151028, + "grad_norm": 14.31470775604248, + "learning_rate": 1.5736704446381868e-06, + "loss": 0.6961, + "mean_token_accuracy": 0.7950286611914634, + "num_tokens": 3911124.0, + "step": 3250 + }, + { + "entropy": 1.9035099178552628, + "epoch": 0.010105714766200724, + "grad_norm": 14.206976890563965, + "learning_rate": 1.5785139978688365e-06, + "loss": 0.6835, + "mean_token_accuracy": 0.7935141950845719, + "num_tokens": 3923110.0, + "step": 3260 + }, + { + "entropy": 1.8103605896234511, + "epoch": 0.01013671389125042, + "grad_norm": 11.845626831054688, + "learning_rate": 1.5833575510994868e-06, + "loss": 0.5957, + "mean_token_accuracy": 0.8122652113437653, + "num_tokens": 3936835.0, + "step": 3270 + }, + { + "entropy": 1.9789897188544274, + "epoch": 0.010167713016300114, + "grad_norm": 16.31435775756836, + "learning_rate": 1.5882011043301369e-06, + "loss": 0.6931, + "mean_token_accuracy": 0.790991485118866, + "num_tokens": 3948691.0, + "step": 3280 + }, + { + "entropy": 1.9635705202817917, + "epoch": 0.01019871214134981, + "grad_norm": 18.17036247253418, + "learning_rate": 1.5930446575607866e-06, + "loss": 0.7027, + "mean_token_accuracy": 0.7976090654730796, + "num_tokens": 3960588.0, + "step": 3290 + }, + { + "entropy": 1.8178722724318503, + "epoch": 0.010229711266399506, + "grad_norm": 4.42231559753418, + "learning_rate": 1.5978882107914367e-06, + "loss": 0.6036, + "mean_token_accuracy": 0.8080889210104942, + "num_tokens": 3974030.0, + "step": 3300 + }, + { + "entropy": 2.014702117443085, + "epoch": 0.010260710391449202, + "grad_norm": 13.163649559020996, + "learning_rate": 1.6027317640220868e-06, + "loss": 0.7217, + "mean_token_accuracy": 0.7939134731888771, + "num_tokens": 3984810.0, + "step": 3310 + }, + { + "entropy": 1.9950988233089446, + "epoch": 0.010291709516498897, + "grad_norm": 15.815617561340332, + "learning_rate": 1.6075753172527366e-06, + "loss": 0.7374, + "mean_token_accuracy": 0.7863194987177848, + "num_tokens": 3996191.0, + "step": 3320 + }, + { + "entropy": 1.9132106304168701, + "epoch": 0.010322708641548592, + "grad_norm": 15.737754821777344, + "learning_rate": 1.6124188704833867e-06, + "loss": 0.7396, + "mean_token_accuracy": 0.7841628924012184, + "num_tokens": 4007285.0, + "step": 3330 + }, + { + "entropy": 1.8114167541265487, + "epoch": 0.010353707766598289, + "grad_norm": 13.689631462097168, + "learning_rate": 1.6172624237140366e-06, + "loss": 0.5893, + "mean_token_accuracy": 0.8101338252425194, + "num_tokens": 4020382.0, + "step": 3340 + }, + { + "entropy": 1.8398823648691178, + "epoch": 0.010384706891647983, + "grad_norm": 14.286563873291016, + "learning_rate": 1.6221059769446867e-06, + "loss": 0.6389, + "mean_token_accuracy": 0.8049512296915055, + "num_tokens": 4032030.0, + "step": 3350 + }, + { + "entropy": 1.9326018333435058, + "epoch": 0.010415706016697678, + "grad_norm": 16.373798370361328, + "learning_rate": 1.6269495301753368e-06, + "loss": 0.7155, + "mean_token_accuracy": 0.7966372415423393, + "num_tokens": 4043118.0, + "step": 3360 + }, + { + "entropy": 1.905664649605751, + "epoch": 0.010446705141747375, + "grad_norm": 12.13680648803711, + "learning_rate": 1.6317930834059867e-06, + "loss": 0.7183, + "mean_token_accuracy": 0.7907021954655647, + "num_tokens": 4055085.0, + "step": 3370 + }, + { + "entropy": 1.7940703511238099, + "epoch": 0.01047770426679707, + "grad_norm": 6.879446983337402, + "learning_rate": 1.6366366366366368e-06, + "loss": 0.5276, + "mean_token_accuracy": 0.8221948266029357, + "num_tokens": 4067929.0, + "step": 3380 + }, + { + "entropy": 1.964170092344284, + "epoch": 0.010508703391846765, + "grad_norm": 17.012205123901367, + "learning_rate": 1.6414801898672869e-06, + "loss": 0.7205, + "mean_token_accuracy": 0.7949927791953086, + "num_tokens": 4079191.0, + "step": 3390 + }, + { + "entropy": 1.9278406336903573, + "epoch": 0.010539702516896461, + "grad_norm": 15.772573471069336, + "learning_rate": 1.6463237430979368e-06, + "loss": 0.6634, + "mean_token_accuracy": 0.7951213136315346, + "num_tokens": 4090571.0, + "step": 3400 + }, + { + "entropy": 1.894165250658989, + "epoch": 0.010570701641946156, + "grad_norm": 7.302316188812256, + "learning_rate": 1.6511672963285869e-06, + "loss": 0.6412, + "mean_token_accuracy": 0.8126410812139511, + "num_tokens": 4102478.0, + "step": 3410 + }, + { + "entropy": 1.8725802972912788, + "epoch": 0.010601700766995851, + "grad_norm": 12.873794555664062, + "learning_rate": 1.6560108495592367e-06, + "loss": 0.6578, + "mean_token_accuracy": 0.808172582089901, + "num_tokens": 4114349.0, + "step": 3420 + }, + { + "entropy": 1.9037815779447556, + "epoch": 0.010632699892045547, + "grad_norm": 6.517296314239502, + "learning_rate": 1.6608544027898868e-06, + "loss": 0.6275, + "mean_token_accuracy": 0.8006631091237069, + "num_tokens": 4126609.0, + "step": 3430 + }, + { + "entropy": 1.9055060788989067, + "epoch": 0.010663699017095242, + "grad_norm": 15.855356216430664, + "learning_rate": 1.665697956020537e-06, + "loss": 0.7089, + "mean_token_accuracy": 0.7951321870088577, + "num_tokens": 4138476.0, + "step": 3440 + }, + { + "entropy": 1.9123858138918877, + "epoch": 0.010694698142144937, + "grad_norm": 14.749032020568848, + "learning_rate": 1.6705415092511868e-06, + "loss": 0.6421, + "mean_token_accuracy": 0.799940338730812, + "num_tokens": 4150474.0, + "step": 3450 + }, + { + "entropy": 1.9116358309984207, + "epoch": 0.010725697267194634, + "grad_norm": 13.992262840270996, + "learning_rate": 1.675385062481837e-06, + "loss": 0.5853, + "mean_token_accuracy": 0.8097973421216011, + "num_tokens": 4162977.0, + "step": 3460 + }, + { + "entropy": 1.9737422183156013, + "epoch": 0.010756696392244329, + "grad_norm": 19.65887451171875, + "learning_rate": 1.6802286157124866e-06, + "loss": 0.7513, + "mean_token_accuracy": 0.786674790084362, + "num_tokens": 4174740.0, + "step": 3470 + }, + { + "entropy": 1.9084539324045182, + "epoch": 0.010787695517294024, + "grad_norm": 7.143436908721924, + "learning_rate": 1.685072168943137e-06, + "loss": 0.6752, + "mean_token_accuracy": 0.8009995728731155, + "num_tokens": 4187420.0, + "step": 3480 + }, + { + "entropy": 1.9033571183681488, + "epoch": 0.01081869464234372, + "grad_norm": 12.777674674987793, + "learning_rate": 1.689915722173787e-06, + "loss": 0.6786, + "mean_token_accuracy": 0.8100510269403458, + "num_tokens": 4198138.0, + "step": 3490 + }, + { + "entropy": 1.8302487596869468, + "epoch": 0.010849693767393415, + "grad_norm": 14.17345905303955, + "learning_rate": 1.6947592754044367e-06, + "loss": 0.6558, + "mean_token_accuracy": 0.7974511593580246, + "num_tokens": 4210639.0, + "step": 3500 + }, + { + "entropy": 1.8793094977736473, + "epoch": 0.010880692892443112, + "grad_norm": 15.137555122375488, + "learning_rate": 1.6996028286350868e-06, + "loss": 0.6217, + "mean_token_accuracy": 0.8036959484219551, + "num_tokens": 4222201.0, + "step": 3510 + }, + { + "entropy": 1.8393305256962775, + "epoch": 0.010911692017492806, + "grad_norm": 12.150157928466797, + "learning_rate": 1.704446381865737e-06, + "loss": 0.6265, + "mean_token_accuracy": 0.8053354471921921, + "num_tokens": 4234129.0, + "step": 3520 + }, + { + "entropy": 1.8129374995827674, + "epoch": 0.010942691142542501, + "grad_norm": 6.902397155761719, + "learning_rate": 1.7092899350963867e-06, + "loss": 0.5879, + "mean_token_accuracy": 0.8074716746807098, + "num_tokens": 4247360.0, + "step": 3530 + }, + { + "entropy": 1.847067180275917, + "epoch": 0.010973690267592198, + "grad_norm": 9.06917953491211, + "learning_rate": 1.7141334883270368e-06, + "loss": 0.6657, + "mean_token_accuracy": 0.7977814689278603, + "num_tokens": 4260605.0, + "step": 3540 + }, + { + "entropy": 1.935109367966652, + "epoch": 0.011004689392641893, + "grad_norm": 13.947689056396484, + "learning_rate": 1.7189770415576867e-06, + "loss": 0.6719, + "mean_token_accuracy": 0.8019600152969361, + "num_tokens": 4271987.0, + "step": 3550 + }, + { + "entropy": 1.8618975296616553, + "epoch": 0.011035688517691588, + "grad_norm": 7.604375839233398, + "learning_rate": 1.7238205947883368e-06, + "loss": 0.6746, + "mean_token_accuracy": 0.7953737512230873, + "num_tokens": 4283981.0, + "step": 3560 + }, + { + "entropy": 1.9873407423496245, + "epoch": 0.011066687642741284, + "grad_norm": 12.513638496398926, + "learning_rate": 1.728664148018987e-06, + "loss": 0.8001, + "mean_token_accuracy": 0.7810007587075234, + "num_tokens": 4295314.0, + "step": 3570 + }, + { + "entropy": 1.8661947965621948, + "epoch": 0.011097686767790979, + "grad_norm": 8.476912498474121, + "learning_rate": 1.7335077012496368e-06, + "loss": 0.6368, + "mean_token_accuracy": 0.8108328223228455, + "num_tokens": 4306919.0, + "step": 3580 + }, + { + "entropy": 1.9405100882053374, + "epoch": 0.011128685892840674, + "grad_norm": 11.374967575073242, + "learning_rate": 1.7383512544802869e-06, + "loss": 0.7597, + "mean_token_accuracy": 0.7805906578898429, + "num_tokens": 4318405.0, + "step": 3590 + }, + { + "entropy": 1.8536500945687293, + "epoch": 0.01115968501789037, + "grad_norm": 17.95821762084961, + "learning_rate": 1.743194807710937e-06, + "loss": 0.6538, + "mean_token_accuracy": 0.7971635937690735, + "num_tokens": 4330730.0, + "step": 3600 + }, + { + "entropy": 1.8885389596223832, + "epoch": 0.011190684142940065, + "grad_norm": 15.392064094543457, + "learning_rate": 1.7480383609415869e-06, + "loss": 0.6708, + "mean_token_accuracy": 0.8034435287117958, + "num_tokens": 4341732.0, + "step": 3610 + }, + { + "entropy": 1.9072374895215034, + "epoch": 0.01122168326798976, + "grad_norm": 15.804481506347656, + "learning_rate": 1.752881914172237e-06, + "loss": 0.7122, + "mean_token_accuracy": 0.7888311371207237, + "num_tokens": 4352944.0, + "step": 3620 + }, + { + "entropy": 1.8851828515529632, + "epoch": 0.011252682393039457, + "grad_norm": 14.782668113708496, + "learning_rate": 1.7577254674028869e-06, + "loss": 0.6998, + "mean_token_accuracy": 0.7909768223762512, + "num_tokens": 4363879.0, + "step": 3630 + }, + { + "entropy": 1.8380015999078751, + "epoch": 0.011283681518089152, + "grad_norm": 14.773683547973633, + "learning_rate": 1.762569020633537e-06, + "loss": 0.6264, + "mean_token_accuracy": 0.8070567846298218, + "num_tokens": 4375759.0, + "step": 3640 + }, + { + "entropy": 1.8058712944388389, + "epoch": 0.011314680643138847, + "grad_norm": 13.945756912231445, + "learning_rate": 1.767412573864187e-06, + "loss": 0.5706, + "mean_token_accuracy": 0.8059628069400787, + "num_tokens": 4389341.0, + "step": 3650 + }, + { + "entropy": 1.8813235729932785, + "epoch": 0.011345679768188543, + "grad_norm": 12.711499214172363, + "learning_rate": 1.772256127094837e-06, + "loss": 0.6448, + "mean_token_accuracy": 0.8153657332062721, + "num_tokens": 4400801.0, + "step": 3660 + }, + { + "entropy": 1.849845513701439, + "epoch": 0.011376678893238238, + "grad_norm": 15.230743408203125, + "learning_rate": 1.777099680325487e-06, + "loss": 0.5924, + "mean_token_accuracy": 0.8115788295865058, + "num_tokens": 4413458.0, + "step": 3670 + }, + { + "entropy": 1.9601149141788483, + "epoch": 0.011407678018287935, + "grad_norm": 13.785551071166992, + "learning_rate": 1.7819432335561367e-06, + "loss": 0.7572, + "mean_token_accuracy": 0.7834953621029854, + "num_tokens": 4424999.0, + "step": 3680 + }, + { + "entropy": 1.8368350595235825, + "epoch": 0.01143867714333763, + "grad_norm": 7.308245658874512, + "learning_rate": 1.786786786786787e-06, + "loss": 0.5917, + "mean_token_accuracy": 0.8099960327148438, + "num_tokens": 4436854.0, + "step": 3690 + }, + { + "entropy": 1.8301602736115457, + "epoch": 0.011469676268387324, + "grad_norm": 13.396066665649414, + "learning_rate": 1.791630340017437e-06, + "loss": 0.61, + "mean_token_accuracy": 0.8089475125074387, + "num_tokens": 4449388.0, + "step": 3700 + }, + { + "entropy": 1.9232326075434685, + "epoch": 0.011500675393437021, + "grad_norm": 14.820940971374512, + "learning_rate": 1.7964738932480868e-06, + "loss": 0.7029, + "mean_token_accuracy": 0.7972525000572205, + "num_tokens": 4460601.0, + "step": 3710 + }, + { + "entropy": 1.9275172814726829, + "epoch": 0.011531674518486716, + "grad_norm": 13.760834693908691, + "learning_rate": 1.8013174464787369e-06, + "loss": 0.67, + "mean_token_accuracy": 0.7982899993658066, + "num_tokens": 4472147.0, + "step": 3720 + }, + { + "entropy": 1.841733305156231, + "epoch": 0.01156267364353641, + "grad_norm": 4.695168495178223, + "learning_rate": 1.8061609997093872e-06, + "loss": 0.604, + "mean_token_accuracy": 0.8182388171553612, + "num_tokens": 4484477.0, + "step": 3730 + }, + { + "entropy": 1.8292774349451064, + "epoch": 0.011593672768586107, + "grad_norm": 5.965690612792969, + "learning_rate": 1.8110045529400368e-06, + "loss": 0.5746, + "mean_token_accuracy": 0.8187378898262978, + "num_tokens": 4497212.0, + "step": 3740 + }, + { + "entropy": 1.8096903830766677, + "epoch": 0.011624671893635802, + "grad_norm": 6.74788761138916, + "learning_rate": 1.815848106170687e-06, + "loss": 0.5768, + "mean_token_accuracy": 0.8042061701416969, + "num_tokens": 4509570.0, + "step": 3750 + }, + { + "entropy": 1.8733750000596046, + "epoch": 0.011655671018685497, + "grad_norm": 15.247547149658203, + "learning_rate": 1.8206916594013368e-06, + "loss": 0.6363, + "mean_token_accuracy": 0.7944400921463967, + "num_tokens": 4521995.0, + "step": 3760 + }, + { + "entropy": 1.9379871144890786, + "epoch": 0.011686670143735194, + "grad_norm": 5.622687816619873, + "learning_rate": 1.825535212631987e-06, + "loss": 0.6895, + "mean_token_accuracy": 0.7910193100571632, + "num_tokens": 4533546.0, + "step": 3770 + }, + { + "entropy": 1.8144460648298264, + "epoch": 0.011717669268784888, + "grad_norm": 7.539439678192139, + "learning_rate": 1.830378765862637e-06, + "loss": 0.6076, + "mean_token_accuracy": 0.8145319998264313, + "num_tokens": 4546173.0, + "step": 3780 + }, + { + "entropy": 1.914628729224205, + "epoch": 0.011748668393834583, + "grad_norm": 15.919760704040527, + "learning_rate": 1.835222319093287e-06, + "loss": 0.6686, + "mean_token_accuracy": 0.8031175434589386, + "num_tokens": 4557017.0, + "step": 3790 + }, + { + "entropy": 1.873679694533348, + "epoch": 0.01177966751888428, + "grad_norm": 15.300971031188965, + "learning_rate": 1.840065872323937e-06, + "loss": 0.6355, + "mean_token_accuracy": 0.7994208186864853, + "num_tokens": 4569294.0, + "step": 3800 + }, + { + "entropy": 1.9676134511828423, + "epoch": 0.011810666643933975, + "grad_norm": 14.174129486083984, + "learning_rate": 1.844909425554587e-06, + "loss": 0.709, + "mean_token_accuracy": 0.7975045815110207, + "num_tokens": 4580270.0, + "step": 3810 + }, + { + "entropy": 1.7934204161167144, + "epoch": 0.01184166576898367, + "grad_norm": 14.214337348937988, + "learning_rate": 1.849752978785237e-06, + "loss": 0.5367, + "mean_token_accuracy": 0.8222482308745385, + "num_tokens": 4593333.0, + "step": 3820 + }, + { + "entropy": 1.880036075413227, + "epoch": 0.011872664894033366, + "grad_norm": 13.970633506774902, + "learning_rate": 1.854596532015887e-06, + "loss": 0.6634, + "mean_token_accuracy": 0.7958371922373771, + "num_tokens": 4605903.0, + "step": 3830 + }, + { + "entropy": 1.8512480556964874, + "epoch": 0.011903664019083061, + "grad_norm": 11.63100814819336, + "learning_rate": 1.859440085246537e-06, + "loss": 0.5904, + "mean_token_accuracy": 0.8205496445298195, + "num_tokens": 4618539.0, + "step": 3840 + }, + { + "entropy": 1.9032649219036102, + "epoch": 0.011934663144132758, + "grad_norm": 14.786650657653809, + "learning_rate": 1.864283638477187e-06, + "loss": 0.6271, + "mean_token_accuracy": 0.7999899938702584, + "num_tokens": 4630679.0, + "step": 3850 + }, + { + "entropy": 1.8507294937968255, + "epoch": 0.011965662269182453, + "grad_norm": 15.505373001098633, + "learning_rate": 1.8691271917078371e-06, + "loss": 0.5308, + "mean_token_accuracy": 0.8145019263029099, + "num_tokens": 4644442.0, + "step": 3860 + }, + { + "entropy": 2.043610119819641, + "epoch": 0.011996661394232147, + "grad_norm": 17.056150436401367, + "learning_rate": 1.873970744938487e-06, + "loss": 0.719, + "mean_token_accuracy": 0.7810810253024101, + "num_tokens": 4654902.0, + "step": 3870 + }, + { + "entropy": 1.9894573852419852, + "epoch": 0.012027660519281844, + "grad_norm": 14.494179725646973, + "learning_rate": 1.8788142981691371e-06, + "loss": 0.6273, + "mean_token_accuracy": 0.8132184654474258, + "num_tokens": 4665669.0, + "step": 3880 + }, + { + "entropy": 2.0583216458559037, + "epoch": 0.012058659644331539, + "grad_norm": 12.334035873413086, + "learning_rate": 1.883657851399787e-06, + "loss": 0.7878, + "mean_token_accuracy": 0.7764063253998756, + "num_tokens": 4676308.0, + "step": 3890 + }, + { + "entropy": 1.8608665503561497, + "epoch": 0.012089658769381234, + "grad_norm": 6.846446514129639, + "learning_rate": 1.8885014046304371e-06, + "loss": 0.6183, + "mean_token_accuracy": 0.8085096433758736, + "num_tokens": 4689421.0, + "step": 3900 + }, + { + "entropy": 1.957814833521843, + "epoch": 0.01212065789443093, + "grad_norm": 12.504427909851074, + "learning_rate": 1.8933449578610872e-06, + "loss": 0.6852, + "mean_token_accuracy": 0.8001207739114762, + "num_tokens": 4701492.0, + "step": 3910 + }, + { + "entropy": 1.8347266063094139, + "epoch": 0.012151657019480625, + "grad_norm": 15.001429557800293, + "learning_rate": 1.8981885110917369e-06, + "loss": 0.5869, + "mean_token_accuracy": 0.81041249781847, + "num_tokens": 4714252.0, + "step": 3920 + }, + { + "entropy": 1.9000362247228622, + "epoch": 0.01218265614453032, + "grad_norm": 15.871277809143066, + "learning_rate": 1.903032064322387e-06, + "loss": 0.6574, + "mean_token_accuracy": 0.8029514774680138, + "num_tokens": 4725743.0, + "step": 3930 + }, + { + "entropy": 1.9085357084870338, + "epoch": 0.012213655269580017, + "grad_norm": 11.861626625061035, + "learning_rate": 1.907875617553037e-06, + "loss": 0.6232, + "mean_token_accuracy": 0.8074530705809593, + "num_tokens": 4737602.0, + "step": 3940 + }, + { + "entropy": 1.8550396144390107, + "epoch": 0.012244654394629712, + "grad_norm": 8.315388679504395, + "learning_rate": 1.912719170783687e-06, + "loss": 0.6392, + "mean_token_accuracy": 0.7964627891778946, + "num_tokens": 4749843.0, + "step": 3950 + }, + { + "entropy": 1.8868352055549622, + "epoch": 0.012275653519679406, + "grad_norm": 6.1222028732299805, + "learning_rate": 1.9175627240143373e-06, + "loss": 0.6339, + "mean_token_accuracy": 0.8049155220389366, + "num_tokens": 4762341.0, + "step": 3960 + }, + { + "entropy": 1.9090922087430955, + "epoch": 0.012306652644729103, + "grad_norm": 13.033318519592285, + "learning_rate": 1.922406277244987e-06, + "loss": 0.7009, + "mean_token_accuracy": 0.7983160421252251, + "num_tokens": 4774156.0, + "step": 3970 + }, + { + "entropy": 1.8667214959859848, + "epoch": 0.012337651769778798, + "grad_norm": 13.20643424987793, + "learning_rate": 1.927249830475637e-06, + "loss": 0.6541, + "mean_token_accuracy": 0.7958778321743012, + "num_tokens": 4786644.0, + "step": 3980 + }, + { + "entropy": 1.8116755485534668, + "epoch": 0.012368650894828493, + "grad_norm": 4.642879962921143, + "learning_rate": 1.9320933837062873e-06, + "loss": 0.5928, + "mean_token_accuracy": 0.8170793354511261, + "num_tokens": 4799594.0, + "step": 3990 + }, + { + "entropy": 1.7941434875130653, + "epoch": 0.01239965001987819, + "grad_norm": 13.760114669799805, + "learning_rate": 1.9369369369369372e-06, + "loss": 0.5889, + "mean_token_accuracy": 0.8124065786600113, + "num_tokens": 4811994.0, + "step": 4000 + }, + { + "entropy": 1.9247236236929894, + "epoch": 0.012430649144927884, + "grad_norm": 13.147157669067383, + "learning_rate": 1.941780490167587e-06, + "loss": 0.7134, + "mean_token_accuracy": 0.7897293627262115, + "num_tokens": 4823158.0, + "step": 4010 + }, + { + "entropy": 1.853769588470459, + "epoch": 0.01246164826997758, + "grad_norm": 11.97884750366211, + "learning_rate": 1.9466240433982374e-06, + "loss": 0.6361, + "mean_token_accuracy": 0.8066916093230247, + "num_tokens": 4834679.0, + "step": 4020 + }, + { + "entropy": 1.8805758222937583, + "epoch": 0.012492647395027276, + "grad_norm": 13.70005989074707, + "learning_rate": 1.951467596628887e-06, + "loss": 0.684, + "mean_token_accuracy": 0.7904590263962745, + "num_tokens": 4846234.0, + "step": 4030 + }, + { + "entropy": 1.9497404143214225, + "epoch": 0.01252364652007697, + "grad_norm": 5.448077201843262, + "learning_rate": 1.956311149859537e-06, + "loss": 0.7173, + "mean_token_accuracy": 0.7891417220234871, + "num_tokens": 4857890.0, + "step": 4040 + }, + { + "entropy": 1.8685417965054512, + "epoch": 0.012554645645126667, + "grad_norm": 13.170268058776855, + "learning_rate": 1.961154703090187e-06, + "loss": 0.6227, + "mean_token_accuracy": 0.8130503103137017, + "num_tokens": 4869779.0, + "step": 4050 + }, + { + "entropy": 1.91714718490839, + "epoch": 0.012585644770176362, + "grad_norm": 13.95510196685791, + "learning_rate": 1.965998256320837e-06, + "loss": 0.6498, + "mean_token_accuracy": 0.7988581836223603, + "num_tokens": 4881872.0, + "step": 4060 + }, + { + "entropy": 1.8488874793052674, + "epoch": 0.012616643895226057, + "grad_norm": 15.533134460449219, + "learning_rate": 1.9708418095514873e-06, + "loss": 0.6223, + "mean_token_accuracy": 0.8086504280567169, + "num_tokens": 4893930.0, + "step": 4070 + }, + { + "entropy": 1.827307391166687, + "epoch": 0.012647643020275753, + "grad_norm": 14.92998218536377, + "learning_rate": 1.975685362782137e-06, + "loss": 0.6516, + "mean_token_accuracy": 0.7971120476722717, + "num_tokens": 4905551.0, + "step": 4080 + }, + { + "entropy": 1.8286997452378273, + "epoch": 0.012678642145325448, + "grad_norm": 14.836496353149414, + "learning_rate": 1.980528916012787e-06, + "loss": 0.6449, + "mean_token_accuracy": 0.7994356840848923, + "num_tokens": 4917025.0, + "step": 4090 + }, + { + "entropy": 1.8299384236335754, + "epoch": 0.012709641270375143, + "grad_norm": 13.094915390014648, + "learning_rate": 1.985372469243437e-06, + "loss": 0.5917, + "mean_token_accuracy": 0.8010865300893784, + "num_tokens": 4930128.0, + "step": 4100 + }, + { + "entropy": 1.895867148041725, + "epoch": 0.01274064039542484, + "grad_norm": 13.526707649230957, + "learning_rate": 1.990216022474087e-06, + "loss": 0.7135, + "mean_token_accuracy": 0.7984941571950912, + "num_tokens": 4941517.0, + "step": 4110 + }, + { + "entropy": 1.89191782027483, + "epoch": 0.012771639520474535, + "grad_norm": 14.294553756713867, + "learning_rate": 1.995059575704737e-06, + "loss": 0.6931, + "mean_token_accuracy": 0.7926407441496849, + "num_tokens": 4952810.0, + "step": 4120 + }, + { + "entropy": 1.8057468429207801, + "epoch": 0.01280263864552423, + "grad_norm": 12.936646461486816, + "learning_rate": 1.999903128935387e-06, + "loss": 0.6215, + "mean_token_accuracy": 0.806446696817875, + "num_tokens": 4965259.0, + "step": 4130 + }, + { + "entropy": 1.8559681817889213, + "epoch": 0.012833637770573926, + "grad_norm": 14.09201431274414, + "learning_rate": 2.0047466821660373e-06, + "loss": 0.6436, + "mean_token_accuracy": 0.8033310145139694, + "num_tokens": 4976636.0, + "step": 4140 + }, + { + "entropy": 1.754169662296772, + "epoch": 0.012864636895623621, + "grad_norm": 6.45542573928833, + "learning_rate": 2.009590235396687e-06, + "loss": 0.5474, + "mean_token_accuracy": 0.8193281710147857, + "num_tokens": 4989310.0, + "step": 4150 + }, + { + "entropy": 1.755221924185753, + "epoch": 0.012895636020673316, + "grad_norm": 6.969062328338623, + "learning_rate": 2.014433788627337e-06, + "loss": 0.5726, + "mean_token_accuracy": 0.8212425723671913, + "num_tokens": 5001740.0, + "step": 4160 + }, + { + "entropy": 1.923591212928295, + "epoch": 0.012926635145723012, + "grad_norm": 16.79177474975586, + "learning_rate": 2.0192773418579874e-06, + "loss": 0.7115, + "mean_token_accuracy": 0.7964399144053459, + "num_tokens": 5012837.0, + "step": 4170 + }, + { + "entropy": 1.789121389389038, + "epoch": 0.012957634270772707, + "grad_norm": 6.560935974121094, + "learning_rate": 2.0241208950886372e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8209047600626945, + "num_tokens": 5026059.0, + "step": 4180 + }, + { + "entropy": 1.8681050762534142, + "epoch": 0.012988633395822402, + "grad_norm": 13.890046119689941, + "learning_rate": 2.028964448319287e-06, + "loss": 0.614, + "mean_token_accuracy": 0.7979257851839066, + "num_tokens": 5038464.0, + "step": 4190 + }, + { + "entropy": 1.844995491206646, + "epoch": 0.013019632520872099, + "grad_norm": 13.706587791442871, + "learning_rate": 2.0338080015499374e-06, + "loss": 0.655, + "mean_token_accuracy": 0.8055862948298455, + "num_tokens": 5051036.0, + "step": 4200 + }, + { + "entropy": 1.8614793449640274, + "epoch": 0.013050631645921794, + "grad_norm": 14.581295013427734, + "learning_rate": 2.0386515547805873e-06, + "loss": 0.6409, + "mean_token_accuracy": 0.8066158786416053, + "num_tokens": 5063162.0, + "step": 4210 + }, + { + "entropy": 1.8593030095100402, + "epoch": 0.01308163077097149, + "grad_norm": 13.553071022033691, + "learning_rate": 2.043495108011237e-06, + "loss": 0.5952, + "mean_token_accuracy": 0.8146178603172303, + "num_tokens": 5075545.0, + "step": 4220 + }, + { + "entropy": 1.8848890259861946, + "epoch": 0.013112629896021185, + "grad_norm": 17.947132110595703, + "learning_rate": 2.0483386612418875e-06, + "loss": 0.6514, + "mean_token_accuracy": 0.7945050925016404, + "num_tokens": 5087372.0, + "step": 4230 + }, + { + "entropy": 1.880325546860695, + "epoch": 0.01314362902107088, + "grad_norm": 15.84373950958252, + "learning_rate": 2.053182214472537e-06, + "loss": 0.6546, + "mean_token_accuracy": 0.8056602075695991, + "num_tokens": 5098820.0, + "step": 4240 + }, + { + "entropy": 1.9198369443416596, + "epoch": 0.013174628146120576, + "grad_norm": 14.227551460266113, + "learning_rate": 2.0580257677031873e-06, + "loss": 0.6509, + "mean_token_accuracy": 0.8100201249122619, + "num_tokens": 5109708.0, + "step": 4250 + }, + { + "entropy": 1.95897875726223, + "epoch": 0.013205627271170271, + "grad_norm": 15.566633224487305, + "learning_rate": 2.062869320933837e-06, + "loss": 0.6949, + "mean_token_accuracy": 0.7956610158085823, + "num_tokens": 5121454.0, + "step": 4260 + }, + { + "entropy": 1.8369690001010894, + "epoch": 0.013236626396219966, + "grad_norm": 16.143478393554688, + "learning_rate": 2.067712874164487e-06, + "loss": 0.5837, + "mean_token_accuracy": 0.8159258916974068, + "num_tokens": 5134141.0, + "step": 4270 + }, + { + "entropy": 1.912563818693161, + "epoch": 0.013267625521269663, + "grad_norm": 15.098668098449707, + "learning_rate": 2.0725564273951374e-06, + "loss": 0.6755, + "mean_token_accuracy": 0.7963416025042533, + "num_tokens": 5146353.0, + "step": 4280 + }, + { + "entropy": 1.9880844503641129, + "epoch": 0.013298624646319358, + "grad_norm": 14.18931770324707, + "learning_rate": 2.0773999806257872e-06, + "loss": 0.7678, + "mean_token_accuracy": 0.7802196651697159, + "num_tokens": 5157703.0, + "step": 4290 + }, + { + "entropy": 1.8342043563723565, + "epoch": 0.013329623771369052, + "grad_norm": 6.213775157928467, + "learning_rate": 2.082243533856437e-06, + "loss": 0.5523, + "mean_token_accuracy": 0.8177143961191178, + "num_tokens": 5170695.0, + "step": 4300 + }, + { + "entropy": 1.8414741292595864, + "epoch": 0.013360622896418749, + "grad_norm": 12.719599723815918, + "learning_rate": 2.087087087087087e-06, + "loss": 0.5698, + "mean_token_accuracy": 0.8134493753314018, + "num_tokens": 5183018.0, + "step": 4310 + }, + { + "entropy": 1.8449064493179321, + "epoch": 0.013391622021468444, + "grad_norm": 12.744540214538574, + "learning_rate": 2.0919306403177373e-06, + "loss": 0.6141, + "mean_token_accuracy": 0.7993644192814827, + "num_tokens": 5195549.0, + "step": 4320 + }, + { + "entropy": 1.9135398477315904, + "epoch": 0.013422621146518139, + "grad_norm": 15.205565452575684, + "learning_rate": 2.096774193548387e-06, + "loss": 0.6636, + "mean_token_accuracy": 0.8040877833962441, + "num_tokens": 5207595.0, + "step": 4330 + }, + { + "entropy": 1.892987634241581, + "epoch": 0.013453620271567835, + "grad_norm": 14.970274925231934, + "learning_rate": 2.101617746779037e-06, + "loss": 0.6144, + "mean_token_accuracy": 0.8050429001450539, + "num_tokens": 5220012.0, + "step": 4340 + }, + { + "entropy": 1.933681371808052, + "epoch": 0.01348461939661753, + "grad_norm": 16.078292846679688, + "learning_rate": 2.1064613000096874e-06, + "loss": 0.6802, + "mean_token_accuracy": 0.7953403264284133, + "num_tokens": 5232144.0, + "step": 4350 + }, + { + "entropy": 1.8837040960788727, + "epoch": 0.013515618521667225, + "grad_norm": 12.591126441955566, + "learning_rate": 2.1113048532403373e-06, + "loss": 0.6592, + "mean_token_accuracy": 0.8096277773380279, + "num_tokens": 5244523.0, + "step": 4360 + }, + { + "entropy": 1.8900502383708955, + "epoch": 0.013546617646716922, + "grad_norm": 7.1288604736328125, + "learning_rate": 2.116148406470987e-06, + "loss": 0.6995, + "mean_token_accuracy": 0.797728767991066, + "num_tokens": 5255816.0, + "step": 4370 + }, + { + "entropy": 1.8959971249103547, + "epoch": 0.013577616771766617, + "grad_norm": 8.134716987609863, + "learning_rate": 2.1209919597016375e-06, + "loss": 0.6631, + "mean_token_accuracy": 0.7931605547666549, + "num_tokens": 5268217.0, + "step": 4380 + }, + { + "entropy": 1.8685301005840302, + "epoch": 0.013608615896816313, + "grad_norm": 14.050363540649414, + "learning_rate": 2.1258355129322874e-06, + "loss": 0.5861, + "mean_token_accuracy": 0.7998355850577354, + "num_tokens": 5280627.0, + "step": 4390 + }, + { + "entropy": 1.9573094069957733, + "epoch": 0.013639615021866008, + "grad_norm": 12.361193656921387, + "learning_rate": 2.1306790661629372e-06, + "loss": 0.6558, + "mean_token_accuracy": 0.797396968305111, + "num_tokens": 5291443.0, + "step": 4400 + }, + { + "entropy": 1.8597141653299332, + "epoch": 0.013670614146915703, + "grad_norm": 16.280147552490234, + "learning_rate": 2.1355226193935875e-06, + "loss": 0.6441, + "mean_token_accuracy": 0.7984641641378403, + "num_tokens": 5303414.0, + "step": 4410 + }, + { + "entropy": 1.8121001735329627, + "epoch": 0.0137016132719654, + "grad_norm": 19.821773529052734, + "learning_rate": 2.1403661726242374e-06, + "loss": 0.5859, + "mean_token_accuracy": 0.8085174813866616, + "num_tokens": 5316024.0, + "step": 4420 + }, + { + "entropy": 1.8680142611265182, + "epoch": 0.013732612397015094, + "grad_norm": 14.235099792480469, + "learning_rate": 2.1452097258548873e-06, + "loss": 0.6586, + "mean_token_accuracy": 0.7996479585766793, + "num_tokens": 5327955.0, + "step": 4430 + }, + { + "entropy": 1.8850964441895486, + "epoch": 0.01376361152206479, + "grad_norm": 12.817394256591797, + "learning_rate": 2.1500532790855376e-06, + "loss": 0.6506, + "mean_token_accuracy": 0.7959688842296601, + "num_tokens": 5340123.0, + "step": 4440 + }, + { + "entropy": 1.9236231684684753, + "epoch": 0.013794610647114486, + "grad_norm": 13.93416976928711, + "learning_rate": 2.1548968323161875e-06, + "loss": 0.6866, + "mean_token_accuracy": 0.8000624299049377, + "num_tokens": 5351902.0, + "step": 4450 + }, + { + "entropy": 1.8246183797717095, + "epoch": 0.01382560977216418, + "grad_norm": 6.33852481842041, + "learning_rate": 2.1597403855468374e-06, + "loss": 0.5937, + "mean_token_accuracy": 0.8101627513766289, + "num_tokens": 5364681.0, + "step": 4460 + }, + { + "entropy": 1.896468523144722, + "epoch": 0.013856608897213876, + "grad_norm": 15.372577667236328, + "learning_rate": 2.1645839387774873e-06, + "loss": 0.6683, + "mean_token_accuracy": 0.8046876505017281, + "num_tokens": 5376647.0, + "step": 4470 + }, + { + "entropy": 1.8945748567581178, + "epoch": 0.013887608022263572, + "grad_norm": 14.416013717651367, + "learning_rate": 2.169427492008137e-06, + "loss": 0.6216, + "mean_token_accuracy": 0.8085326358675957, + "num_tokens": 5388517.0, + "step": 4480 + }, + { + "entropy": 1.9351640045642853, + "epoch": 0.013918607147313267, + "grad_norm": 11.383118629455566, + "learning_rate": 2.1742710452387875e-06, + "loss": 0.7617, + "mean_token_accuracy": 0.7835462704300881, + "num_tokens": 5399458.0, + "step": 4490 + }, + { + "entropy": 1.873738704621792, + "epoch": 0.013949606272362962, + "grad_norm": 14.66506576538086, + "learning_rate": 2.1791145984694373e-06, + "loss": 0.6014, + "mean_token_accuracy": 0.8175080880522728, + "num_tokens": 5411219.0, + "step": 4500 + }, + { + "entropy": 1.9110942378640174, + "epoch": 0.013980605397412658, + "grad_norm": 7.845452308654785, + "learning_rate": 2.1839581517000872e-06, + "loss": 0.6462, + "mean_token_accuracy": 0.799024523794651, + "num_tokens": 5423441.0, + "step": 4510 + }, + { + "entropy": 1.8728701308369637, + "epoch": 0.014011604522462353, + "grad_norm": 14.299894332885742, + "learning_rate": 2.188801704930737e-06, + "loss": 0.6333, + "mean_token_accuracy": 0.7987611889839172, + "num_tokens": 5435798.0, + "step": 4520 + }, + { + "entropy": 1.9454208359122276, + "epoch": 0.014042603647512048, + "grad_norm": 14.531904220581055, + "learning_rate": 2.1936452581613874e-06, + "loss": 0.6534, + "mean_token_accuracy": 0.8002553582191467, + "num_tokens": 5447898.0, + "step": 4530 + }, + { + "entropy": 1.93377585709095, + "epoch": 0.014073602772561745, + "grad_norm": 12.258691787719727, + "learning_rate": 2.1984888113920373e-06, + "loss": 0.6732, + "mean_token_accuracy": 0.7952050551772117, + "num_tokens": 5459752.0, + "step": 4540 + }, + { + "entropy": 1.9374273508787154, + "epoch": 0.01410460189761144, + "grad_norm": 6.370416641235352, + "learning_rate": 2.203332364622687e-06, + "loss": 0.6438, + "mean_token_accuracy": 0.8051278084516525, + "num_tokens": 5470921.0, + "step": 4550 + }, + { + "entropy": 1.9412765011191369, + "epoch": 0.014135601022661136, + "grad_norm": 6.380893230438232, + "learning_rate": 2.2081759178533375e-06, + "loss": 0.6152, + "mean_token_accuracy": 0.811233189702034, + "num_tokens": 5482832.0, + "step": 4560 + }, + { + "entropy": 1.954646387696266, + "epoch": 0.014166600147710831, + "grad_norm": 11.699299812316895, + "learning_rate": 2.2130194710839874e-06, + "loss": 0.6937, + "mean_token_accuracy": 0.7911556661128998, + "num_tokens": 5494419.0, + "step": 4570 + }, + { + "entropy": 1.8840202033519744, + "epoch": 0.014197599272760526, + "grad_norm": 14.447190284729004, + "learning_rate": 2.2178630243146373e-06, + "loss": 0.5864, + "mean_token_accuracy": 0.8123130038380623, + "num_tokens": 5507775.0, + "step": 4580 + }, + { + "entropy": 1.9366067603230477, + "epoch": 0.014228598397810223, + "grad_norm": 6.459699630737305, + "learning_rate": 2.2227065775452876e-06, + "loss": 0.6687, + "mean_token_accuracy": 0.7912245571613312, + "num_tokens": 5519641.0, + "step": 4590 + }, + { + "entropy": 1.8860922396183013, + "epoch": 0.014259597522859917, + "grad_norm": 7.023352146148682, + "learning_rate": 2.2275501307759375e-06, + "loss": 0.5427, + "mean_token_accuracy": 0.8207052007317543, + "num_tokens": 5532518.0, + "step": 4600 + }, + { + "entropy": 1.8792527481913566, + "epoch": 0.014290596647909612, + "grad_norm": 14.053731918334961, + "learning_rate": 2.2323936840065873e-06, + "loss": 0.6267, + "mean_token_accuracy": 0.8102740809321404, + "num_tokens": 5545434.0, + "step": 4610 + }, + { + "entropy": 1.8577488988637925, + "epoch": 0.014321595772959309, + "grad_norm": 13.818965911865234, + "learning_rate": 2.2372372372372376e-06, + "loss": 0.607, + "mean_token_accuracy": 0.8134890556335449, + "num_tokens": 5557659.0, + "step": 4620 + }, + { + "entropy": 1.8798778101801872, + "epoch": 0.014352594898009004, + "grad_norm": 15.251192092895508, + "learning_rate": 2.2420807904678875e-06, + "loss": 0.7015, + "mean_token_accuracy": 0.8011739581823349, + "num_tokens": 5569313.0, + "step": 4630 + }, + { + "entropy": 1.9401899367570876, + "epoch": 0.014383594023058699, + "grad_norm": 14.222334861755371, + "learning_rate": 2.2469243436985374e-06, + "loss": 0.6341, + "mean_token_accuracy": 0.8067525029182434, + "num_tokens": 5580286.0, + "step": 4640 + }, + { + "entropy": 1.791366559267044, + "epoch": 0.014414593148108395, + "grad_norm": 15.209399223327637, + "learning_rate": 2.2517678969291877e-06, + "loss": 0.621, + "mean_token_accuracy": 0.8049216285347939, + "num_tokens": 5593090.0, + "step": 4650 + }, + { + "entropy": 1.8601211935281754, + "epoch": 0.01444559227315809, + "grad_norm": 6.989866733551025, + "learning_rate": 2.2566114501598376e-06, + "loss": 0.6226, + "mean_token_accuracy": 0.7975085467100144, + "num_tokens": 5606291.0, + "step": 4660 + }, + { + "entropy": 1.8646612569689751, + "epoch": 0.014476591398207785, + "grad_norm": 6.809642314910889, + "learning_rate": 2.2614550033904875e-06, + "loss": 0.6267, + "mean_token_accuracy": 0.801960551738739, + "num_tokens": 5618542.0, + "step": 4670 + }, + { + "entropy": 1.8657331004738809, + "epoch": 0.014507590523257482, + "grad_norm": 15.695577621459961, + "learning_rate": 2.2662985566211374e-06, + "loss": 0.6266, + "mean_token_accuracy": 0.8036523833870888, + "num_tokens": 5630622.0, + "step": 4680 + }, + { + "entropy": 1.8683514580130578, + "epoch": 0.014538589648307176, + "grad_norm": 14.665810585021973, + "learning_rate": 2.2711421098517873e-06, + "loss": 0.6272, + "mean_token_accuracy": 0.8168015897274017, + "num_tokens": 5642626.0, + "step": 4690 + }, + { + "entropy": 1.910755640268326, + "epoch": 0.014569588773356871, + "grad_norm": 14.815983772277832, + "learning_rate": 2.2759856630824376e-06, + "loss": 0.6303, + "mean_token_accuracy": 0.7996454507112503, + "num_tokens": 5654704.0, + "step": 4700 + }, + { + "entropy": 1.9073377847671509, + "epoch": 0.014600587898406568, + "grad_norm": 12.390499114990234, + "learning_rate": 2.2808292163130874e-06, + "loss": 0.6777, + "mean_token_accuracy": 0.7973004877567291, + "num_tokens": 5666771.0, + "step": 4710 + }, + { + "entropy": 1.8886594474315643, + "epoch": 0.014631587023456263, + "grad_norm": 15.042806625366211, + "learning_rate": 2.2856727695437373e-06, + "loss": 0.666, + "mean_token_accuracy": 0.7995603799819946, + "num_tokens": 5679125.0, + "step": 4720 + }, + { + "entropy": 1.8458405777812004, + "epoch": 0.014662586148505958, + "grad_norm": 13.178945541381836, + "learning_rate": 2.2905163227743872e-06, + "loss": 0.6103, + "mean_token_accuracy": 0.8050804138183594, + "num_tokens": 5691263.0, + "step": 4730 + }, + { + "entropy": 1.8889022842049599, + "epoch": 0.014693585273555654, + "grad_norm": 13.8629150390625, + "learning_rate": 2.2953598760050375e-06, + "loss": 0.6005, + "mean_token_accuracy": 0.8008949771523476, + "num_tokens": 5703903.0, + "step": 4740 + }, + { + "entropy": 1.8346245408058166, + "epoch": 0.014724584398605349, + "grad_norm": 13.840920448303223, + "learning_rate": 2.3002034292356874e-06, + "loss": 0.6284, + "mean_token_accuracy": 0.8062951177358627, + "num_tokens": 5715573.0, + "step": 4750 + }, + { + "entropy": 1.797354480624199, + "epoch": 0.014755583523655046, + "grad_norm": 7.390272617340088, + "learning_rate": 2.3050469824663373e-06, + "loss": 0.5823, + "mean_token_accuracy": 0.81457539498806, + "num_tokens": 5728120.0, + "step": 4760 + }, + { + "entropy": 1.787947428226471, + "epoch": 0.01478658264870474, + "grad_norm": 13.413910865783691, + "learning_rate": 2.3098905356969876e-06, + "loss": 0.6103, + "mean_token_accuracy": 0.8159710958600044, + "num_tokens": 5739981.0, + "step": 4770 + }, + { + "entropy": 1.7457917869091033, + "epoch": 0.014817581773754435, + "grad_norm": 14.069392204284668, + "learning_rate": 2.3147340889276375e-06, + "loss": 0.5329, + "mean_token_accuracy": 0.8245069414377213, + "num_tokens": 5753136.0, + "step": 4780 + }, + { + "entropy": 1.8335485979914665, + "epoch": 0.014848580898804132, + "grad_norm": 12.345169067382812, + "learning_rate": 2.3195776421582874e-06, + "loss": 0.6593, + "mean_token_accuracy": 0.8006348922848702, + "num_tokens": 5765376.0, + "step": 4790 + }, + { + "entropy": 1.8139121234416962, + "epoch": 0.014879580023853827, + "grad_norm": 15.01768970489502, + "learning_rate": 2.3244211953889377e-06, + "loss": 0.6159, + "mean_token_accuracy": 0.8078545331954956, + "num_tokens": 5778012.0, + "step": 4800 + }, + { + "entropy": 1.8669386252760887, + "epoch": 0.014910579148903522, + "grad_norm": 14.147043228149414, + "learning_rate": 2.3292647486195876e-06, + "loss": 0.6281, + "mean_token_accuracy": 0.8018081709742546, + "num_tokens": 5789805.0, + "step": 4810 + }, + { + "entropy": 1.784064681828022, + "epoch": 0.014941578273953218, + "grad_norm": 13.715585708618164, + "learning_rate": 2.3341083018502374e-06, + "loss": 0.6004, + "mean_token_accuracy": 0.8130191281437874, + "num_tokens": 5802546.0, + "step": 4820 + }, + { + "entropy": 1.7632321387529373, + "epoch": 0.014972577399002913, + "grad_norm": 14.334385871887207, + "learning_rate": 2.3389518550808878e-06, + "loss": 0.6187, + "mean_token_accuracy": 0.8080756813287735, + "num_tokens": 5815760.0, + "step": 4830 + }, + { + "entropy": 1.826922358572483, + "epoch": 0.015003576524052608, + "grad_norm": 12.539878845214844, + "learning_rate": 2.3437954083115376e-06, + "loss": 0.6262, + "mean_token_accuracy": 0.8105258822441102, + "num_tokens": 5827438.0, + "step": 4840 + }, + { + "entropy": 1.788956308364868, + "epoch": 0.015034575649102305, + "grad_norm": 11.371108055114746, + "learning_rate": 2.3486389615421875e-06, + "loss": 0.5732, + "mean_token_accuracy": 0.817362517118454, + "num_tokens": 5839410.0, + "step": 4850 + }, + { + "entropy": 1.734141993522644, + "epoch": 0.015065574774152, + "grad_norm": 6.056836128234863, + "learning_rate": 2.353482514772838e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8292117938399315, + "num_tokens": 5852222.0, + "step": 4860 + }, + { + "entropy": 1.9274750858545304, + "epoch": 0.015096573899201694, + "grad_norm": 14.039139747619629, + "learning_rate": 2.3583260680034877e-06, + "loss": 0.6941, + "mean_token_accuracy": 0.7952279299497604, + "num_tokens": 5864055.0, + "step": 4870 + }, + { + "entropy": 1.8699121579527855, + "epoch": 0.015127573024251391, + "grad_norm": 15.34942626953125, + "learning_rate": 2.3631696212341376e-06, + "loss": 0.669, + "mean_token_accuracy": 0.7950506001710892, + "num_tokens": 5876040.0, + "step": 4880 + }, + { + "entropy": 1.9636721938848496, + "epoch": 0.015158572149301086, + "grad_norm": 16.529897689819336, + "learning_rate": 2.3680131744647875e-06, + "loss": 0.7517, + "mean_token_accuracy": 0.7848048597574234, + "num_tokens": 5887191.0, + "step": 4890 + }, + { + "entropy": 1.7576201945543288, + "epoch": 0.01518957127435078, + "grad_norm": 11.712950706481934, + "learning_rate": 2.3728567276954374e-06, + "loss": 0.5591, + "mean_token_accuracy": 0.8087734043598175, + "num_tokens": 5901313.0, + "step": 4900 + }, + { + "entropy": 1.8790557369589806, + "epoch": 0.015220570399400477, + "grad_norm": 13.049158096313477, + "learning_rate": 2.3777002809260877e-06, + "loss": 0.6831, + "mean_token_accuracy": 0.7949941202998161, + "num_tokens": 5913169.0, + "step": 4910 + }, + { + "entropy": 1.8304862260818482, + "epoch": 0.015251569524450172, + "grad_norm": 14.758519172668457, + "learning_rate": 2.3825438341567376e-06, + "loss": 0.6111, + "mean_token_accuracy": 0.7983208373188972, + "num_tokens": 5925901.0, + "step": 4920 + }, + { + "entropy": 1.7591045543551445, + "epoch": 0.015282568649499869, + "grad_norm": 7.14612340927124, + "learning_rate": 2.3873873873873874e-06, + "loss": 0.608, + "mean_token_accuracy": 0.8068576440215111, + "num_tokens": 5939642.0, + "step": 4930 + }, + { + "entropy": 1.9316822454333304, + "epoch": 0.015313567774549564, + "grad_norm": 11.606863021850586, + "learning_rate": 2.3922309406180373e-06, + "loss": 0.6747, + "mean_token_accuracy": 0.7950692802667618, + "num_tokens": 5951806.0, + "step": 4940 + }, + { + "entropy": 1.8316121339797973, + "epoch": 0.015344566899599258, + "grad_norm": 12.468024253845215, + "learning_rate": 2.3970744938486876e-06, + "loss": 0.6445, + "mean_token_accuracy": 0.7969418242573738, + "num_tokens": 5963560.0, + "step": 4950 + }, + { + "entropy": 1.8592291116714477, + "epoch": 0.015375566024648955, + "grad_norm": 10.784393310546875, + "learning_rate": 2.4019180470793375e-06, + "loss": 0.584, + "mean_token_accuracy": 0.815529865026474, + "num_tokens": 5975576.0, + "step": 4960 + }, + { + "entropy": 1.8554728999733925, + "epoch": 0.01540656514969865, + "grad_norm": 6.568962574005127, + "learning_rate": 2.4067616003099874e-06, + "loss": 0.6138, + "mean_token_accuracy": 0.8106912553310395, + "num_tokens": 5987431.0, + "step": 4970 + }, + { + "entropy": 1.9579525411128997, + "epoch": 0.015437564274748345, + "grad_norm": 15.040802001953125, + "learning_rate": 2.4116051535406377e-06, + "loss": 0.7144, + "mean_token_accuracy": 0.7940159112215042, + "num_tokens": 5998986.0, + "step": 4980 + }, + { + "entropy": 1.8558162599802017, + "epoch": 0.015468563399798041, + "grad_norm": 12.90155029296875, + "learning_rate": 2.4164487067712876e-06, + "loss": 0.5972, + "mean_token_accuracy": 0.8087948963046074, + "num_tokens": 6011316.0, + "step": 4990 + }, + { + "entropy": 1.9120135977864265, + "epoch": 0.015499562524847736, + "grad_norm": 13.731481552124023, + "learning_rate": 2.4212922600019375e-06, + "loss": 0.6358, + "mean_token_accuracy": 0.808275742828846, + "num_tokens": 6022791.0, + "step": 5000 + }, + { + "entropy": 1.9652427747845649, + "epoch": 0.015530561649897431, + "grad_norm": 10.624104499816895, + "learning_rate": 2.4261358132325878e-06, + "loss": 0.7283, + "mean_token_accuracy": 0.7885913565754891, + "num_tokens": 6034662.0, + "step": 5010 + }, + { + "entropy": 1.8572456985712051, + "epoch": 0.015561560774947128, + "grad_norm": 13.485920906066895, + "learning_rate": 2.4309793664632377e-06, + "loss": 0.6653, + "mean_token_accuracy": 0.8142799854278564, + "num_tokens": 6046637.0, + "step": 5020 + }, + { + "entropy": 1.8741802372038365, + "epoch": 0.015592559899996822, + "grad_norm": 13.382155418395996, + "learning_rate": 2.4358229196938875e-06, + "loss": 0.673, + "mean_token_accuracy": 0.7978162422776223, + "num_tokens": 6058910.0, + "step": 5030 + }, + { + "entropy": 1.8276728346943856, + "epoch": 0.015623559025046517, + "grad_norm": 13.904196739196777, + "learning_rate": 2.440666472924538e-06, + "loss": 0.6098, + "mean_token_accuracy": 0.80892014503479, + "num_tokens": 6071136.0, + "step": 5040 + }, + { + "entropy": 1.8636898145079612, + "epoch": 0.015654558150096212, + "grad_norm": 12.65003490447998, + "learning_rate": 2.4455100261551877e-06, + "loss": 0.6052, + "mean_token_accuracy": 0.8113350749015809, + "num_tokens": 6082834.0, + "step": 5050 + }, + { + "entropy": 1.9150115132331849, + "epoch": 0.01568555727514591, + "grad_norm": 12.964512825012207, + "learning_rate": 2.4503535793858376e-06, + "loss": 0.6974, + "mean_token_accuracy": 0.7987506031990051, + "num_tokens": 6093808.0, + "step": 5060 + }, + { + "entropy": 1.8618119135499, + "epoch": 0.015716556400195605, + "grad_norm": 5.460781097412109, + "learning_rate": 2.455197132616488e-06, + "loss": 0.6885, + "mean_token_accuracy": 0.8034725025296211, + "num_tokens": 6105718.0, + "step": 5070 + }, + { + "entropy": 1.9206832945346832, + "epoch": 0.0157475555252453, + "grad_norm": 6.1644134521484375, + "learning_rate": 2.460040685847138e-06, + "loss": 0.6132, + "mean_token_accuracy": 0.8128495365381241, + "num_tokens": 6117606.0, + "step": 5080 + }, + { + "entropy": 1.8482828214764595, + "epoch": 0.015778554650294995, + "grad_norm": 16.253582000732422, + "learning_rate": 2.4648842390777877e-06, + "loss": 0.5834, + "mean_token_accuracy": 0.8172304585576058, + "num_tokens": 6130415.0, + "step": 5090 + }, + { + "entropy": 1.998060804605484, + "epoch": 0.01580955377534469, + "grad_norm": 12.287386894226074, + "learning_rate": 2.4697277923084376e-06, + "loss": 0.7151, + "mean_token_accuracy": 0.8040244802832603, + "num_tokens": 6140984.0, + "step": 5100 + }, + { + "entropy": 1.8899462610483169, + "epoch": 0.015840552900394385, + "grad_norm": 14.631074905395508, + "learning_rate": 2.4745713455390875e-06, + "loss": 0.5975, + "mean_token_accuracy": 0.8050005823373795, + "num_tokens": 6152531.0, + "step": 5110 + }, + { + "entropy": 1.9535317480564118, + "epoch": 0.01587155202544408, + "grad_norm": 17.399723052978516, + "learning_rate": 2.4794148987697378e-06, + "loss": 0.714, + "mean_token_accuracy": 0.7943957537412644, + "num_tokens": 6163431.0, + "step": 5120 + }, + { + "entropy": 1.8350759640336036, + "epoch": 0.015902551150493778, + "grad_norm": 6.640737056732178, + "learning_rate": 2.4842584520003877e-06, + "loss": 0.5901, + "mean_token_accuracy": 0.8108617261052131, + "num_tokens": 6176031.0, + "step": 5130 + }, + { + "entropy": 1.8753447353839874, + "epoch": 0.01593355027554347, + "grad_norm": 12.087038040161133, + "learning_rate": 2.4891020052310375e-06, + "loss": 0.6479, + "mean_token_accuracy": 0.8096778213977813, + "num_tokens": 6187445.0, + "step": 5140 + }, + { + "entropy": 1.9582001134753226, + "epoch": 0.015964549400593168, + "grad_norm": 14.169228553771973, + "learning_rate": 2.4939455584616874e-06, + "loss": 0.6759, + "mean_token_accuracy": 0.8033049464225769, + "num_tokens": 6199080.0, + "step": 5150 + }, + { + "entropy": 1.9630192652344705, + "epoch": 0.015995548525642864, + "grad_norm": 15.533053398132324, + "learning_rate": 2.4987891116923377e-06, + "loss": 0.6722, + "mean_token_accuracy": 0.7981782972812652, + "num_tokens": 6210218.0, + "step": 5160 + }, + { + "entropy": 1.9423971146345138, + "epoch": 0.016026547650692557, + "grad_norm": 15.028423309326172, + "learning_rate": 2.5036326649229876e-06, + "loss": 0.6986, + "mean_token_accuracy": 0.7927390620112419, + "num_tokens": 6221926.0, + "step": 5170 + }, + { + "entropy": 1.9331501245498657, + "epoch": 0.016057546775742254, + "grad_norm": 14.322336196899414, + "learning_rate": 2.5084762181536375e-06, + "loss": 0.669, + "mean_token_accuracy": 0.7981711998581886, + "num_tokens": 6233615.0, + "step": 5180 + }, + { + "entropy": 1.920173704624176, + "epoch": 0.01608854590079195, + "grad_norm": 13.353188514709473, + "learning_rate": 2.513319771384288e-06, + "loss": 0.6021, + "mean_token_accuracy": 0.804129633307457, + "num_tokens": 6245365.0, + "step": 5190 + }, + { + "entropy": 1.9042732536792755, + "epoch": 0.016119545025841647, + "grad_norm": 6.624873161315918, + "learning_rate": 2.5181633246149377e-06, + "loss": 0.6251, + "mean_token_accuracy": 0.8022510379552841, + "num_tokens": 6257307.0, + "step": 5200 + }, + { + "entropy": 1.9409416317939758, + "epoch": 0.01615054415089134, + "grad_norm": 16.728931427001953, + "learning_rate": 2.523006877845588e-06, + "loss": 0.6694, + "mean_token_accuracy": 0.8007285013794899, + "num_tokens": 6268908.0, + "step": 5210 + }, + { + "entropy": 1.9607915192842484, + "epoch": 0.016181543275941037, + "grad_norm": 12.1609525680542, + "learning_rate": 2.5278504310762375e-06, + "loss": 0.6902, + "mean_token_accuracy": 0.7996085345745086, + "num_tokens": 6279792.0, + "step": 5220 + }, + { + "entropy": 1.9298648953437805, + "epoch": 0.016212542400990734, + "grad_norm": 14.782210350036621, + "learning_rate": 2.5326939843068878e-06, + "loss": 0.6543, + "mean_token_accuracy": 0.7992920339107513, + "num_tokens": 6291405.0, + "step": 5230 + }, + { + "entropy": 1.8961961045861244, + "epoch": 0.016243541526040427, + "grad_norm": 15.001334190368652, + "learning_rate": 2.5375375375375377e-06, + "loss": 0.619, + "mean_token_accuracy": 0.8142734676599502, + "num_tokens": 6303357.0, + "step": 5240 + }, + { + "entropy": 1.8733942970633506, + "epoch": 0.016274540651090123, + "grad_norm": 11.908313751220703, + "learning_rate": 2.542381090768188e-06, + "loss": 0.5783, + "mean_token_accuracy": 0.8036831006407738, + "num_tokens": 6316147.0, + "step": 5250 + }, + { + "entropy": 1.8637539759278297, + "epoch": 0.01630553977613982, + "grad_norm": 13.72813606262207, + "learning_rate": 2.547224643998838e-06, + "loss": 0.6272, + "mean_token_accuracy": 0.8079806834459304, + "num_tokens": 6328593.0, + "step": 5260 + }, + { + "entropy": 1.9503967970609666, + "epoch": 0.016336538901189513, + "grad_norm": 13.402142524719238, + "learning_rate": 2.5520681972294873e-06, + "loss": 0.7049, + "mean_token_accuracy": 0.7919132471084595, + "num_tokens": 6340259.0, + "step": 5270 + }, + { + "entropy": 1.8650514677166938, + "epoch": 0.01636753802623921, + "grad_norm": 17.254850387573242, + "learning_rate": 2.5569117504601376e-06, + "loss": 0.6297, + "mean_token_accuracy": 0.8054231390357017, + "num_tokens": 6351914.0, + "step": 5280 + }, + { + "entropy": 1.861028863489628, + "epoch": 0.016398537151288906, + "grad_norm": 14.428080558776855, + "learning_rate": 2.561755303690788e-06, + "loss": 0.681, + "mean_token_accuracy": 0.7947863683104515, + "num_tokens": 6364405.0, + "step": 5290 + }, + { + "entropy": 1.9019827499985695, + "epoch": 0.0164295362763386, + "grad_norm": 14.161713600158691, + "learning_rate": 2.566598856921438e-06, + "loss": 0.6824, + "mean_token_accuracy": 0.8035579726099968, + "num_tokens": 6375673.0, + "step": 5300 + }, + { + "entropy": 1.8992380529642106, + "epoch": 0.016460535401388296, + "grad_norm": 13.563090324401855, + "learning_rate": 2.571442410152088e-06, + "loss": 0.6121, + "mean_token_accuracy": 0.8122377499938012, + "num_tokens": 6387160.0, + "step": 5310 + }, + { + "entropy": 1.8584588319063187, + "epoch": 0.016491534526437993, + "grad_norm": 7.150653839111328, + "learning_rate": 2.576285963382738e-06, + "loss": 0.6296, + "mean_token_accuracy": 0.8052732735872269, + "num_tokens": 6399189.0, + "step": 5320 + }, + { + "entropy": 1.822690936923027, + "epoch": 0.016522533651487686, + "grad_norm": 13.757638931274414, + "learning_rate": 2.5811295166133875e-06, + "loss": 0.6489, + "mean_token_accuracy": 0.8050631493330002, + "num_tokens": 6411077.0, + "step": 5330 + }, + { + "entropy": 1.86706335991621, + "epoch": 0.016553532776537382, + "grad_norm": 12.249866485595703, + "learning_rate": 2.5859730698440378e-06, + "loss": 0.6379, + "mean_token_accuracy": 0.8055833503603935, + "num_tokens": 6422795.0, + "step": 5340 + }, + { + "entropy": 1.914227731525898, + "epoch": 0.01658453190158708, + "grad_norm": 13.125116348266602, + "learning_rate": 2.5908166230746876e-06, + "loss": 0.6412, + "mean_token_accuracy": 0.8045208930969239, + "num_tokens": 6434827.0, + "step": 5350 + }, + { + "entropy": 1.8957944259047508, + "epoch": 0.016615531026636772, + "grad_norm": 16.094440460205078, + "learning_rate": 2.595660176305338e-06, + "loss": 0.6912, + "mean_token_accuracy": 0.8000255629420281, + "num_tokens": 6446181.0, + "step": 5360 + }, + { + "entropy": 1.8262048691511155, + "epoch": 0.01664653015168647, + "grad_norm": 7.709341526031494, + "learning_rate": 2.6005037295359883e-06, + "loss": 0.5977, + "mean_token_accuracy": 0.8138811901211739, + "num_tokens": 6458511.0, + "step": 5370 + }, + { + "entropy": 1.9274291083216668, + "epoch": 0.016677529276736165, + "grad_norm": 15.519542694091797, + "learning_rate": 2.6053472827666377e-06, + "loss": 0.7491, + "mean_token_accuracy": 0.7906691998243331, + "num_tokens": 6470175.0, + "step": 5380 + }, + { + "entropy": 1.8656795375049113, + "epoch": 0.01670852840178586, + "grad_norm": 15.985265731811523, + "learning_rate": 2.6101908359972876e-06, + "loss": 0.6216, + "mean_token_accuracy": 0.8056905150413514, + "num_tokens": 6482873.0, + "step": 5390 + }, + { + "entropy": 1.9562640219926835, + "epoch": 0.016739527526835555, + "grad_norm": 12.090657234191895, + "learning_rate": 2.615034389227938e-06, + "loss": 0.6927, + "mean_token_accuracy": 0.7986342236399651, + "num_tokens": 6494045.0, + "step": 5400 + }, + { + "entropy": 1.919235098361969, + "epoch": 0.01677052665188525, + "grad_norm": 12.741799354553223, + "learning_rate": 2.619877942458588e-06, + "loss": 0.6563, + "mean_token_accuracy": 0.8064717799425125, + "num_tokens": 6504934.0, + "step": 5410 + }, + { + "entropy": 1.8321070238947867, + "epoch": 0.016801525776934945, + "grad_norm": 13.527047157287598, + "learning_rate": 2.624721495689238e-06, + "loss": 0.5623, + "mean_token_accuracy": 0.8185374692082406, + "num_tokens": 6517746.0, + "step": 5420 + }, + { + "entropy": 1.8565064251422883, + "epoch": 0.01683252490198464, + "grad_norm": 6.626979351043701, + "learning_rate": 2.6295650489198876e-06, + "loss": 0.6632, + "mean_token_accuracy": 0.801714438199997, + "num_tokens": 6530512.0, + "step": 5430 + }, + { + "entropy": 1.815597450733185, + "epoch": 0.016863524027034338, + "grad_norm": 14.231537818908691, + "learning_rate": 2.634408602150538e-06, + "loss": 0.6259, + "mean_token_accuracy": 0.8043477773666382, + "num_tokens": 6543281.0, + "step": 5440 + }, + { + "entropy": 1.847208908200264, + "epoch": 0.01689452315208403, + "grad_norm": 14.332513809204102, + "learning_rate": 2.6392521553811878e-06, + "loss": 0.5887, + "mean_token_accuracy": 0.8129609242081642, + "num_tokens": 6556103.0, + "step": 5450 + }, + { + "entropy": 1.8287439972162247, + "epoch": 0.016925522277133728, + "grad_norm": 17.275390625, + "learning_rate": 2.644095708611838e-06, + "loss": 0.5664, + "mean_token_accuracy": 0.8119298800826072, + "num_tokens": 6568831.0, + "step": 5460 + }, + { + "entropy": 1.9554360315203667, + "epoch": 0.016956521402183424, + "grad_norm": 16.211164474487305, + "learning_rate": 2.648939261842488e-06, + "loss": 0.7553, + "mean_token_accuracy": 0.790603817999363, + "num_tokens": 6580730.0, + "step": 5470 + }, + { + "entropy": 1.8777785643935203, + "epoch": 0.016987520527233117, + "grad_norm": 7.529537677764893, + "learning_rate": 2.6537828150731374e-06, + "loss": 0.5822, + "mean_token_accuracy": 0.81802958548069, + "num_tokens": 6593151.0, + "step": 5480 + }, + { + "entropy": 1.83996739089489, + "epoch": 0.017018519652282814, + "grad_norm": 16.056753158569336, + "learning_rate": 2.6586263683037877e-06, + "loss": 0.5795, + "mean_token_accuracy": 0.8094681099057197, + "num_tokens": 6605508.0, + "step": 5490 + }, + { + "entropy": 1.8156257584691047, + "epoch": 0.01704951877733251, + "grad_norm": 13.18197250366211, + "learning_rate": 2.663469921534438e-06, + "loss": 0.5445, + "mean_token_accuracy": 0.818387684226036, + "num_tokens": 6617913.0, + "step": 5500 + }, + { + "entropy": 1.830448153614998, + "epoch": 0.017080517902382204, + "grad_norm": 17.966718673706055, + "learning_rate": 2.668313474765088e-06, + "loss": 0.591, + "mean_token_accuracy": 0.8055599793791771, + "num_tokens": 6630804.0, + "step": 5510 + }, + { + "entropy": 1.9519992083311082, + "epoch": 0.0171115170274319, + "grad_norm": 15.51613998413086, + "learning_rate": 2.6731570279957382e-06, + "loss": 0.7094, + "mean_token_accuracy": 0.7885363206267357, + "num_tokens": 6641484.0, + "step": 5520 + }, + { + "entropy": 1.787497617304325, + "epoch": 0.017142516152481597, + "grad_norm": 13.95913028717041, + "learning_rate": 2.678000581226388e-06, + "loss": 0.5966, + "mean_token_accuracy": 0.8152831554412842, + "num_tokens": 6655013.0, + "step": 5530 + }, + { + "entropy": 1.8791037619113922, + "epoch": 0.017173515277531293, + "grad_norm": 12.858197212219238, + "learning_rate": 2.6828441344570376e-06, + "loss": 0.708, + "mean_token_accuracy": 0.7970382794737816, + "num_tokens": 6665889.0, + "step": 5540 + }, + { + "entropy": 1.8964224010705948, + "epoch": 0.017204514402580987, + "grad_norm": 14.71556282043457, + "learning_rate": 2.687687687687688e-06, + "loss": 0.7067, + "mean_token_accuracy": 0.8008865773677826, + "num_tokens": 6676444.0, + "step": 5550 + }, + { + "entropy": 1.868991169333458, + "epoch": 0.017235513527630683, + "grad_norm": 12.040255546569824, + "learning_rate": 2.6925312409183378e-06, + "loss": 0.6944, + "mean_token_accuracy": 0.7887442111968994, + "num_tokens": 6687681.0, + "step": 5560 + }, + { + "entropy": 1.7748029723763465, + "epoch": 0.01726651265268038, + "grad_norm": 16.154150009155273, + "learning_rate": 2.697374794148988e-06, + "loss": 0.558, + "mean_token_accuracy": 0.818262355029583, + "num_tokens": 6700056.0, + "step": 5570 + }, + { + "entropy": 1.8267305195331573, + "epoch": 0.017297511777730073, + "grad_norm": 13.24530029296875, + "learning_rate": 2.7022183473796384e-06, + "loss": 0.6354, + "mean_token_accuracy": 0.8041493833065033, + "num_tokens": 6712309.0, + "step": 5580 + }, + { + "entropy": 1.801226018369198, + "epoch": 0.01732851090277977, + "grad_norm": 13.650617599487305, + "learning_rate": 2.707061900610288e-06, + "loss": 0.6252, + "mean_token_accuracy": 0.8037795275449753, + "num_tokens": 6725171.0, + "step": 5590 + }, + { + "entropy": 1.9054549396038056, + "epoch": 0.017359510027829466, + "grad_norm": 13.89938735961914, + "learning_rate": 2.7119054538409377e-06, + "loss": 0.673, + "mean_token_accuracy": 0.794456647336483, + "num_tokens": 6735768.0, + "step": 5600 + }, + { + "entropy": 1.8770520448684693, + "epoch": 0.01739050915287916, + "grad_norm": 11.668399810791016, + "learning_rate": 2.716749007071588e-06, + "loss": 0.6098, + "mean_token_accuracy": 0.8073992922902107, + "num_tokens": 6748114.0, + "step": 5610 + }, + { + "entropy": 1.8336502104997634, + "epoch": 0.017421508277928856, + "grad_norm": 15.977221488952637, + "learning_rate": 2.721592560302238e-06, + "loss": 0.5509, + "mean_token_accuracy": 0.7992581978440285, + "num_tokens": 6761005.0, + "step": 5620 + }, + { + "entropy": 1.868009014427662, + "epoch": 0.017452507402978552, + "grad_norm": 14.214484214782715, + "learning_rate": 2.726436113532888e-06, + "loss": 0.6505, + "mean_token_accuracy": 0.794841094315052, + "num_tokens": 6773183.0, + "step": 5630 + }, + { + "entropy": 1.8636821389198304, + "epoch": 0.017483506528028245, + "grad_norm": 12.620464324951172, + "learning_rate": 2.7312796667635377e-06, + "loss": 0.6509, + "mean_token_accuracy": 0.8034439250826836, + "num_tokens": 6785345.0, + "step": 5640 + }, + { + "entropy": 1.9341711491346358, + "epoch": 0.017514505653077942, + "grad_norm": 15.093106269836426, + "learning_rate": 2.736123219994188e-06, + "loss": 0.6546, + "mean_token_accuracy": 0.80091482847929, + "num_tokens": 6796959.0, + "step": 5650 + }, + { + "entropy": 1.8229330405592918, + "epoch": 0.01754550477812764, + "grad_norm": 5.629141807556152, + "learning_rate": 2.740966773224838e-06, + "loss": 0.5279, + "mean_token_accuracy": 0.8240866810083389, + "num_tokens": 6809608.0, + "step": 5660 + }, + { + "entropy": 1.8584133207798004, + "epoch": 0.017576503903177332, + "grad_norm": 14.71494197845459, + "learning_rate": 2.745810326455488e-06, + "loss": 0.5945, + "mean_token_accuracy": 0.8071165978908539, + "num_tokens": 6821161.0, + "step": 5670 + }, + { + "entropy": 1.9193618685007094, + "epoch": 0.01760750302822703, + "grad_norm": 6.501834392547607, + "learning_rate": 2.750653879686138e-06, + "loss": 0.6277, + "mean_token_accuracy": 0.8095588624477387, + "num_tokens": 6832907.0, + "step": 5680 + }, + { + "entropy": 1.9340075165033341, + "epoch": 0.017638502153276725, + "grad_norm": 16.142831802368164, + "learning_rate": 2.755497432916788e-06, + "loss": 0.6317, + "mean_token_accuracy": 0.8106139227747917, + "num_tokens": 6844618.0, + "step": 5690 + }, + { + "entropy": 1.9251418694853784, + "epoch": 0.017669501278326418, + "grad_norm": 5.188778877258301, + "learning_rate": 2.760340986147438e-06, + "loss": 0.5842, + "mean_token_accuracy": 0.8212108716368676, + "num_tokens": 6856467.0, + "step": 5700 + }, + { + "entropy": 2.0215213894844055, + "epoch": 0.017700500403376115, + "grad_norm": 14.539006233215332, + "learning_rate": 2.765184539378088e-06, + "loss": 0.7512, + "mean_token_accuracy": 0.7919975519180298, + "num_tokens": 6867647.0, + "step": 5710 + }, + { + "entropy": 1.8021862357854843, + "epoch": 0.01773149952842581, + "grad_norm": 12.78034782409668, + "learning_rate": 2.770028092608738e-06, + "loss": 0.5369, + "mean_token_accuracy": 0.8308590099215507, + "num_tokens": 6879996.0, + "step": 5720 + }, + { + "entropy": 1.9824547916650772, + "epoch": 0.017762498653475504, + "grad_norm": 16.377723693847656, + "learning_rate": 2.7748716458393883e-06, + "loss": 0.6998, + "mean_token_accuracy": 0.7981508508324623, + "num_tokens": 6890686.0, + "step": 5730 + }, + { + "entropy": 1.8871808886528014, + "epoch": 0.0177934977785252, + "grad_norm": 8.088064193725586, + "learning_rate": 2.779715199070038e-06, + "loss": 0.6382, + "mean_token_accuracy": 0.8009686887264251, + "num_tokens": 6902728.0, + "step": 5740 + }, + { + "entropy": 1.9506133437156676, + "epoch": 0.017824496903574898, + "grad_norm": 13.706978797912598, + "learning_rate": 2.7845587523006877e-06, + "loss": 0.7109, + "mean_token_accuracy": 0.7973517820239067, + "num_tokens": 6913584.0, + "step": 5750 + }, + { + "entropy": 1.8800716385245324, + "epoch": 0.01785549602862459, + "grad_norm": 14.467167854309082, + "learning_rate": 2.789402305531338e-06, + "loss": 0.6481, + "mean_token_accuracy": 0.8017850533127785, + "num_tokens": 6925591.0, + "step": 5760 + }, + { + "entropy": 1.9344962298870088, + "epoch": 0.017886495153674287, + "grad_norm": 13.922574996948242, + "learning_rate": 2.794245858761988e-06, + "loss": 0.7058, + "mean_token_accuracy": 0.8056110054254532, + "num_tokens": 6936542.0, + "step": 5770 + }, + { + "entropy": 1.8384886384010315, + "epoch": 0.017917494278723984, + "grad_norm": 13.464262962341309, + "learning_rate": 2.799089411992638e-06, + "loss": 0.6779, + "mean_token_accuracy": 0.8026763945817947, + "num_tokens": 6948233.0, + "step": 5780 + }, + { + "entropy": 1.8690043538808823, + "epoch": 0.017948493403773677, + "grad_norm": 19.54326820373535, + "learning_rate": 2.8039329652232885e-06, + "loss": 0.6308, + "mean_token_accuracy": 0.8030871346592903, + "num_tokens": 6959330.0, + "step": 5790 + }, + { + "entropy": 1.8759321600198746, + "epoch": 0.017979492528823374, + "grad_norm": 12.744894981384277, + "learning_rate": 2.808776518453938e-06, + "loss": 0.6481, + "mean_token_accuracy": 0.8097668170928956, + "num_tokens": 6970815.0, + "step": 5800 + }, + { + "entropy": 1.828912016749382, + "epoch": 0.01801049165387307, + "grad_norm": 12.839016914367676, + "learning_rate": 2.813620071684588e-06, + "loss": 0.629, + "mean_token_accuracy": 0.8034252509474754, + "num_tokens": 6982527.0, + "step": 5810 + }, + { + "entropy": 1.8393566399812697, + "epoch": 0.018041490778922763, + "grad_norm": 12.746121406555176, + "learning_rate": 2.818463624915238e-06, + "loss": 0.6182, + "mean_token_accuracy": 0.8172145113348961, + "num_tokens": 6993866.0, + "step": 5820 + }, + { + "entropy": 1.9092845901846887, + "epoch": 0.01807248990397246, + "grad_norm": 6.71993350982666, + "learning_rate": 2.823307178145888e-06, + "loss": 0.6816, + "mean_token_accuracy": 0.7965900272130966, + "num_tokens": 7005083.0, + "step": 5830 + }, + { + "entropy": 1.8039664879441262, + "epoch": 0.018103489029022157, + "grad_norm": 14.516589164733887, + "learning_rate": 2.8281507313765383e-06, + "loss": 0.6241, + "mean_token_accuracy": 0.8058669179677963, + "num_tokens": 7017669.0, + "step": 5840 + }, + { + "entropy": 1.8136644974350928, + "epoch": 0.01813448815407185, + "grad_norm": 5.473270893096924, + "learning_rate": 2.8329942846071878e-06, + "loss": 0.5442, + "mean_token_accuracy": 0.8144378513097763, + "num_tokens": 7030554.0, + "step": 5850 + }, + { + "entropy": 1.8616077184677124, + "epoch": 0.018165487279121546, + "grad_norm": 6.733494281768799, + "learning_rate": 2.837837837837838e-06, + "loss": 0.6268, + "mean_token_accuracy": 0.8107643172144889, + "num_tokens": 7043459.0, + "step": 5860 + }, + { + "entropy": 1.8695634379982948, + "epoch": 0.018196486404171243, + "grad_norm": 13.21927547454834, + "learning_rate": 2.842681391068488e-06, + "loss": 0.66, + "mean_token_accuracy": 0.7932876944541931, + "num_tokens": 7055971.0, + "step": 5870 + }, + { + "entropy": 1.9111657977104186, + "epoch": 0.018227485529220936, + "grad_norm": 12.689513206481934, + "learning_rate": 2.8475249442991383e-06, + "loss": 0.6847, + "mean_token_accuracy": 0.8026935487985611, + "num_tokens": 7067129.0, + "step": 5880 + }, + { + "entropy": 1.9150221094489097, + "epoch": 0.018258484654270633, + "grad_norm": 15.985901832580566, + "learning_rate": 2.852368497529788e-06, + "loss": 0.636, + "mean_token_accuracy": 0.8080303505063057, + "num_tokens": 7078292.0, + "step": 5890 + }, + { + "entropy": 1.9083669915795327, + "epoch": 0.01828948377932033, + "grad_norm": 6.71138334274292, + "learning_rate": 2.857212050760438e-06, + "loss": 0.625, + "mean_token_accuracy": 0.8103194802999496, + "num_tokens": 7089362.0, + "step": 5900 + }, + { + "entropy": 1.9217876300215722, + "epoch": 0.018320482904370026, + "grad_norm": 15.180530548095703, + "learning_rate": 2.862055603991088e-06, + "loss": 0.7363, + "mean_token_accuracy": 0.8016302824020386, + "num_tokens": 7100640.0, + "step": 5910 + }, + { + "entropy": 1.8832744032144546, + "epoch": 0.01835148202941972, + "grad_norm": 7.438052654266357, + "learning_rate": 2.8668991572217382e-06, + "loss": 0.6582, + "mean_token_accuracy": 0.7962846517562866, + "num_tokens": 7112772.0, + "step": 5920 + }, + { + "entropy": 1.855891165137291, + "epoch": 0.018382481154469416, + "grad_norm": 14.392753601074219, + "learning_rate": 2.871742710452388e-06, + "loss": 0.6342, + "mean_token_accuracy": 0.7954742982983589, + "num_tokens": 7124974.0, + "step": 5930 + }, + { + "entropy": 1.906690326333046, + "epoch": 0.018413480279519112, + "grad_norm": 5.785147666931152, + "learning_rate": 2.8765862636830384e-06, + "loss": 0.6857, + "mean_token_accuracy": 0.7974092051386833, + "num_tokens": 7136071.0, + "step": 5940 + }, + { + "entropy": 1.8553122743964194, + "epoch": 0.018444479404568805, + "grad_norm": 12.362131118774414, + "learning_rate": 2.8814298169136883e-06, + "loss": 0.6059, + "mean_token_accuracy": 0.8133484557271004, + "num_tokens": 7148231.0, + "step": 5950 + }, + { + "entropy": 1.8504623636603355, + "epoch": 0.018475478529618502, + "grad_norm": 12.675224304199219, + "learning_rate": 2.8862733701443378e-06, + "loss": 0.6486, + "mean_token_accuracy": 0.8114531069993973, + "num_tokens": 7159463.0, + "step": 5960 + }, + { + "entropy": 1.7733413144946097, + "epoch": 0.0185064776546682, + "grad_norm": 14.104755401611328, + "learning_rate": 2.891116923374988e-06, + "loss": 0.6063, + "mean_token_accuracy": 0.8127880603075027, + "num_tokens": 7172067.0, + "step": 5970 + }, + { + "entropy": 1.9293433710932733, + "epoch": 0.01853747677971789, + "grad_norm": 13.121195793151855, + "learning_rate": 2.895960476605638e-06, + "loss": 0.6808, + "mean_token_accuracy": 0.7942067563533783, + "num_tokens": 7183215.0, + "step": 5980 + }, + { + "entropy": 1.9015644788742065, + "epoch": 0.018568475904767588, + "grad_norm": 14.695234298706055, + "learning_rate": 2.9008040298362883e-06, + "loss": 0.6561, + "mean_token_accuracy": 0.807443767786026, + "num_tokens": 7193804.0, + "step": 5990 + }, + { + "entropy": 1.8584507659077645, + "epoch": 0.018599475029817285, + "grad_norm": 16.059717178344727, + "learning_rate": 2.9056475830669386e-06, + "loss": 0.6235, + "mean_token_accuracy": 0.808501772582531, + "num_tokens": 7205778.0, + "step": 6000 + }, + { + "entropy": 1.8808137744665145, + "epoch": 0.018630474154866978, + "grad_norm": 13.323634147644043, + "learning_rate": 2.910491136297588e-06, + "loss": 0.6352, + "mean_token_accuracy": 0.8102394938468933, + "num_tokens": 7217406.0, + "step": 6010 + }, + { + "entropy": 1.9097854852676392, + "epoch": 0.018661473279916675, + "grad_norm": 12.783294677734375, + "learning_rate": 2.915334689528238e-06, + "loss": 0.6669, + "mean_token_accuracy": 0.8101324290037155, + "num_tokens": 7228513.0, + "step": 6020 + }, + { + "entropy": 1.949710837006569, + "epoch": 0.01869247240496637, + "grad_norm": 8.33492660522461, + "learning_rate": 2.9201782427588882e-06, + "loss": 0.7241, + "mean_token_accuracy": 0.7899691253900528, + "num_tokens": 7240091.0, + "step": 6030 + }, + { + "entropy": 1.7995387852191924, + "epoch": 0.018723471530016064, + "grad_norm": 14.658129692077637, + "learning_rate": 2.925021795989538e-06, + "loss": 0.5537, + "mean_token_accuracy": 0.8123287737369538, + "num_tokens": 7252717.0, + "step": 6040 + }, + { + "entropy": 1.8480943590402603, + "epoch": 0.01875447065506576, + "grad_norm": 12.393167495727539, + "learning_rate": 2.9298653492201884e-06, + "loss": 0.5651, + "mean_token_accuracy": 0.8123827025294303, + "num_tokens": 7265132.0, + "step": 6050 + }, + { + "entropy": 1.9156224384903908, + "epoch": 0.018785469780115457, + "grad_norm": 17.64678382873535, + "learning_rate": 2.934708902450838e-06, + "loss": 0.637, + "mean_token_accuracy": 0.7986405953764916, + "num_tokens": 7277392.0, + "step": 6060 + }, + { + "entropy": 1.8689854100346566, + "epoch": 0.01881646890516515, + "grad_norm": 13.045455932617188, + "learning_rate": 2.939552455681488e-06, + "loss": 0.605, + "mean_token_accuracy": 0.8187311470508576, + "num_tokens": 7289307.0, + "step": 6070 + }, + { + "entropy": 1.9299559414386749, + "epoch": 0.018847468030214847, + "grad_norm": 14.602327346801758, + "learning_rate": 2.944396008912138e-06, + "loss": 0.6499, + "mean_token_accuracy": 0.8123447954654693, + "num_tokens": 7300976.0, + "step": 6080 + }, + { + "entropy": 1.8863015726208687, + "epoch": 0.018878467155264544, + "grad_norm": 16.277345657348633, + "learning_rate": 2.9492395621427884e-06, + "loss": 0.6285, + "mean_token_accuracy": 0.805467925965786, + "num_tokens": 7312816.0, + "step": 6090 + }, + { + "entropy": 1.88625720590353, + "epoch": 0.018909466280314237, + "grad_norm": 13.414533615112305, + "learning_rate": 2.9540831153734383e-06, + "loss": 0.61, + "mean_token_accuracy": 0.8089847132563591, + "num_tokens": 7325107.0, + "step": 6100 + }, + { + "entropy": 1.9170991733670235, + "epoch": 0.018940465405363933, + "grad_norm": 12.490900039672852, + "learning_rate": 2.958926668604088e-06, + "loss": 0.5832, + "mean_token_accuracy": 0.8148081079125404, + "num_tokens": 7337186.0, + "step": 6110 + }, + { + "entropy": 1.8110515862703322, + "epoch": 0.01897146453041363, + "grad_norm": 6.398469924926758, + "learning_rate": 2.963770221834738e-06, + "loss": 0.5864, + "mean_token_accuracy": 0.8025536656379699, + "num_tokens": 7351631.0, + "step": 6120 + }, + { + "entropy": 1.925946244597435, + "epoch": 0.019002463655463323, + "grad_norm": 12.947382926940918, + "learning_rate": 2.9686137750653883e-06, + "loss": 0.6428, + "mean_token_accuracy": 0.8030333101749421, + "num_tokens": 7363334.0, + "step": 6130 + }, + { + "entropy": 1.84049913585186, + "epoch": 0.01903346278051302, + "grad_norm": 13.585901260375977, + "learning_rate": 2.9734573282960382e-06, + "loss": 0.5674, + "mean_token_accuracy": 0.8090210378170013, + "num_tokens": 7376221.0, + "step": 6140 + }, + { + "entropy": 1.913094098865986, + "epoch": 0.019064461905562716, + "grad_norm": 13.463624954223633, + "learning_rate": 2.9783008815266885e-06, + "loss": 0.6148, + "mean_token_accuracy": 0.8032952442765235, + "num_tokens": 7388430.0, + "step": 6150 + }, + { + "entropy": 1.9327594608068466, + "epoch": 0.01909546103061241, + "grad_norm": 13.593147277832031, + "learning_rate": 2.9831444347573384e-06, + "loss": 0.6631, + "mean_token_accuracy": 0.8024881407618523, + "num_tokens": 7400021.0, + "step": 6160 + }, + { + "entropy": 1.920905977487564, + "epoch": 0.019126460155662106, + "grad_norm": 12.407073020935059, + "learning_rate": 2.987987987987988e-06, + "loss": 0.6858, + "mean_token_accuracy": 0.8053328812122345, + "num_tokens": 7411092.0, + "step": 6170 + }, + { + "entropy": 1.9286428660154342, + "epoch": 0.019157459280711803, + "grad_norm": 11.15323543548584, + "learning_rate": 2.992831541218638e-06, + "loss": 0.6284, + "mean_token_accuracy": 0.8029859021306038, + "num_tokens": 7423534.0, + "step": 6180 + }, + { + "entropy": 1.91870975792408, + "epoch": 0.019188458405761496, + "grad_norm": 6.104159832000732, + "learning_rate": 2.9976750944492885e-06, + "loss": 0.6215, + "mean_token_accuracy": 0.804423876106739, + "num_tokens": 7434989.0, + "step": 6190 + }, + { + "entropy": 1.9800142824649811, + "epoch": 0.019219457530811192, + "grad_norm": 15.913412094116211, + "learning_rate": 3.0025186476799384e-06, + "loss": 0.6803, + "mean_token_accuracy": 0.8035276308655739, + "num_tokens": 7446518.0, + "step": 6200 + }, + { + "entropy": 1.925678089261055, + "epoch": 0.01925045665586089, + "grad_norm": 16.255130767822266, + "learning_rate": 3.0073622009105887e-06, + "loss": 0.6876, + "mean_token_accuracy": 0.8033014148473739, + "num_tokens": 7457926.0, + "step": 6210 + }, + { + "entropy": 1.951409675180912, + "epoch": 0.019281455780910582, + "grad_norm": 13.905477523803711, + "learning_rate": 3.012205754141238e-06, + "loss": 0.6945, + "mean_token_accuracy": 0.793704767525196, + "num_tokens": 7469282.0, + "step": 6220 + }, + { + "entropy": 1.8324480682611466, + "epoch": 0.01931245490596028, + "grad_norm": 6.767990589141846, + "learning_rate": 3.017049307371888e-06, + "loss": 0.5416, + "mean_token_accuracy": 0.818550530076027, + "num_tokens": 7481902.0, + "step": 6230 + }, + { + "entropy": 1.9617059826850891, + "epoch": 0.019343454031009975, + "grad_norm": 6.368531227111816, + "learning_rate": 3.0218928606025383e-06, + "loss": 0.6478, + "mean_token_accuracy": 0.8018120333552361, + "num_tokens": 7493336.0, + "step": 6240 + }, + { + "entropy": 1.8573770090937614, + "epoch": 0.019374453156059672, + "grad_norm": 6.3957295417785645, + "learning_rate": 3.0267364138331882e-06, + "loss": 0.5436, + "mean_token_accuracy": 0.8206924736499787, + "num_tokens": 7506208.0, + "step": 6250 + }, + { + "entropy": 1.8330803513526917, + "epoch": 0.019405452281109365, + "grad_norm": 6.754683494567871, + "learning_rate": 3.0315799670638385e-06, + "loss": 0.5934, + "mean_token_accuracy": 0.8097323834896087, + "num_tokens": 7519948.0, + "step": 6260 + }, + { + "entropy": 1.911997850239277, + "epoch": 0.01943645140615906, + "grad_norm": 14.22529125213623, + "learning_rate": 3.036423520294488e-06, + "loss": 0.5984, + "mean_token_accuracy": 0.8061494305729866, + "num_tokens": 7531446.0, + "step": 6270 + }, + { + "entropy": 1.7933774992823601, + "epoch": 0.019467450531208758, + "grad_norm": 12.579453468322754, + "learning_rate": 3.0412670735251383e-06, + "loss": 0.5515, + "mean_token_accuracy": 0.8117320775985718, + "num_tokens": 7545248.0, + "step": 6280 + }, + { + "entropy": 2.001555660367012, + "epoch": 0.01949844965625845, + "grad_norm": 11.950443267822266, + "learning_rate": 3.046110626755788e-06, + "loss": 0.6827, + "mean_token_accuracy": 0.8007282823324203, + "num_tokens": 7555914.0, + "step": 6290 + }, + { + "entropy": 1.832805335521698, + "epoch": 0.019529448781308148, + "grad_norm": 16.177709579467773, + "learning_rate": 3.0509541799864385e-06, + "loss": 0.5295, + "mean_token_accuracy": 0.8259735867381096, + "num_tokens": 7568442.0, + "step": 6300 + }, + { + "entropy": 1.9412631839513779, + "epoch": 0.019560447906357845, + "grad_norm": 13.161090850830078, + "learning_rate": 3.0557977332170884e-06, + "loss": 0.7107, + "mean_token_accuracy": 0.7944168001413345, + "num_tokens": 7579082.0, + "step": 6310 + }, + { + "entropy": 1.8757167264819146, + "epoch": 0.019591447031407538, + "grad_norm": 11.784358978271484, + "learning_rate": 3.0606412864477382e-06, + "loss": 0.5951, + "mean_token_accuracy": 0.8066261202096939, + "num_tokens": 7590950.0, + "step": 6320 + }, + { + "entropy": 1.8339521378278731, + "epoch": 0.019622446156457234, + "grad_norm": 14.665081977844238, + "learning_rate": 3.065484839678388e-06, + "loss": 0.5788, + "mean_token_accuracy": 0.8134505435824394, + "num_tokens": 7602873.0, + "step": 6330 + }, + { + "entropy": 1.853539614379406, + "epoch": 0.01965344528150693, + "grad_norm": 14.00977611541748, + "learning_rate": 3.0703283929090384e-06, + "loss": 0.6516, + "mean_token_accuracy": 0.8087933987379075, + "num_tokens": 7614636.0, + "step": 6340 + }, + { + "entropy": 1.8311856165528297, + "epoch": 0.019684444406556624, + "grad_norm": 13.730056762695312, + "learning_rate": 3.0751719461396883e-06, + "loss": 0.5566, + "mean_token_accuracy": 0.8031821623444557, + "num_tokens": 7627435.0, + "step": 6350 + }, + { + "entropy": 1.9072323009371757, + "epoch": 0.01971544353160632, + "grad_norm": 5.6056036949157715, + "learning_rate": 3.0800154993703386e-06, + "loss": 0.6932, + "mean_token_accuracy": 0.7961813092231751, + "num_tokens": 7639023.0, + "step": 6360 + }, + { + "entropy": 1.8226707085967064, + "epoch": 0.019746442656656017, + "grad_norm": 15.677154541015625, + "learning_rate": 3.0848590526009885e-06, + "loss": 0.6193, + "mean_token_accuracy": 0.8108833208680153, + "num_tokens": 7651962.0, + "step": 6370 + }, + { + "entropy": 1.7706042945384979, + "epoch": 0.01977744178170571, + "grad_norm": 13.05368423461914, + "learning_rate": 3.089702605831638e-06, + "loss": 0.5883, + "mean_token_accuracy": 0.8179239988327026, + "num_tokens": 7664383.0, + "step": 6380 + }, + { + "entropy": 1.888214285671711, + "epoch": 0.019808440906755407, + "grad_norm": 12.008337020874023, + "learning_rate": 3.0945461590622883e-06, + "loss": 0.6435, + "mean_token_accuracy": 0.8116688475012779, + "num_tokens": 7675538.0, + "step": 6390 + }, + { + "entropy": 1.8811195820569993, + "epoch": 0.019839440031805104, + "grad_norm": 13.883013725280762, + "learning_rate": 3.0993897122929386e-06, + "loss": 0.6319, + "mean_token_accuracy": 0.8048406511545181, + "num_tokens": 7687099.0, + "step": 6400 + }, + { + "entropy": 1.8231300503015517, + "epoch": 0.019870439156854797, + "grad_norm": 11.85662841796875, + "learning_rate": 3.1042332655235885e-06, + "loss": 0.5582, + "mean_token_accuracy": 0.816590279340744, + "num_tokens": 7700987.0, + "step": 6410 + }, + { + "entropy": 1.8579674810171127, + "epoch": 0.019901438281904493, + "grad_norm": 15.262175559997559, + "learning_rate": 3.1090768187542388e-06, + "loss": 0.6259, + "mean_token_accuracy": 0.8095797553658486, + "num_tokens": 7713844.0, + "step": 6420 + }, + { + "entropy": 1.879908984899521, + "epoch": 0.01993243740695419, + "grad_norm": 13.756391525268555, + "learning_rate": 3.1139203719848882e-06, + "loss": 0.6006, + "mean_token_accuracy": 0.8058964297175407, + "num_tokens": 7726962.0, + "step": 6430 + }, + { + "entropy": 1.9172916844487191, + "epoch": 0.019963436532003883, + "grad_norm": 6.88080358505249, + "learning_rate": 3.118763925215538e-06, + "loss": 0.6446, + "mean_token_accuracy": 0.7972790181636811, + "num_tokens": 7738773.0, + "step": 6440 + }, + { + "entropy": 1.9533959448337554, + "epoch": 0.01999443565705358, + "grad_norm": 14.810494422912598, + "learning_rate": 3.1236074784461884e-06, + "loss": 0.7092, + "mean_token_accuracy": 0.797048932313919, + "num_tokens": 7749295.0, + "step": 6450 + }, + { + "entropy": 1.904009547829628, + "epoch": 0.020025434782103276, + "grad_norm": 12.683305740356445, + "learning_rate": 3.1284510316768383e-06, + "loss": 0.6457, + "mean_token_accuracy": 0.8019493162631989, + "num_tokens": 7760568.0, + "step": 6460 + }, + { + "entropy": 1.8452860102057458, + "epoch": 0.02005643390715297, + "grad_norm": 6.6882171630859375, + "learning_rate": 3.1332945849074886e-06, + "loss": 0.591, + "mean_token_accuracy": 0.8225623354315758, + "num_tokens": 7773203.0, + "step": 6470 + }, + { + "entropy": 1.891256783902645, + "epoch": 0.020087433032202666, + "grad_norm": 15.097151756286621, + "learning_rate": 3.138138138138138e-06, + "loss": 0.6204, + "mean_token_accuracy": 0.8069896474480629, + "num_tokens": 7785046.0, + "step": 6480 + }, + { + "entropy": 1.841528557240963, + "epoch": 0.020118432157252363, + "grad_norm": 4.251867771148682, + "learning_rate": 3.1429816913687884e-06, + "loss": 0.5801, + "mean_token_accuracy": 0.8183139115571976, + "num_tokens": 7798180.0, + "step": 6490 + }, + { + "entropy": 1.8745207443833352, + "epoch": 0.020149431282302056, + "grad_norm": 13.895639419555664, + "learning_rate": 3.1478252445994383e-06, + "loss": 0.6282, + "mean_token_accuracy": 0.8085477828979493, + "num_tokens": 7810555.0, + "step": 6500 + }, + { + "entropy": 1.8891053274273872, + "epoch": 0.020180430407351752, + "grad_norm": 14.992795944213867, + "learning_rate": 3.1526687978300886e-06, + "loss": 0.6291, + "mean_token_accuracy": 0.80573670566082, + "num_tokens": 7821921.0, + "step": 6510 + }, + { + "entropy": 1.9124128386378287, + "epoch": 0.02021142953240145, + "grad_norm": 12.522514343261719, + "learning_rate": 3.1575123510607385e-06, + "loss": 0.6336, + "mean_token_accuracy": 0.820842219889164, + "num_tokens": 7832746.0, + "step": 6520 + }, + { + "entropy": 1.7800809681415557, + "epoch": 0.020242428657451142, + "grad_norm": 11.537628173828125, + "learning_rate": 3.1623559042913884e-06, + "loss": 0.5485, + "mean_token_accuracy": 0.8302456393837929, + "num_tokens": 7845908.0, + "step": 6530 + }, + { + "entropy": 1.8512769252061845, + "epoch": 0.02027342778250084, + "grad_norm": 13.687551498413086, + "learning_rate": 3.1671994575220382e-06, + "loss": 0.594, + "mean_token_accuracy": 0.8206526070833207, + "num_tokens": 7857703.0, + "step": 6540 + }, + { + "entropy": 1.9494429141283036, + "epoch": 0.020304426907550535, + "grad_norm": 13.054556846618652, + "learning_rate": 3.1720430107526885e-06, + "loss": 0.6967, + "mean_token_accuracy": 0.7968014314770698, + "num_tokens": 7868457.0, + "step": 6550 + }, + { + "entropy": 1.872872059047222, + "epoch": 0.02033542603260023, + "grad_norm": 14.116209030151367, + "learning_rate": 3.1768865639833384e-06, + "loss": 0.5831, + "mean_token_accuracy": 0.8167121425271034, + "num_tokens": 7880101.0, + "step": 6560 + }, + { + "entropy": 1.8299372598528862, + "epoch": 0.020366425157649925, + "grad_norm": 15.94222354888916, + "learning_rate": 3.1817301172139887e-06, + "loss": 0.6118, + "mean_token_accuracy": 0.8142679139971734, + "num_tokens": 7892509.0, + "step": 6570 + }, + { + "entropy": 1.8935213565826416, + "epoch": 0.02039742428269962, + "grad_norm": 15.16163158416748, + "learning_rate": 3.1865736704446386e-06, + "loss": 0.6278, + "mean_token_accuracy": 0.8115665182471276, + "num_tokens": 7903535.0, + "step": 6580 + }, + { + "entropy": 1.737954817712307, + "epoch": 0.020428423407749315, + "grad_norm": 13.716304779052734, + "learning_rate": 3.191417223675288e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8255019202828408, + "num_tokens": 7917342.0, + "step": 6590 + }, + { + "entropy": 1.8174697816371919, + "epoch": 0.02045942253279901, + "grad_norm": 13.559125900268555, + "learning_rate": 3.1962607769059384e-06, + "loss": 0.5972, + "mean_token_accuracy": 0.8168426662683487, + "num_tokens": 7929488.0, + "step": 6600 + }, + { + "entropy": 1.8946778982877732, + "epoch": 0.020490421657848708, + "grad_norm": 15.61142635345459, + "learning_rate": 3.2011043301365887e-06, + "loss": 0.7263, + "mean_token_accuracy": 0.799173790216446, + "num_tokens": 7940278.0, + "step": 6610 + }, + { + "entropy": 1.8623188078403472, + "epoch": 0.020521420782898404, + "grad_norm": 11.181509017944336, + "learning_rate": 3.2059478833672386e-06, + "loss": 0.6354, + "mean_token_accuracy": 0.8031112432479859, + "num_tokens": 7951379.0, + "step": 6620 + }, + { + "entropy": 1.814198338985443, + "epoch": 0.020552419907948098, + "grad_norm": 13.728723526000977, + "learning_rate": 3.210791436597889e-06, + "loss": 0.6129, + "mean_token_accuracy": 0.8157623007893562, + "num_tokens": 7963714.0, + "step": 6630 + }, + { + "entropy": 1.7929986909031868, + "epoch": 0.020583419032997794, + "grad_norm": 15.344460487365723, + "learning_rate": 3.2156349898285383e-06, + "loss": 0.5832, + "mean_token_accuracy": 0.8098177522420883, + "num_tokens": 7976870.0, + "step": 6640 + }, + { + "entropy": 1.947634482383728, + "epoch": 0.02061441815804749, + "grad_norm": 14.946516036987305, + "learning_rate": 3.2204785430591882e-06, + "loss": 0.7049, + "mean_token_accuracy": 0.79284388422966, + "num_tokens": 7987426.0, + "step": 6650 + }, + { + "entropy": 1.8572285056114197, + "epoch": 0.020645417283097184, + "grad_norm": 6.301254749298096, + "learning_rate": 3.2253220962898385e-06, + "loss": 0.6301, + "mean_token_accuracy": 0.8085553154349328, + "num_tokens": 8000549.0, + "step": 6660 + }, + { + "entropy": 1.803759203851223, + "epoch": 0.02067641640814688, + "grad_norm": 12.17286491394043, + "learning_rate": 3.2301656495204884e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.8267651617527008, + "num_tokens": 8015000.0, + "step": 6670 + }, + { + "entropy": 1.8756111353635787, + "epoch": 0.020707415533196577, + "grad_norm": 15.4856595993042, + "learning_rate": 3.2350092027511387e-06, + "loss": 0.5969, + "mean_token_accuracy": 0.80719293653965, + "num_tokens": 8026773.0, + "step": 6680 + }, + { + "entropy": 1.9130279645323753, + "epoch": 0.02073841465824627, + "grad_norm": 14.601099014282227, + "learning_rate": 3.239852755981788e-06, + "loss": 0.6407, + "mean_token_accuracy": 0.8074927061796189, + "num_tokens": 8038375.0, + "step": 6690 + }, + { + "entropy": 1.84910279661417, + "epoch": 0.020769413783295967, + "grad_norm": 13.639535903930664, + "learning_rate": 3.2446963092124385e-06, + "loss": 0.6246, + "mean_token_accuracy": 0.8115042328834534, + "num_tokens": 8050746.0, + "step": 6700 + }, + { + "entropy": 1.9024951666593553, + "epoch": 0.020800412908345663, + "grad_norm": 11.334275245666504, + "learning_rate": 3.2495398624430884e-06, + "loss": 0.6458, + "mean_token_accuracy": 0.8086831450462342, + "num_tokens": 8062275.0, + "step": 6710 + }, + { + "entropy": 1.953022199869156, + "epoch": 0.020831412033395356, + "grad_norm": 12.958690643310547, + "learning_rate": 3.2543834156737387e-06, + "loss": 0.6934, + "mean_token_accuracy": 0.8028324991464615, + "num_tokens": 8073879.0, + "step": 6720 + }, + { + "entropy": 1.9028348535299302, + "epoch": 0.020862411158445053, + "grad_norm": 12.473337173461914, + "learning_rate": 3.2592269689043886e-06, + "loss": 0.6281, + "mean_token_accuracy": 0.808925811946392, + "num_tokens": 8086058.0, + "step": 6730 + }, + { + "entropy": 1.8883586995303632, + "epoch": 0.02089341028349475, + "grad_norm": 13.6912841796875, + "learning_rate": 3.2640705221350385e-06, + "loss": 0.5796, + "mean_token_accuracy": 0.8137388199567794, + "num_tokens": 8097481.0, + "step": 6740 + }, + { + "entropy": 1.8357958167791366, + "epoch": 0.020924409408544443, + "grad_norm": 12.956893920898438, + "learning_rate": 3.2689140753656883e-06, + "loss": 0.5703, + "mean_token_accuracy": 0.8275958806276321, + "num_tokens": 8109412.0, + "step": 6750 + }, + { + "entropy": 1.7908343866467475, + "epoch": 0.02095540853359414, + "grad_norm": 14.640457153320312, + "learning_rate": 3.2737576285963386e-06, + "loss": 0.5865, + "mean_token_accuracy": 0.8118628889322281, + "num_tokens": 8122837.0, + "step": 6760 + }, + { + "entropy": 1.7841477155685426, + "epoch": 0.020986407658643836, + "grad_norm": 12.808713912963867, + "learning_rate": 3.2786011818269885e-06, + "loss": 0.6055, + "mean_token_accuracy": 0.8140031367540359, + "num_tokens": 8136163.0, + "step": 6770 + }, + { + "entropy": 1.8466138496994973, + "epoch": 0.02101740678369353, + "grad_norm": 14.429435729980469, + "learning_rate": 3.283444735057639e-06, + "loss": 0.6075, + "mean_token_accuracy": 0.8045207351446152, + "num_tokens": 8148613.0, + "step": 6780 + }, + { + "entropy": 1.7560095891356469, + "epoch": 0.021048405908743226, + "grad_norm": 7.405726909637451, + "learning_rate": 3.2882882882882887e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8221228688955307, + "num_tokens": 8162089.0, + "step": 6790 + }, + { + "entropy": 1.9308898389339446, + "epoch": 0.021079405033792922, + "grad_norm": 14.129651069641113, + "learning_rate": 3.293131841518938e-06, + "loss": 0.6899, + "mean_token_accuracy": 0.8042490169405937, + "num_tokens": 8173779.0, + "step": 6800 + }, + { + "entropy": 1.9463881149888038, + "epoch": 0.021110404158842615, + "grad_norm": 8.938533782958984, + "learning_rate": 3.2979753947495885e-06, + "loss": 0.6966, + "mean_token_accuracy": 0.8073234111070633, + "num_tokens": 8184942.0, + "step": 6810 + }, + { + "entropy": 1.8929473131895065, + "epoch": 0.021141403283892312, + "grad_norm": 17.423473358154297, + "learning_rate": 3.302818947980239e-06, + "loss": 0.7093, + "mean_token_accuracy": 0.7885128021240234, + "num_tokens": 8196565.0, + "step": 6820 + }, + { + "entropy": 1.8256116762757302, + "epoch": 0.02117240240894201, + "grad_norm": 11.359122276306152, + "learning_rate": 3.3076625012108887e-06, + "loss": 0.6383, + "mean_token_accuracy": 0.8059591919183731, + "num_tokens": 8208383.0, + "step": 6830 + }, + { + "entropy": 1.8639884352684022, + "epoch": 0.021203401533991702, + "grad_norm": 14.2997407913208, + "learning_rate": 3.312506054441539e-06, + "loss": 0.6282, + "mean_token_accuracy": 0.8068465679883957, + "num_tokens": 8220104.0, + "step": 6840 + }, + { + "entropy": 1.9167477533221244, + "epoch": 0.0212344006590414, + "grad_norm": 6.671177387237549, + "learning_rate": 3.3173496076721885e-06, + "loss": 0.6529, + "mean_token_accuracy": 0.8033589154481888, + "num_tokens": 8231370.0, + "step": 6850 + }, + { + "entropy": 1.8746015638113023, + "epoch": 0.021265399784091095, + "grad_norm": 14.27453327178955, + "learning_rate": 3.3221931609028383e-06, + "loss": 0.6251, + "mean_token_accuracy": 0.8159545630216598, + "num_tokens": 8243023.0, + "step": 6860 + }, + { + "entropy": 1.917281800508499, + "epoch": 0.021296398909140788, + "grad_norm": 11.446333885192871, + "learning_rate": 3.3270367141334886e-06, + "loss": 0.6558, + "mean_token_accuracy": 0.7994436591863632, + "num_tokens": 8254401.0, + "step": 6870 + }, + { + "entropy": 1.8957316786050797, + "epoch": 0.021327398034190485, + "grad_norm": 11.739078521728516, + "learning_rate": 3.3318802673641385e-06, + "loss": 0.7036, + "mean_token_accuracy": 0.8000991299748421, + "num_tokens": 8265419.0, + "step": 6880 + }, + { + "entropy": 1.7615302629768848, + "epoch": 0.02135839715924018, + "grad_norm": 5.259082317352295, + "learning_rate": 3.336723820594789e-06, + "loss": 0.5323, + "mean_token_accuracy": 0.8190023839473725, + "num_tokens": 8278764.0, + "step": 6890 + }, + { + "entropy": 1.7910688430070878, + "epoch": 0.021389396284289874, + "grad_norm": 12.807435035705566, + "learning_rate": 3.3415673738254383e-06, + "loss": 0.5615, + "mean_token_accuracy": 0.8227502018213272, + "num_tokens": 8291327.0, + "step": 6900 + }, + { + "entropy": 1.7641884714365006, + "epoch": 0.02142039540933957, + "grad_norm": 12.872736930847168, + "learning_rate": 3.3464109270560886e-06, + "loss": 0.5901, + "mean_token_accuracy": 0.8182562112808227, + "num_tokens": 8303703.0, + "step": 6910 + }, + { + "entropy": 1.8527298361063003, + "epoch": 0.021451394534389268, + "grad_norm": 14.935564041137695, + "learning_rate": 3.3512544802867385e-06, + "loss": 0.6421, + "mean_token_accuracy": 0.8045241579413414, + "num_tokens": 8315526.0, + "step": 6920 + }, + { + "entropy": 1.8295287489891052, + "epoch": 0.02148239365943896, + "grad_norm": 13.789701461791992, + "learning_rate": 3.356098033517389e-06, + "loss": 0.6008, + "mean_token_accuracy": 0.8134370803833008, + "num_tokens": 8327672.0, + "step": 6930 + }, + { + "entropy": 1.896050798892975, + "epoch": 0.021513392784488657, + "grad_norm": 12.54214859008789, + "learning_rate": 3.3609415867480387e-06, + "loss": 0.6978, + "mean_token_accuracy": 0.7970891669392586, + "num_tokens": 8339084.0, + "step": 6940 + }, + { + "entropy": 1.8715087831020356, + "epoch": 0.021544391909538354, + "grad_norm": 13.210384368896484, + "learning_rate": 3.3657851399786886e-06, + "loss": 0.6879, + "mean_token_accuracy": 0.7981110483407974, + "num_tokens": 8350484.0, + "step": 6950 + }, + { + "entropy": 1.747877648472786, + "epoch": 0.021575391034588047, + "grad_norm": 13.573982238769531, + "learning_rate": 3.3706286932093384e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8265791207551956, + "num_tokens": 8363033.0, + "step": 6960 + }, + { + "entropy": 1.7737156122922897, + "epoch": 0.021606390159637744, + "grad_norm": 16.360973358154297, + "learning_rate": 3.3754722464399888e-06, + "loss": 0.5927, + "mean_token_accuracy": 0.8136523142457008, + "num_tokens": 8375536.0, + "step": 6970 + }, + { + "entropy": 1.7883441895246506, + "epoch": 0.02163738928468744, + "grad_norm": 13.998126029968262, + "learning_rate": 3.3803157996706386e-06, + "loss": 0.5853, + "mean_token_accuracy": 0.8114990651607513, + "num_tokens": 8388141.0, + "step": 6980 + }, + { + "entropy": 1.9003767311573028, + "epoch": 0.021668388409737137, + "grad_norm": 17.087512969970703, + "learning_rate": 3.385159352901289e-06, + "loss": 0.677, + "mean_token_accuracy": 0.8007842287421226, + "num_tokens": 8399614.0, + "step": 6990 + }, + { + "entropy": 1.8496545001864433, + "epoch": 0.02169938753478683, + "grad_norm": 11.947819709777832, + "learning_rate": 3.390002906131939e-06, + "loss": 0.5943, + "mean_token_accuracy": 0.8178739234805107, + "num_tokens": 8410656.0, + "step": 7000 + }, + { + "entropy": 1.8931600719690322, + "epoch": 0.021730386659836527, + "grad_norm": 12.223316192626953, + "learning_rate": 3.3948464593625883e-06, + "loss": 0.6515, + "mean_token_accuracy": 0.804112882912159, + "num_tokens": 8422509.0, + "step": 7010 + }, + { + "entropy": 1.8370696052908897, + "epoch": 0.021761385784886223, + "grad_norm": 13.245317459106445, + "learning_rate": 3.3996900125932386e-06, + "loss": 0.6179, + "mean_token_accuracy": 0.8125099033117295, + "num_tokens": 8434469.0, + "step": 7020 + }, + { + "entropy": 1.9446087509393692, + "epoch": 0.021792384909935916, + "grad_norm": 13.750452995300293, + "learning_rate": 3.404533565823889e-06, + "loss": 0.6823, + "mean_token_accuracy": 0.793831068277359, + "num_tokens": 8445989.0, + "step": 7030 + }, + { + "entropy": 1.8339137956500053, + "epoch": 0.021823384034985613, + "grad_norm": 12.146233558654785, + "learning_rate": 3.409377119054539e-06, + "loss": 0.585, + "mean_token_accuracy": 0.8186416819691658, + "num_tokens": 8459049.0, + "step": 7040 + }, + { + "entropy": 1.9154435515403747, + "epoch": 0.02185438316003531, + "grad_norm": 13.75799560546875, + "learning_rate": 3.414220672285189e-06, + "loss": 0.6339, + "mean_token_accuracy": 0.8105358377099037, + "num_tokens": 8470694.0, + "step": 7050 + }, + { + "entropy": 1.8546890705823897, + "epoch": 0.021885382285085003, + "grad_norm": 14.610628128051758, + "learning_rate": 3.4190642255158386e-06, + "loss": 0.6188, + "mean_token_accuracy": 0.8084279328584671, + "num_tokens": 8483926.0, + "step": 7060 + }, + { + "entropy": 1.9373271316289902, + "epoch": 0.0219163814101347, + "grad_norm": 13.321712493896484, + "learning_rate": 3.4239077787464884e-06, + "loss": 0.635, + "mean_token_accuracy": 0.7939963206648827, + "num_tokens": 8495258.0, + "step": 7070 + }, + { + "entropy": 1.8608694806694985, + "epoch": 0.021947380535184396, + "grad_norm": 12.879969596862793, + "learning_rate": 3.4287513319771387e-06, + "loss": 0.5935, + "mean_token_accuracy": 0.8156390026211738, + "num_tokens": 8507779.0, + "step": 7080 + }, + { + "entropy": 1.9322496995329856, + "epoch": 0.02197837966023409, + "grad_norm": 15.521547317504883, + "learning_rate": 3.4335948852077886e-06, + "loss": 0.65, + "mean_token_accuracy": 0.8115120708942414, + "num_tokens": 8519474.0, + "step": 7090 + }, + { + "entropy": 1.7319314986467362, + "epoch": 0.022009378785283785, + "grad_norm": 16.432004928588867, + "learning_rate": 3.438438438438439e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8291773125529289, + "num_tokens": 8533400.0, + "step": 7100 + }, + { + "entropy": 1.8349164828658104, + "epoch": 0.022040377910333482, + "grad_norm": 4.809366703033447, + "learning_rate": 3.4432819916690884e-06, + "loss": 0.598, + "mean_token_accuracy": 0.809609466791153, + "num_tokens": 8547001.0, + "step": 7110 + }, + { + "entropy": 1.8549170672893525, + "epoch": 0.022071377035383175, + "grad_norm": 12.525724411010742, + "learning_rate": 3.4481255448997387e-06, + "loss": 0.5441, + "mean_token_accuracy": 0.8268691346049308, + "num_tokens": 8559045.0, + "step": 7120 + }, + { + "entropy": 1.8226833492517471, + "epoch": 0.022102376160432872, + "grad_norm": 11.418012619018555, + "learning_rate": 3.4529690981303886e-06, + "loss": 0.5714, + "mean_token_accuracy": 0.8124491006135941, + "num_tokens": 8572509.0, + "step": 7130 + }, + { + "entropy": 1.7938767224550247, + "epoch": 0.02213337528548257, + "grad_norm": 14.121147155761719, + "learning_rate": 3.457812651361039e-06, + "loss": 0.5331, + "mean_token_accuracy": 0.8254873216152191, + "num_tokens": 8585878.0, + "step": 7140 + }, + { + "entropy": 1.7850069940090179, + "epoch": 0.02216437441053226, + "grad_norm": 3.6775126457214355, + "learning_rate": 3.4626562045916888e-06, + "loss": 0.563, + "mean_token_accuracy": 0.8238705024123192, + "num_tokens": 8599106.0, + "step": 7150 + }, + { + "entropy": 1.8556845039129257, + "epoch": 0.022195373535581958, + "grad_norm": 11.72307300567627, + "learning_rate": 3.4674997578223387e-06, + "loss": 0.5998, + "mean_token_accuracy": 0.8144358694553375, + "num_tokens": 8611671.0, + "step": 7160 + }, + { + "entropy": 1.9032898783683776, + "epoch": 0.022226372660631655, + "grad_norm": 15.8334379196167, + "learning_rate": 3.4723433110529886e-06, + "loss": 0.6952, + "mean_token_accuracy": 0.7943542674183846, + "num_tokens": 8623666.0, + "step": 7170 + }, + { + "entropy": 1.8691613361239434, + "epoch": 0.022257371785681348, + "grad_norm": 11.972046852111816, + "learning_rate": 3.477186864283639e-06, + "loss": 0.6065, + "mean_token_accuracy": 0.8114256381988525, + "num_tokens": 8635505.0, + "step": 7180 + }, + { + "entropy": 1.8365458711981772, + "epoch": 0.022288370910731044, + "grad_norm": 14.07669448852539, + "learning_rate": 3.4820304175142887e-06, + "loss": 0.5641, + "mean_token_accuracy": 0.8142189055681228, + "num_tokens": 8648420.0, + "step": 7190 + }, + { + "entropy": 1.8941682323813438, + "epoch": 0.02231937003578074, + "grad_norm": 12.625129699707031, + "learning_rate": 3.486873970744939e-06, + "loss": 0.6311, + "mean_token_accuracy": 0.8075986996293067, + "num_tokens": 8661128.0, + "step": 7200 + }, + { + "entropy": 1.8504259541630745, + "epoch": 0.022350369160830434, + "grad_norm": 6.088210105895996, + "learning_rate": 3.491717523975589e-06, + "loss": 0.6102, + "mean_token_accuracy": 0.8124124839901924, + "num_tokens": 8672830.0, + "step": 7210 + }, + { + "entropy": 1.8653683230280875, + "epoch": 0.02238136828588013, + "grad_norm": 15.06690502166748, + "learning_rate": 3.496561077206239e-06, + "loss": 0.6134, + "mean_token_accuracy": 0.8111754149198532, + "num_tokens": 8684754.0, + "step": 7220 + }, + { + "entropy": 1.9144769728183746, + "epoch": 0.022412367410929827, + "grad_norm": 13.07332992553711, + "learning_rate": 3.5014046304368887e-06, + "loss": 0.7025, + "mean_token_accuracy": 0.8014708399772644, + "num_tokens": 8695969.0, + "step": 7230 + }, + { + "entropy": 1.8702037513256073, + "epoch": 0.02244336653597952, + "grad_norm": 13.671896934509277, + "learning_rate": 3.506248183667539e-06, + "loss": 0.657, + "mean_token_accuracy": 0.809376485645771, + "num_tokens": 8707677.0, + "step": 7240 + }, + { + "entropy": 1.9119601994752884, + "epoch": 0.022474365661029217, + "grad_norm": 16.43385124206543, + "learning_rate": 3.511091736898189e-06, + "loss": 0.6931, + "mean_token_accuracy": 0.7956320688128471, + "num_tokens": 8718648.0, + "step": 7250 + }, + { + "entropy": 1.8430123403668404, + "epoch": 0.022505364786078914, + "grad_norm": 5.884398937225342, + "learning_rate": 3.515935290128839e-06, + "loss": 0.6187, + "mean_token_accuracy": 0.8040406808257103, + "num_tokens": 8731028.0, + "step": 7260 + }, + { + "entropy": 1.7957608908414842, + "epoch": 0.022536363911128607, + "grad_norm": 6.280706882476807, + "learning_rate": 3.5207788433594887e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.8192771345376968, + "num_tokens": 8743771.0, + "step": 7270 + }, + { + "entropy": 1.8952230989933014, + "epoch": 0.022567363036178303, + "grad_norm": 11.960763931274414, + "learning_rate": 3.5256223965901385e-06, + "loss": 0.6716, + "mean_token_accuracy": 0.7986912429332733, + "num_tokens": 8754987.0, + "step": 7280 + }, + { + "entropy": 1.7763853132724763, + "epoch": 0.022598362161228, + "grad_norm": 11.904229164123535, + "learning_rate": 3.530465949820789e-06, + "loss": 0.5633, + "mean_token_accuracy": 0.8182390749454498, + "num_tokens": 8767593.0, + "step": 7290 + }, + { + "entropy": 1.8223341763019563, + "epoch": 0.022629361286277693, + "grad_norm": 13.150022506713867, + "learning_rate": 3.5353095030514387e-06, + "loss": 0.6274, + "mean_token_accuracy": 0.8080811381340027, + "num_tokens": 8779386.0, + "step": 7300 + }, + { + "entropy": 1.8453481420874596, + "epoch": 0.02266036041132739, + "grad_norm": 16.510391235351562, + "learning_rate": 3.540153056282089e-06, + "loss": 0.6204, + "mean_token_accuracy": 0.8153272330760956, + "num_tokens": 8791800.0, + "step": 7310 + }, + { + "entropy": 1.8597090423107148, + "epoch": 0.022691359536377086, + "grad_norm": 11.312858581542969, + "learning_rate": 3.5449966095127385e-06, + "loss": 0.6451, + "mean_token_accuracy": 0.8115374326705933, + "num_tokens": 8802831.0, + "step": 7320 + }, + { + "entropy": 1.7629368484020234, + "epoch": 0.022722358661426783, + "grad_norm": 10.922659873962402, + "learning_rate": 3.549840162743389e-06, + "loss": 0.5833, + "mean_token_accuracy": 0.8109967395663261, + "num_tokens": 8816085.0, + "step": 7330 + }, + { + "entropy": 1.8767753183841704, + "epoch": 0.022753357786476476, + "grad_norm": 12.062516212463379, + "learning_rate": 3.5546837159740387e-06, + "loss": 0.6765, + "mean_token_accuracy": 0.7968204706907273, + "num_tokens": 8828125.0, + "step": 7340 + }, + { + "entropy": 1.798590750992298, + "epoch": 0.022784356911526173, + "grad_norm": 6.616394996643066, + "learning_rate": 3.559527269204689e-06, + "loss": 0.534, + "mean_token_accuracy": 0.8197192326188087, + "num_tokens": 8840873.0, + "step": 7350 + }, + { + "entropy": 1.8430965647101403, + "epoch": 0.02281535603657587, + "grad_norm": 12.304119110107422, + "learning_rate": 3.564370822435339e-06, + "loss": 0.5905, + "mean_token_accuracy": 0.8144068956375122, + "num_tokens": 8853322.0, + "step": 7360 + }, + { + "entropy": 1.8360921517014503, + "epoch": 0.022846355161625562, + "grad_norm": 13.151806831359863, + "learning_rate": 3.5692143756659888e-06, + "loss": 0.5766, + "mean_token_accuracy": 0.8184613898396492, + "num_tokens": 8866377.0, + "step": 7370 + }, + { + "entropy": 1.8777944207191468, + "epoch": 0.02287735428667526, + "grad_norm": 11.325216293334961, + "learning_rate": 3.5740579288966387e-06, + "loss": 0.6267, + "mean_token_accuracy": 0.8124101653695106, + "num_tokens": 8877572.0, + "step": 7380 + }, + { + "entropy": 1.8015395179390907, + "epoch": 0.022908353411724956, + "grad_norm": 13.632472038269043, + "learning_rate": 3.578901482127289e-06, + "loss": 0.5888, + "mean_token_accuracy": 0.8304689675569534, + "num_tokens": 8890279.0, + "step": 7390 + }, + { + "entropy": 1.9334001183509826, + "epoch": 0.02293935253677465, + "grad_norm": 18.318246841430664, + "learning_rate": 3.583745035357939e-06, + "loss": 0.6549, + "mean_token_accuracy": 0.8056276828050614, + "num_tokens": 8901650.0, + "step": 7400 + }, + { + "entropy": 1.8892749726772309, + "epoch": 0.022970351661824345, + "grad_norm": 13.172524452209473, + "learning_rate": 3.588588588588589e-06, + "loss": 0.6269, + "mean_token_accuracy": 0.8132000923156738, + "num_tokens": 8912513.0, + "step": 7410 + }, + { + "entropy": 1.9355769097805022, + "epoch": 0.023001350786874042, + "grad_norm": 12.621254920959473, + "learning_rate": 3.593432141819239e-06, + "loss": 0.6664, + "mean_token_accuracy": 0.8047688767313957, + "num_tokens": 8923668.0, + "step": 7420 + }, + { + "entropy": 1.7977433398365974, + "epoch": 0.023032349911923735, + "grad_norm": 14.412604331970215, + "learning_rate": 3.598275695049889e-06, + "loss": 0.5684, + "mean_token_accuracy": 0.8217224717140198, + "num_tokens": 8936743.0, + "step": 7430 + }, + { + "entropy": 1.8730682261288165, + "epoch": 0.02306334903697343, + "grad_norm": 18.06209373474121, + "learning_rate": 3.603119248280539e-06, + "loss": 0.687, + "mean_token_accuracy": 0.7976897180080413, + "num_tokens": 8948361.0, + "step": 7440 + }, + { + "entropy": 1.8379228845238686, + "epoch": 0.023094348162023128, + "grad_norm": 16.5709171295166, + "learning_rate": 3.607962801511189e-06, + "loss": 0.647, + "mean_token_accuracy": 0.8052407920360565, + "num_tokens": 8960081.0, + "step": 7450 + }, + { + "entropy": 1.8426879778504373, + "epoch": 0.02312534728707282, + "grad_norm": 6.525999546051025, + "learning_rate": 3.612806354741839e-06, + "loss": 0.6043, + "mean_token_accuracy": 0.8133143231272697, + "num_tokens": 8972165.0, + "step": 7460 + }, + { + "entropy": 1.8481390669941902, + "epoch": 0.023156346412122518, + "grad_norm": 11.893180847167969, + "learning_rate": 3.6176499079724893e-06, + "loss": 0.5844, + "mean_token_accuracy": 0.8177680626511574, + "num_tokens": 8983683.0, + "step": 7470 + }, + { + "entropy": 1.8435255289077759, + "epoch": 0.023187345537172215, + "grad_norm": 13.133444786071777, + "learning_rate": 3.6224934612031388e-06, + "loss": 0.6272, + "mean_token_accuracy": 0.8149840265512467, + "num_tokens": 8994970.0, + "step": 7480 + }, + { + "entropy": 1.8090941041707993, + "epoch": 0.023218344662221908, + "grad_norm": 5.264407157897949, + "learning_rate": 3.6273370144337886e-06, + "loss": 0.6154, + "mean_token_accuracy": 0.8030720934271812, + "num_tokens": 9007512.0, + "step": 7490 + }, + { + "entropy": 1.8767491683363915, + "epoch": 0.023249343787271604, + "grad_norm": 12.091042518615723, + "learning_rate": 3.632180567664439e-06, + "loss": 0.6655, + "mean_token_accuracy": 0.8082658454775811, + "num_tokens": 9018984.0, + "step": 7500 + }, + { + "entropy": 1.8025119572877883, + "epoch": 0.0232803429123213, + "grad_norm": 14.580635070800781, + "learning_rate": 3.637024120895089e-06, + "loss": 0.5927, + "mean_token_accuracy": 0.8104995712637901, + "num_tokens": 9032061.0, + "step": 7510 + }, + { + "entropy": 1.807465235888958, + "epoch": 0.023311342037370994, + "grad_norm": 12.528902053833008, + "learning_rate": 3.641867674125739e-06, + "loss": 0.5885, + "mean_token_accuracy": 0.8124234288930893, + "num_tokens": 9044338.0, + "step": 7520 + }, + { + "entropy": 1.8326406210660935, + "epoch": 0.02334234116242069, + "grad_norm": 15.056619644165039, + "learning_rate": 3.6467112273563886e-06, + "loss": 0.5776, + "mean_token_accuracy": 0.8178173303604126, + "num_tokens": 9056355.0, + "step": 7530 + }, + { + "entropy": 1.7988777324557303, + "epoch": 0.023373340287470387, + "grad_norm": 12.382633209228516, + "learning_rate": 3.651554780587039e-06, + "loss": 0.638, + "mean_token_accuracy": 0.8013660088181496, + "num_tokens": 9068937.0, + "step": 7540 + }, + { + "entropy": 1.8599283427000046, + "epoch": 0.02340433941252008, + "grad_norm": 11.771322250366211, + "learning_rate": 3.656398333817689e-06, + "loss": 0.644, + "mean_token_accuracy": 0.8095093712210655, + "num_tokens": 9080598.0, + "step": 7550 + }, + { + "entropy": 1.9167393915355206, + "epoch": 0.023435338537569777, + "grad_norm": 15.524273872375488, + "learning_rate": 3.661241887048339e-06, + "loss": 0.6336, + "mean_token_accuracy": 0.807981975376606, + "num_tokens": 9091965.0, + "step": 7560 + }, + { + "entropy": 1.9095857173204422, + "epoch": 0.023466337662619473, + "grad_norm": 11.96094799041748, + "learning_rate": 3.666085440278989e-06, + "loss": 0.6676, + "mean_token_accuracy": 0.7986720770597457, + "num_tokens": 9103815.0, + "step": 7570 + }, + { + "entropy": 1.8987512320280076, + "epoch": 0.023497336787669167, + "grad_norm": 13.277989387512207, + "learning_rate": 3.670928993509639e-06, + "loss": 0.6695, + "mean_token_accuracy": 0.8065518125891685, + "num_tokens": 9115160.0, + "step": 7580 + }, + { + "entropy": 1.8922671616077422, + "epoch": 0.023528335912718863, + "grad_norm": 13.145485877990723, + "learning_rate": 3.6757725467402888e-06, + "loss": 0.685, + "mean_token_accuracy": 0.8018161416053772, + "num_tokens": 9126074.0, + "step": 7590 + }, + { + "entropy": 1.8771102100610733, + "epoch": 0.02355933503776856, + "grad_norm": 10.288253784179688, + "learning_rate": 3.680616099970939e-06, + "loss": 0.5876, + "mean_token_accuracy": 0.8214558124542236, + "num_tokens": 9138014.0, + "step": 7600 + }, + { + "entropy": 1.8878172472119332, + "epoch": 0.023590334162818253, + "grad_norm": 12.76760482788086, + "learning_rate": 3.685459653201589e-06, + "loss": 0.6105, + "mean_token_accuracy": 0.8086960777640343, + "num_tokens": 9149156.0, + "step": 7610 + }, + { + "entropy": 1.8934050709009171, + "epoch": 0.02362133328786795, + "grad_norm": 12.631563186645508, + "learning_rate": 3.6903032064322393e-06, + "loss": 0.7241, + "mean_token_accuracy": 0.7934870898723603, + "num_tokens": 9161339.0, + "step": 7620 + }, + { + "entropy": 1.9421094968914985, + "epoch": 0.023652332412917646, + "grad_norm": 14.10696792602539, + "learning_rate": 3.695146759662889e-06, + "loss": 0.6605, + "mean_token_accuracy": 0.7969638511538506, + "num_tokens": 9172617.0, + "step": 7630 + }, + { + "entropy": 1.8514475598931313, + "epoch": 0.02368333153796734, + "grad_norm": 13.774374008178711, + "learning_rate": 3.699990312893539e-06, + "loss": 0.6989, + "mean_token_accuracy": 0.7987961351871491, + "num_tokens": 9184783.0, + "step": 7640 + }, + { + "entropy": 1.7891797423362732, + "epoch": 0.023714330663017036, + "grad_norm": 5.731255054473877, + "learning_rate": 3.704833866124189e-06, + "loss": 0.595, + "mean_token_accuracy": 0.8199530601501465, + "num_tokens": 9197781.0, + "step": 7650 + }, + { + "entropy": 1.8698611691594125, + "epoch": 0.023745329788066732, + "grad_norm": 12.50722599029541, + "learning_rate": 3.7096774193548392e-06, + "loss": 0.5373, + "mean_token_accuracy": 0.822203965485096, + "num_tokens": 9210004.0, + "step": 7660 + }, + { + "entropy": 1.9275674849748612, + "epoch": 0.023776328913116426, + "grad_norm": 11.671341896057129, + "learning_rate": 3.714520972585489e-06, + "loss": 0.6877, + "mean_token_accuracy": 0.7972215577960015, + "num_tokens": 9221045.0, + "step": 7670 + }, + { + "entropy": 1.8518989741802216, + "epoch": 0.023807328038166122, + "grad_norm": 12.143589973449707, + "learning_rate": 3.7193645258161394e-06, + "loss": 0.5829, + "mean_token_accuracy": 0.8160637483000756, + "num_tokens": 9233582.0, + "step": 7680 + }, + { + "entropy": 1.9745838671922684, + "epoch": 0.02383832716321582, + "grad_norm": 8.162403106689453, + "learning_rate": 3.724208079046789e-06, + "loss": 0.6552, + "mean_token_accuracy": 0.8079164877533913, + "num_tokens": 9244759.0, + "step": 7690 + }, + { + "entropy": 1.906966118514538, + "epoch": 0.023869326288265515, + "grad_norm": 5.672482490539551, + "learning_rate": 3.7290516322774388e-06, + "loss": 0.6534, + "mean_token_accuracy": 0.8026331990957261, + "num_tokens": 9256899.0, + "step": 7700 + }, + { + "entropy": 1.8956855058670044, + "epoch": 0.02390032541331521, + "grad_norm": 11.346663475036621, + "learning_rate": 3.733895185508089e-06, + "loss": 0.5923, + "mean_token_accuracy": 0.8169514790177346, + "num_tokens": 9268280.0, + "step": 7710 + }, + { + "entropy": 1.9229377016425133, + "epoch": 0.023931324538364905, + "grad_norm": 6.657054424285889, + "learning_rate": 3.7387387387387394e-06, + "loss": 0.6111, + "mean_token_accuracy": 0.8021419748663903, + "num_tokens": 9280505.0, + "step": 7720 + }, + { + "entropy": 1.7850939154624939, + "epoch": 0.0239623236634146, + "grad_norm": 3.4274275302886963, + "learning_rate": 3.7435822919693893e-06, + "loss": 0.5326, + "mean_token_accuracy": 0.829072143137455, + "num_tokens": 9293099.0, + "step": 7730 + }, + { + "entropy": 1.9557977497577668, + "epoch": 0.023993322788464295, + "grad_norm": 13.326607704162598, + "learning_rate": 3.7484258452000387e-06, + "loss": 0.6884, + "mean_token_accuracy": 0.798516571521759, + "num_tokens": 9304408.0, + "step": 7740 + }, + { + "entropy": 1.8509003862738609, + "epoch": 0.02402432191351399, + "grad_norm": 13.444816589355469, + "learning_rate": 3.753269398430689e-06, + "loss": 0.6127, + "mean_token_accuracy": 0.8102645307779313, + "num_tokens": 9316689.0, + "step": 7750 + }, + { + "entropy": 1.7711122930049896, + "epoch": 0.024055321038563688, + "grad_norm": 5.156175136566162, + "learning_rate": 3.758112951661339e-06, + "loss": 0.5507, + "mean_token_accuracy": 0.8136220246553421, + "num_tokens": 9329593.0, + "step": 7760 + }, + { + "entropy": 1.8134596601128579, + "epoch": 0.02408632016361338, + "grad_norm": 12.387152671813965, + "learning_rate": 3.7629565048919892e-06, + "loss": 0.5433, + "mean_token_accuracy": 0.8308922097086906, + "num_tokens": 9341486.0, + "step": 7770 + }, + { + "entropy": 1.8755627959966659, + "epoch": 0.024117319288663078, + "grad_norm": 12.338440895080566, + "learning_rate": 3.767800058122639e-06, + "loss": 0.6576, + "mean_token_accuracy": 0.8139973327517509, + "num_tokens": 9352093.0, + "step": 7780 + }, + { + "entropy": 1.7985035717487334, + "epoch": 0.024148318413712774, + "grad_norm": 15.153599739074707, + "learning_rate": 3.772643611353289e-06, + "loss": 0.5679, + "mean_token_accuracy": 0.8182680875062942, + "num_tokens": 9364468.0, + "step": 7790 + }, + { + "entropy": 1.841192065924406, + "epoch": 0.024179317538762467, + "grad_norm": 4.2616729736328125, + "learning_rate": 3.777487164583939e-06, + "loss": 0.5846, + "mean_token_accuracy": 0.8146574392914772, + "num_tokens": 9377926.0, + "step": 7800 + }, + { + "entropy": 1.920288896560669, + "epoch": 0.024210316663812164, + "grad_norm": 14.35584545135498, + "learning_rate": 3.782330717814589e-06, + "loss": 0.7463, + "mean_token_accuracy": 0.7909737974405289, + "num_tokens": 9389642.0, + "step": 7810 + }, + { + "entropy": 1.8628991067409515, + "epoch": 0.02424131578886186, + "grad_norm": 15.304876327514648, + "learning_rate": 3.787174271045239e-06, + "loss": 0.586, + "mean_token_accuracy": 0.8081997647881508, + "num_tokens": 9402632.0, + "step": 7820 + }, + { + "entropy": 1.9279965221881867, + "epoch": 0.024272314913911554, + "grad_norm": 14.144513130187988, + "learning_rate": 3.7920178242758894e-06, + "loss": 0.6125, + "mean_token_accuracy": 0.8075599849224091, + "num_tokens": 9414752.0, + "step": 7830 + }, + { + "entropy": 1.8668213859200478, + "epoch": 0.02430331403896125, + "grad_norm": 13.267974853515625, + "learning_rate": 3.7968613775065392e-06, + "loss": 0.6041, + "mean_token_accuracy": 0.8120471283793449, + "num_tokens": 9426149.0, + "step": 7840 + }, + { + "entropy": 1.9365358769893646, + "epoch": 0.024334313164010947, + "grad_norm": 13.412445068359375, + "learning_rate": 3.801704930737189e-06, + "loss": 0.6644, + "mean_token_accuracy": 0.8024364963173867, + "num_tokens": 9437306.0, + "step": 7850 + }, + { + "entropy": 1.9267346888780594, + "epoch": 0.02436531228906064, + "grad_norm": 13.718780517578125, + "learning_rate": 3.806548483967839e-06, + "loss": 0.6209, + "mean_token_accuracy": 0.7963616490364075, + "num_tokens": 9449401.0, + "step": 7860 + }, + { + "entropy": 1.8495199769735335, + "epoch": 0.024396311414110337, + "grad_norm": 11.75393009185791, + "learning_rate": 3.8113920371984893e-06, + "loss": 0.5303, + "mean_token_accuracy": 0.8257737800478935, + "num_tokens": 9461930.0, + "step": 7870 + }, + { + "entropy": 1.8680783659219742, + "epoch": 0.024427310539160033, + "grad_norm": 10.541101455688477, + "learning_rate": 3.81623559042914e-06, + "loss": 0.5844, + "mean_token_accuracy": 0.8095761522650718, + "num_tokens": 9473753.0, + "step": 7880 + }, + { + "entropy": 1.874305109679699, + "epoch": 0.024458309664209726, + "grad_norm": 17.521041870117188, + "learning_rate": 3.8210791436597895e-06, + "loss": 0.5836, + "mean_token_accuracy": 0.8210688814520836, + "num_tokens": 9486165.0, + "step": 7890 + }, + { + "entropy": 1.9577010348439217, + "epoch": 0.024489308789259423, + "grad_norm": 11.793618202209473, + "learning_rate": 3.8259226968904386e-06, + "loss": 0.656, + "mean_token_accuracy": 0.8024436876177787, + "num_tokens": 9497697.0, + "step": 7900 + }, + { + "entropy": 1.9006599888205529, + "epoch": 0.02452030791430912, + "grad_norm": 13.784941673278809, + "learning_rate": 3.830766250121089e-06, + "loss": 0.5801, + "mean_token_accuracy": 0.8112906947731972, + "num_tokens": 9510027.0, + "step": 7910 + }, + { + "entropy": 1.943654653429985, + "epoch": 0.024551307039358813, + "grad_norm": 12.328388214111328, + "learning_rate": 3.835609803351739e-06, + "loss": 0.6456, + "mean_token_accuracy": 0.8070406153798103, + "num_tokens": 9521309.0, + "step": 7920 + }, + { + "entropy": 1.8668604627251626, + "epoch": 0.02458230616440851, + "grad_norm": 13.092666625976562, + "learning_rate": 3.840453356582389e-06, + "loss": 0.6008, + "mean_token_accuracy": 0.8188393861055374, + "num_tokens": 9533756.0, + "step": 7930 + }, + { + "entropy": 1.8963513985276221, + "epoch": 0.024613305289458206, + "grad_norm": 12.246744155883789, + "learning_rate": 3.84529690981304e-06, + "loss": 0.6602, + "mean_token_accuracy": 0.8034892365336418, + "num_tokens": 9545338.0, + "step": 7940 + }, + { + "entropy": 1.8683793991804123, + "epoch": 0.0246443044145079, + "grad_norm": 5.166250228881836, + "learning_rate": 3.850140463043689e-06, + "loss": 0.6427, + "mean_token_accuracy": 0.8070285364985466, + "num_tokens": 9558352.0, + "step": 7950 + }, + { + "entropy": 1.848561166226864, + "epoch": 0.024675303539557596, + "grad_norm": 12.724311828613281, + "learning_rate": 3.854984016274339e-06, + "loss": 0.603, + "mean_token_accuracy": 0.8135858446359634, + "num_tokens": 9570765.0, + "step": 7960 + }, + { + "entropy": 1.9112862929701806, + "epoch": 0.024706302664607292, + "grad_norm": 12.425032615661621, + "learning_rate": 3.8598275695049894e-06, + "loss": 0.6219, + "mean_token_accuracy": 0.8120498448610306, + "num_tokens": 9581809.0, + "step": 7970 + }, + { + "entropy": 1.9325566202402116, + "epoch": 0.024737301789656985, + "grad_norm": 12.291463851928711, + "learning_rate": 3.864671122735639e-06, + "loss": 0.7302, + "mean_token_accuracy": 0.7900342762470245, + "num_tokens": 9592614.0, + "step": 7980 + }, + { + "entropy": 1.8553043097257613, + "epoch": 0.024768300914706682, + "grad_norm": 14.992151260375977, + "learning_rate": 3.869514675966289e-06, + "loss": 0.6167, + "mean_token_accuracy": 0.8028819039463997, + "num_tokens": 9604643.0, + "step": 7990 + }, + { + "entropy": 1.86794516146183, + "epoch": 0.02479930003975638, + "grad_norm": 11.671442985534668, + "learning_rate": 3.874358229196939e-06, + "loss": 0.5971, + "mean_token_accuracy": 0.8125581681728363, + "num_tokens": 9617101.0, + "step": 8000 + }, + { + "entropy": 1.977939623594284, + "epoch": 0.02483029916480607, + "grad_norm": 14.295232772827148, + "learning_rate": 3.879201782427589e-06, + "loss": 0.7147, + "mean_token_accuracy": 0.7930570155382156, + "num_tokens": 9628082.0, + "step": 8010 + }, + { + "entropy": 1.9121051743626594, + "epoch": 0.02486129828985577, + "grad_norm": 13.757525444030762, + "learning_rate": 3.884045335658239e-06, + "loss": 0.6761, + "mean_token_accuracy": 0.8087022632360459, + "num_tokens": 9639932.0, + "step": 8020 + }, + { + "entropy": 1.841791082918644, + "epoch": 0.024892297414905465, + "grad_norm": 6.790825366973877, + "learning_rate": 3.88888888888889e-06, + "loss": 0.5533, + "mean_token_accuracy": 0.8306658193469048, + "num_tokens": 9652182.0, + "step": 8030 + }, + { + "entropy": 1.937432810664177, + "epoch": 0.02492329653995516, + "grad_norm": 13.771592140197754, + "learning_rate": 3.8937324421195395e-06, + "loss": 0.635, + "mean_token_accuracy": 0.8062109544873237, + "num_tokens": 9664539.0, + "step": 8040 + }, + { + "entropy": 1.8707983300089837, + "epoch": 0.024954295665004855, + "grad_norm": 12.660676956176758, + "learning_rate": 3.898575995350189e-06, + "loss": 0.6223, + "mean_token_accuracy": 0.8023678585886955, + "num_tokens": 9676330.0, + "step": 8050 + }, + { + "entropy": 1.9288925692439078, + "epoch": 0.02498529479005455, + "grad_norm": 14.32672119140625, + "learning_rate": 3.903419548580839e-06, + "loss": 0.65, + "mean_token_accuracy": 0.8022472143173218, + "num_tokens": 9687521.0, + "step": 8060 + }, + { + "entropy": 1.84913961738348, + "epoch": 0.025016293915104248, + "grad_norm": 6.136168003082275, + "learning_rate": 3.908263101811489e-06, + "loss": 0.5446, + "mean_token_accuracy": 0.8220839589834213, + "num_tokens": 9699208.0, + "step": 8070 + }, + { + "entropy": 1.8712972477078438, + "epoch": 0.02504729304015394, + "grad_norm": 11.905717849731445, + "learning_rate": 3.913106655042139e-06, + "loss": 0.6119, + "mean_token_accuracy": 0.8191757917404174, + "num_tokens": 9711146.0, + "step": 8080 + }, + { + "entropy": 1.9155054926872253, + "epoch": 0.025078292165203638, + "grad_norm": 12.616907119750977, + "learning_rate": 3.91795020827279e-06, + "loss": 0.689, + "mean_token_accuracy": 0.8038010418415069, + "num_tokens": 9722222.0, + "step": 8090 + }, + { + "entropy": 1.9103090360760688, + "epoch": 0.025109291290253334, + "grad_norm": 12.909311294555664, + "learning_rate": 3.92279376150344e-06, + "loss": 0.6034, + "mean_token_accuracy": 0.812662410736084, + "num_tokens": 9733890.0, + "step": 8100 + }, + { + "entropy": 1.8312827795743942, + "epoch": 0.025140290415303027, + "grad_norm": 14.832688331604004, + "learning_rate": 3.927637314734089e-06, + "loss": 0.5364, + "mean_token_accuracy": 0.8249428883194924, + "num_tokens": 9746809.0, + "step": 8110 + }, + { + "entropy": 1.8507552802562715, + "epoch": 0.025171289540352724, + "grad_norm": 13.549945831298828, + "learning_rate": 3.932480867964739e-06, + "loss": 0.5865, + "mean_token_accuracy": 0.8166124686598778, + "num_tokens": 9759355.0, + "step": 8120 + }, + { + "entropy": 1.8369555801153183, + "epoch": 0.02520228866540242, + "grad_norm": 11.552787780761719, + "learning_rate": 3.937324421195389e-06, + "loss": 0.5874, + "mean_token_accuracy": 0.825542688369751, + "num_tokens": 9771999.0, + "step": 8130 + }, + { + "entropy": 1.8414116382598877, + "epoch": 0.025233287790452114, + "grad_norm": 12.822636604309082, + "learning_rate": 3.942167974426039e-06, + "loss": 0.5757, + "mean_token_accuracy": 0.8180017948150635, + "num_tokens": 9784135.0, + "step": 8140 + }, + { + "entropy": 1.847972247004509, + "epoch": 0.02526428691550181, + "grad_norm": 6.315905570983887, + "learning_rate": 3.94701152765669e-06, + "loss": 0.6042, + "mean_token_accuracy": 0.8069576799869538, + "num_tokens": 9795872.0, + "step": 8150 + }, + { + "entropy": 1.8612953543663024, + "epoch": 0.025295286040551507, + "grad_norm": 7.054393768310547, + "learning_rate": 3.951855080887339e-06, + "loss": 0.6359, + "mean_token_accuracy": 0.808410094678402, + "num_tokens": 9807148.0, + "step": 8160 + }, + { + "entropy": 1.745456214249134, + "epoch": 0.0253262851656012, + "grad_norm": 10.395822525024414, + "learning_rate": 3.956698634117989e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.8252757102251053, + "num_tokens": 9820692.0, + "step": 8170 + }, + { + "entropy": 1.933900985121727, + "epoch": 0.025357284290650896, + "grad_norm": 15.611356735229492, + "learning_rate": 3.9615421873486395e-06, + "loss": 0.8023, + "mean_token_accuracy": 0.7968411594629288, + "num_tokens": 9832230.0, + "step": 8180 + }, + { + "entropy": 1.8410508632659912, + "epoch": 0.025388283415700593, + "grad_norm": 15.906883239746094, + "learning_rate": 3.966385740579289e-06, + "loss": 0.5578, + "mean_token_accuracy": 0.8164623320102692, + "num_tokens": 9844672.0, + "step": 8190 + }, + { + "entropy": 1.9910759955644608, + "epoch": 0.025419282540750286, + "grad_norm": 13.914506912231445, + "learning_rate": 3.971229293809939e-06, + "loss": 0.7414, + "mean_token_accuracy": 0.7940586417913437, + "num_tokens": 9856014.0, + "step": 8200 + }, + { + "entropy": 1.9779651939868927, + "epoch": 0.025450281665799983, + "grad_norm": 12.388845443725586, + "learning_rate": 3.976072847040589e-06, + "loss": 0.6406, + "mean_token_accuracy": 0.810752010345459, + "num_tokens": 9867585.0, + "step": 8210 + }, + { + "entropy": 1.950091141462326, + "epoch": 0.02548128079084968, + "grad_norm": 15.438441276550293, + "learning_rate": 3.980916400271239e-06, + "loss": 0.6076, + "mean_token_accuracy": 0.8093546569347382, + "num_tokens": 9879228.0, + "step": 8220 + }, + { + "entropy": 1.8706778109073638, + "epoch": 0.025512279915899373, + "grad_norm": 10.18140983581543, + "learning_rate": 3.985759953501889e-06, + "loss": 0.5486, + "mean_token_accuracy": 0.8231084540486335, + "num_tokens": 9892299.0, + "step": 8230 + }, + { + "entropy": 1.9114290565252303, + "epoch": 0.02554327904094907, + "grad_norm": 11.814135551452637, + "learning_rate": 3.99060350673254e-06, + "loss": 0.6377, + "mean_token_accuracy": 0.8094885662198067, + "num_tokens": 9905027.0, + "step": 8240 + }, + { + "entropy": 1.8559225603938103, + "epoch": 0.025574278165998766, + "grad_norm": 16.11547088623047, + "learning_rate": 3.9954470599631896e-06, + "loss": 0.6124, + "mean_token_accuracy": 0.8048399239778519, + "num_tokens": 9917498.0, + "step": 8250 + }, + { + "entropy": 1.9120782747864724, + "epoch": 0.02560527729104846, + "grad_norm": 6.817448616027832, + "learning_rate": 4.0002906131938395e-06, + "loss": 0.6091, + "mean_token_accuracy": 0.8105715274810791, + "num_tokens": 9929963.0, + "step": 8260 + }, + { + "entropy": 1.8872886016964912, + "epoch": 0.025636276416098155, + "grad_norm": 13.45474624633789, + "learning_rate": 4.005134166424489e-06, + "loss": 0.5923, + "mean_token_accuracy": 0.8145119935274124, + "num_tokens": 9942393.0, + "step": 8270 + }, + { + "entropy": 1.9791032537817954, + "epoch": 0.025667275541147852, + "grad_norm": 12.72803020477295, + "learning_rate": 4.009977719655139e-06, + "loss": 0.6922, + "mean_token_accuracy": 0.7994261741638183, + "num_tokens": 9953810.0, + "step": 8280 + }, + { + "entropy": 1.9359009936451912, + "epoch": 0.025698274666197545, + "grad_norm": 11.705611228942871, + "learning_rate": 4.014821272885789e-06, + "loss": 0.6192, + "mean_token_accuracy": 0.8066216111183167, + "num_tokens": 9965045.0, + "step": 8290 + }, + { + "entropy": 1.8848050013184547, + "epoch": 0.025729273791247242, + "grad_norm": 14.14693832397461, + "learning_rate": 4.01966482611644e-06, + "loss": 0.6122, + "mean_token_accuracy": 0.8066284090280533, + "num_tokens": 9977622.0, + "step": 8300 + }, + { + "entropy": 1.9196738287806512, + "epoch": 0.02576027291629694, + "grad_norm": 13.333892822265625, + "learning_rate": 4.02450837934709e-06, + "loss": 0.6378, + "mean_token_accuracy": 0.8114329054951668, + "num_tokens": 9989396.0, + "step": 8310 + }, + { + "entropy": 1.9220668867230415, + "epoch": 0.02579127204134663, + "grad_norm": 8.307788848876953, + "learning_rate": 4.029351932577739e-06, + "loss": 0.676, + "mean_token_accuracy": 0.8107616409659386, + "num_tokens": 10001039.0, + "step": 8320 + }, + { + "entropy": 1.924265030026436, + "epoch": 0.025822271166396328, + "grad_norm": 13.42192554473877, + "learning_rate": 4.0341954858083895e-06, + "loss": 0.6432, + "mean_token_accuracy": 0.8023953422904014, + "num_tokens": 10012862.0, + "step": 8330 + }, + { + "entropy": 1.8803733557462692, + "epoch": 0.025853270291446025, + "grad_norm": 14.513736724853516, + "learning_rate": 4.039039039039039e-06, + "loss": 0.614, + "mean_token_accuracy": 0.8223489403724671, + "num_tokens": 10024615.0, + "step": 8340 + }, + { + "entropy": 1.8715410023927688, + "epoch": 0.025884269416495718, + "grad_norm": 13.777809143066406, + "learning_rate": 4.043882592269689e-06, + "loss": 0.5892, + "mean_token_accuracy": 0.8061651542782784, + "num_tokens": 10036943.0, + "step": 8350 + }, + { + "entropy": 1.8626436039805412, + "epoch": 0.025915268541545414, + "grad_norm": 5.88369607925415, + "learning_rate": 4.04872614550034e-06, + "loss": 0.5776, + "mean_token_accuracy": 0.816595071554184, + "num_tokens": 10049582.0, + "step": 8360 + }, + { + "entropy": 1.893427351117134, + "epoch": 0.02594626766659511, + "grad_norm": 5.693576335906982, + "learning_rate": 4.053569698730989e-06, + "loss": 0.5993, + "mean_token_accuracy": 0.8074675589799881, + "num_tokens": 10061474.0, + "step": 8370 + }, + { + "entropy": 1.9071956276893616, + "epoch": 0.025977266791644804, + "grad_norm": 6.2122063636779785, + "learning_rate": 4.058413251961639e-06, + "loss": 0.5746, + "mean_token_accuracy": 0.8195055171847343, + "num_tokens": 10073377.0, + "step": 8380 + }, + { + "entropy": 1.9347770288586617, + "epoch": 0.0260082659166945, + "grad_norm": 6.3852434158325195, + "learning_rate": 4.06325680519229e-06, + "loss": 0.6468, + "mean_token_accuracy": 0.7985034555196762, + "num_tokens": 10085414.0, + "step": 8390 + }, + { + "entropy": 1.879788914322853, + "epoch": 0.026039265041744197, + "grad_norm": 5.960961818695068, + "learning_rate": 4.0681003584229395e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8187567561864852, + "num_tokens": 10098381.0, + "step": 8400 + }, + { + "entropy": 1.915135058760643, + "epoch": 0.026070264166793894, + "grad_norm": 5.453474521636963, + "learning_rate": 4.072943911653589e-06, + "loss": 0.5682, + "mean_token_accuracy": 0.8210947424173355, + "num_tokens": 10110037.0, + "step": 8410 + }, + { + "entropy": 1.9362727150321006, + "epoch": 0.026101263291843587, + "grad_norm": 14.594948768615723, + "learning_rate": 4.077787464884239e-06, + "loss": 0.6608, + "mean_token_accuracy": 0.7989747643470764, + "num_tokens": 10121901.0, + "step": 8420 + }, + { + "entropy": 1.8305003002285958, + "epoch": 0.026132262416893284, + "grad_norm": 14.329607963562012, + "learning_rate": 4.082631018114889e-06, + "loss": 0.5911, + "mean_token_accuracy": 0.8075525343418122, + "num_tokens": 10134905.0, + "step": 8430 + }, + { + "entropy": 1.8578375533223153, + "epoch": 0.02616326154194298, + "grad_norm": 13.74600887298584, + "learning_rate": 4.087474571345539e-06, + "loss": 0.5891, + "mean_token_accuracy": 0.8238616168498993, + "num_tokens": 10146939.0, + "step": 8440 + }, + { + "entropy": 1.9704759448766709, + "epoch": 0.026194260666992673, + "grad_norm": 12.829072952270508, + "learning_rate": 4.09231812457619e-06, + "loss": 0.6982, + "mean_token_accuracy": 0.7840036064386368, + "num_tokens": 10158552.0, + "step": 8450 + }, + { + "entropy": 1.8708418890833856, + "epoch": 0.02622525979204237, + "grad_norm": 12.569872856140137, + "learning_rate": 4.09716167780684e-06, + "loss": 0.5778, + "mean_token_accuracy": 0.8157704427838326, + "num_tokens": 10170966.0, + "step": 8460 + }, + { + "entropy": 1.9303714036941528, + "epoch": 0.026256258917092067, + "grad_norm": 14.426962852478027, + "learning_rate": 4.1020052310374896e-06, + "loss": 0.6081, + "mean_token_accuracy": 0.8103082060813904, + "num_tokens": 10183633.0, + "step": 8470 + }, + { + "entropy": 1.862780438363552, + "epoch": 0.02628725804214176, + "grad_norm": 5.966731548309326, + "learning_rate": 4.1068487842681394e-06, + "loss": 0.593, + "mean_token_accuracy": 0.804168826341629, + "num_tokens": 10195713.0, + "step": 8480 + }, + { + "entropy": 1.9560957163572312, + "epoch": 0.026318257167191456, + "grad_norm": 13.00171947479248, + "learning_rate": 4.111692337498789e-06, + "loss": 0.6648, + "mean_token_accuracy": 0.799228847026825, + "num_tokens": 10207536.0, + "step": 8490 + }, + { + "entropy": 1.889628753066063, + "epoch": 0.026349256292241153, + "grad_norm": 6.381255626678467, + "learning_rate": 4.116535890729439e-06, + "loss": 0.5822, + "mean_token_accuracy": 0.8094806551933289, + "num_tokens": 10220149.0, + "step": 8500 + }, + { + "entropy": 1.9717933177947997, + "epoch": 0.026380255417290846, + "grad_norm": 12.86796760559082, + "learning_rate": 4.12137944396009e-06, + "loss": 0.6727, + "mean_token_accuracy": 0.8050152316689492, + "num_tokens": 10231949.0, + "step": 8510 + }, + { + "entropy": 1.934307949244976, + "epoch": 0.026411254542340543, + "grad_norm": 11.483423233032227, + "learning_rate": 4.12622299719074e-06, + "loss": 0.5974, + "mean_token_accuracy": 0.8233723506331444, + "num_tokens": 10243617.0, + "step": 8520 + }, + { + "entropy": 1.9711205691099167, + "epoch": 0.02644225366739024, + "grad_norm": 11.086894035339355, + "learning_rate": 4.131066550421389e-06, + "loss": 0.67, + "mean_token_accuracy": 0.7918197363615036, + "num_tokens": 10254929.0, + "step": 8530 + }, + { + "entropy": 1.8315666139125824, + "epoch": 0.026473252792439932, + "grad_norm": 12.072243690490723, + "learning_rate": 4.13591010365204e-06, + "loss": 0.5831, + "mean_token_accuracy": 0.8164573073387146, + "num_tokens": 10268348.0, + "step": 8540 + }, + { + "entropy": 1.8700304999947548, + "epoch": 0.02650425191748963, + "grad_norm": 7.212819576263428, + "learning_rate": 4.1407536568826895e-06, + "loss": 0.5612, + "mean_token_accuracy": 0.8133175373077393, + "num_tokens": 10281420.0, + "step": 8550 + }, + { + "entropy": 1.8715410739183427, + "epoch": 0.026535251042539326, + "grad_norm": 15.102907180786133, + "learning_rate": 4.145597210113339e-06, + "loss": 0.5518, + "mean_token_accuracy": 0.8137237802147865, + "num_tokens": 10293312.0, + "step": 8560 + }, + { + "entropy": 1.8536837711930274, + "epoch": 0.02656625016758902, + "grad_norm": 18.23057746887207, + "learning_rate": 4.15044076334399e-06, + "loss": 0.5679, + "mean_token_accuracy": 0.819478040933609, + "num_tokens": 10305555.0, + "step": 8570 + }, + { + "entropy": 1.8068823255598545, + "epoch": 0.026597249292638715, + "grad_norm": 6.016721725463867, + "learning_rate": 4.155284316574639e-06, + "loss": 0.5376, + "mean_token_accuracy": 0.8214111477136612, + "num_tokens": 10318500.0, + "step": 8580 + }, + { + "entropy": 1.927972738444805, + "epoch": 0.026628248417688412, + "grad_norm": 13.958089828491211, + "learning_rate": 4.160127869805289e-06, + "loss": 0.6243, + "mean_token_accuracy": 0.8016948789358139, + "num_tokens": 10330081.0, + "step": 8590 + }, + { + "entropy": 1.8768570333719254, + "epoch": 0.026659247542738105, + "grad_norm": 11.411067962646484, + "learning_rate": 4.16497142303594e-06, + "loss": 0.5993, + "mean_token_accuracy": 0.8204882755875588, + "num_tokens": 10341630.0, + "step": 8600 + }, + { + "entropy": 1.9243871226906777, + "epoch": 0.0266902466677878, + "grad_norm": 12.71778678894043, + "learning_rate": 4.16981497626659e-06, + "loss": 0.6498, + "mean_token_accuracy": 0.8025202408432961, + "num_tokens": 10353521.0, + "step": 8610 + }, + { + "entropy": 1.8957347482442857, + "epoch": 0.026721245792837498, + "grad_norm": 13.246805191040039, + "learning_rate": 4.1746585294972395e-06, + "loss": 0.5698, + "mean_token_accuracy": 0.8230790719389915, + "num_tokens": 10365655.0, + "step": 8620 + }, + { + "entropy": 1.9469288036227226, + "epoch": 0.02675224491788719, + "grad_norm": 13.388124465942383, + "learning_rate": 4.179502082727889e-06, + "loss": 0.6473, + "mean_token_accuracy": 0.8096310868859291, + "num_tokens": 10377607.0, + "step": 8630 + }, + { + "entropy": 1.8346548154950142, + "epoch": 0.026783244042936888, + "grad_norm": 13.726068496704102, + "learning_rate": 4.184345635958539e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8302373275160789, + "num_tokens": 10390062.0, + "step": 8640 + }, + { + "entropy": 1.892690037190914, + "epoch": 0.026814243167986584, + "grad_norm": 15.530143737792969, + "learning_rate": 4.189189189189189e-06, + "loss": 0.6238, + "mean_token_accuracy": 0.8069882616400719, + "num_tokens": 10402361.0, + "step": 8650 + }, + { + "entropy": 1.8609767884016037, + "epoch": 0.026845242293036278, + "grad_norm": 6.528801918029785, + "learning_rate": 4.19403274241984e-06, + "loss": 0.5531, + "mean_token_accuracy": 0.8153262332081794, + "num_tokens": 10415461.0, + "step": 8660 + }, + { + "entropy": 1.869553703069687, + "epoch": 0.026876241418085974, + "grad_norm": 12.446064949035645, + "learning_rate": 4.19887629565049e-06, + "loss": 0.5657, + "mean_token_accuracy": 0.8269910797476768, + "num_tokens": 10427235.0, + "step": 8670 + }, + { + "entropy": 1.8075580313801765, + "epoch": 0.02690724054313567, + "grad_norm": 5.994472980499268, + "learning_rate": 4.20371984888114e-06, + "loss": 0.5459, + "mean_token_accuracy": 0.8210108175873756, + "num_tokens": 10440500.0, + "step": 8680 + }, + { + "entropy": 1.857536444067955, + "epoch": 0.026938239668185364, + "grad_norm": 7.034928321838379, + "learning_rate": 4.2085634021117895e-06, + "loss": 0.6801, + "mean_token_accuracy": 0.8013882115483284, + "num_tokens": 10452545.0, + "step": 8690 + }, + { + "entropy": 1.8659397497773171, + "epoch": 0.02696923879323506, + "grad_norm": 12.33353042602539, + "learning_rate": 4.2134069553424394e-06, + "loss": 0.5794, + "mean_token_accuracy": 0.8184954881668091, + "num_tokens": 10464085.0, + "step": 8700 + }, + { + "entropy": 1.9467614516615868, + "epoch": 0.027000237918284757, + "grad_norm": 14.808907508850098, + "learning_rate": 4.218250508573089e-06, + "loss": 0.6854, + "mean_token_accuracy": 0.8033217743039132, + "num_tokens": 10475471.0, + "step": 8710 + }, + { + "entropy": 1.9029730170965196, + "epoch": 0.02703123704333445, + "grad_norm": 6.386728763580322, + "learning_rate": 4.22309406180374e-06, + "loss": 0.5904, + "mean_token_accuracy": 0.8169241547584534, + "num_tokens": 10487494.0, + "step": 8720 + }, + { + "entropy": 1.8430425137281419, + "epoch": 0.027062236168384147, + "grad_norm": 5.942121982574463, + "learning_rate": 4.22793761503439e-06, + "loss": 0.5563, + "mean_token_accuracy": 0.8265012905001641, + "num_tokens": 10499362.0, + "step": 8730 + }, + { + "entropy": 1.8841292724013328, + "epoch": 0.027093235293433843, + "grad_norm": 5.7246198654174805, + "learning_rate": 4.232781168265039e-06, + "loss": 0.6059, + "mean_token_accuracy": 0.8051643744111061, + "num_tokens": 10510802.0, + "step": 8740 + }, + { + "entropy": 1.8404579356312751, + "epoch": 0.02712423441848354, + "grad_norm": 14.219062805175781, + "learning_rate": 4.23762472149569e-06, + "loss": 0.5386, + "mean_token_accuracy": 0.8213804766535759, + "num_tokens": 10523317.0, + "step": 8750 + }, + { + "entropy": 1.8721755370497704, + "epoch": 0.027155233543533233, + "grad_norm": 16.284940719604492, + "learning_rate": 4.24246827472634e-06, + "loss": 0.6095, + "mean_token_accuracy": 0.807612107694149, + "num_tokens": 10534703.0, + "step": 8760 + }, + { + "entropy": 1.8348284885287285, + "epoch": 0.02718623266858293, + "grad_norm": 11.95847225189209, + "learning_rate": 4.2473118279569895e-06, + "loss": 0.5816, + "mean_token_accuracy": 0.8146825641393661, + "num_tokens": 10547080.0, + "step": 8770 + }, + { + "entropy": 1.8284624338150024, + "epoch": 0.027217231793632626, + "grad_norm": 14.658815383911133, + "learning_rate": 4.25215538118764e-06, + "loss": 0.6493, + "mean_token_accuracy": 0.8004456847906113, + "num_tokens": 10559321.0, + "step": 8780 + }, + { + "entropy": 1.7974061518907547, + "epoch": 0.02724823091868232, + "grad_norm": 8.202540397644043, + "learning_rate": 4.256998934418289e-06, + "loss": 0.6258, + "mean_token_accuracy": 0.8043118000030518, + "num_tokens": 10572053.0, + "step": 8790 + }, + { + "entropy": 1.8207244381308556, + "epoch": 0.027279230043732016, + "grad_norm": 14.289605140686035, + "learning_rate": 4.261842487648939e-06, + "loss": 0.5533, + "mean_token_accuracy": 0.8130275592207908, + "num_tokens": 10584178.0, + "step": 8800 + }, + { + "entropy": 1.7965356424450873, + "epoch": 0.027310229168781713, + "grad_norm": 5.546839714050293, + "learning_rate": 4.26668604087959e-06, + "loss": 0.5518, + "mean_token_accuracy": 0.8202337950468064, + "num_tokens": 10597231.0, + "step": 8810 + }, + { + "entropy": 1.849162083864212, + "epoch": 0.027341228293831406, + "grad_norm": 13.331665992736816, + "learning_rate": 4.27152959411024e-06, + "loss": 0.5852, + "mean_token_accuracy": 0.8217923492193222, + "num_tokens": 10609555.0, + "step": 8820 + }, + { + "entropy": 1.8186723545193673, + "epoch": 0.027372227418881102, + "grad_norm": 13.591072082519531, + "learning_rate": 4.27637314734089e-06, + "loss": 0.6013, + "mean_token_accuracy": 0.8041837394237519, + "num_tokens": 10621885.0, + "step": 8830 + }, + { + "entropy": 1.8741968870162964, + "epoch": 0.0274032265439308, + "grad_norm": 12.443244934082031, + "learning_rate": 4.2812167005715395e-06, + "loss": 0.598, + "mean_token_accuracy": 0.807262459397316, + "num_tokens": 10633736.0, + "step": 8840 + }, + { + "entropy": 1.9037314355373383, + "epoch": 0.027434225668980492, + "grad_norm": 13.946099281311035, + "learning_rate": 4.286060253802189e-06, + "loss": 0.6851, + "mean_token_accuracy": 0.808636249601841, + "num_tokens": 10645193.0, + "step": 8850 + }, + { + "entropy": 1.8897378742694855, + "epoch": 0.02746522479403019, + "grad_norm": 13.437854766845703, + "learning_rate": 4.290903807032839e-06, + "loss": 0.6297, + "mean_token_accuracy": 0.8217552006244659, + "num_tokens": 10656076.0, + "step": 8860 + }, + { + "entropy": 1.9088585555553437, + "epoch": 0.027496223919079885, + "grad_norm": 15.542739868164062, + "learning_rate": 4.29574736026349e-06, + "loss": 0.6387, + "mean_token_accuracy": 0.8007620498538017, + "num_tokens": 10666538.0, + "step": 8870 + }, + { + "entropy": 1.8244119063019753, + "epoch": 0.02752722304412958, + "grad_norm": 13.687409400939941, + "learning_rate": 4.30059091349414e-06, + "loss": 0.5834, + "mean_token_accuracy": 0.8106234610080719, + "num_tokens": 10679216.0, + "step": 8880 + }, + { + "entropy": 1.8295306876301765, + "epoch": 0.027558222169179275, + "grad_norm": 13.513102531433105, + "learning_rate": 4.30543446672479e-06, + "loss": 0.5764, + "mean_token_accuracy": 0.8163102254271507, + "num_tokens": 10690908.0, + "step": 8890 + }, + { + "entropy": 1.8037668392062187, + "epoch": 0.02758922129422897, + "grad_norm": 10.39285659790039, + "learning_rate": 4.31027801995544e-06, + "loss": 0.5876, + "mean_token_accuracy": 0.8148684576153755, + "num_tokens": 10702625.0, + "step": 8900 + }, + { + "entropy": 1.826628988981247, + "epoch": 0.027620220419278665, + "grad_norm": 10.972448348999023, + "learning_rate": 4.3151215731860895e-06, + "loss": 0.6049, + "mean_token_accuracy": 0.8112099289894104, + "num_tokens": 10715241.0, + "step": 8910 + }, + { + "entropy": 1.9132762208580971, + "epoch": 0.02765121954432836, + "grad_norm": 11.588542938232422, + "learning_rate": 4.319965126416739e-06, + "loss": 0.6643, + "mean_token_accuracy": 0.800639072060585, + "num_tokens": 10726279.0, + "step": 8920 + }, + { + "entropy": 1.8298836424946785, + "epoch": 0.027682218669378058, + "grad_norm": 11.887832641601562, + "learning_rate": 4.32480867964739e-06, + "loss": 0.5828, + "mean_token_accuracy": 0.81552804261446, + "num_tokens": 10738806.0, + "step": 8930 + }, + { + "entropy": 1.8580557018518449, + "epoch": 0.02771321779442775, + "grad_norm": 10.222613334655762, + "learning_rate": 4.32965223287804e-06, + "loss": 0.6352, + "mean_token_accuracy": 0.8103110626339912, + "num_tokens": 10750764.0, + "step": 8940 + }, + { + "entropy": 1.7699718743562698, + "epoch": 0.027744216919477448, + "grad_norm": 5.6745147705078125, + "learning_rate": 4.334495786108689e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8255440399050713, + "num_tokens": 10764128.0, + "step": 8950 + }, + { + "entropy": 1.8009968280792237, + "epoch": 0.027775216044527144, + "grad_norm": 11.968152046203613, + "learning_rate": 4.33933933933934e-06, + "loss": 0.5227, + "mean_token_accuracy": 0.8253274574875832, + "num_tokens": 10776273.0, + "step": 8960 + }, + { + "entropy": 1.8581290900707246, + "epoch": 0.027806215169576837, + "grad_norm": 11.328075408935547, + "learning_rate": 4.34418289256999e-06, + "loss": 0.6036, + "mean_token_accuracy": 0.7987002015113831, + "num_tokens": 10787895.0, + "step": 8970 + }, + { + "entropy": 1.835921722650528, + "epoch": 0.027837214294626534, + "grad_norm": 15.295876502990723, + "learning_rate": 4.3490264458006396e-06, + "loss": 0.5987, + "mean_token_accuracy": 0.8129387706518173, + "num_tokens": 10800249.0, + "step": 8980 + }, + { + "entropy": 1.8953902840614318, + "epoch": 0.02786821341967623, + "grad_norm": 11.311132431030273, + "learning_rate": 4.35386999903129e-06, + "loss": 0.6602, + "mean_token_accuracy": 0.7996082842350006, + "num_tokens": 10811728.0, + "step": 8990 + }, + { + "entropy": 1.869747567176819, + "epoch": 0.027899212544725924, + "grad_norm": 13.892035484313965, + "learning_rate": 4.358713552261939e-06, + "loss": 0.6344, + "mean_token_accuracy": 0.8065097257494926, + "num_tokens": 10822949.0, + "step": 9000 + }, + { + "entropy": 1.8128714829683303, + "epoch": 0.02793021166977562, + "grad_norm": 10.744576454162598, + "learning_rate": 4.363557105492589e-06, + "loss": 0.5757, + "mean_token_accuracy": 0.8185910269618034, + "num_tokens": 10835661.0, + "step": 9010 + }, + { + "entropy": 1.8096760362386703, + "epoch": 0.027961210794825317, + "grad_norm": 5.07841682434082, + "learning_rate": 4.36840065872324e-06, + "loss": 0.5479, + "mean_token_accuracy": 0.8252274811267852, + "num_tokens": 10848267.0, + "step": 9020 + }, + { + "entropy": 1.9240417778491974, + "epoch": 0.02799220991987501, + "grad_norm": 13.373735427856445, + "learning_rate": 4.37324421195389e-06, + "loss": 0.6678, + "mean_token_accuracy": 0.7926280677318573, + "num_tokens": 10859705.0, + "step": 9030 + }, + { + "entropy": 1.9146719083189965, + "epoch": 0.028023209044924707, + "grad_norm": 11.327962875366211, + "learning_rate": 4.37808776518454e-06, + "loss": 0.6178, + "mean_token_accuracy": 0.826309834420681, + "num_tokens": 10871134.0, + "step": 9040 + }, + { + "entropy": 1.8291181206703186, + "epoch": 0.028054208169974403, + "grad_norm": 14.080740928649902, + "learning_rate": 4.38293131841519e-06, + "loss": 0.5395, + "mean_token_accuracy": 0.8222734659910202, + "num_tokens": 10883664.0, + "step": 9050 + }, + { + "entropy": 1.931309811770916, + "epoch": 0.028085207295024096, + "grad_norm": 14.17933177947998, + "learning_rate": 4.3877748716458395e-06, + "loss": 0.671, + "mean_token_accuracy": 0.7961855351924896, + "num_tokens": 10894837.0, + "step": 9060 + }, + { + "entropy": 1.9364981949329376, + "epoch": 0.028116206420073793, + "grad_norm": 16.13866424560547, + "learning_rate": 4.392618424876489e-06, + "loss": 0.6832, + "mean_token_accuracy": 0.8080704003572464, + "num_tokens": 10905463.0, + "step": 9070 + }, + { + "entropy": 1.8846597149968147, + "epoch": 0.02814720554512349, + "grad_norm": 12.987224578857422, + "learning_rate": 4.39746197810714e-06, + "loss": 0.6404, + "mean_token_accuracy": 0.8108007118105889, + "num_tokens": 10917119.0, + "step": 9080 + }, + { + "entropy": 1.9063777923583984, + "epoch": 0.028178204670173183, + "grad_norm": 14.922365188598633, + "learning_rate": 4.40230553133779e-06, + "loss": 0.6725, + "mean_token_accuracy": 0.8062027513980865, + "num_tokens": 10927669.0, + "step": 9090 + }, + { + "entropy": 1.8114924758672715, + "epoch": 0.02820920379522288, + "grad_norm": 10.971781730651855, + "learning_rate": 4.40714908456844e-06, + "loss": 0.5537, + "mean_token_accuracy": 0.8184123501181603, + "num_tokens": 10939764.0, + "step": 9100 + }, + { + "entropy": 1.9051185458898545, + "epoch": 0.028240202920272576, + "grad_norm": 5.854284763336182, + "learning_rate": 4.41199263779909e-06, + "loss": 0.6057, + "mean_token_accuracy": 0.8078436881303788, + "num_tokens": 10951263.0, + "step": 9110 + }, + { + "entropy": 2.0060752868652343, + "epoch": 0.028271202045322272, + "grad_norm": 13.479512214660645, + "learning_rate": 4.41683619102974e-06, + "loss": 0.7195, + "mean_token_accuracy": 0.7970252841711044, + "num_tokens": 10962301.0, + "step": 9120 + }, + { + "entropy": 1.9399765685200692, + "epoch": 0.028302201170371966, + "grad_norm": 13.337047576904297, + "learning_rate": 4.4216797442603895e-06, + "loss": 0.6101, + "mean_token_accuracy": 0.8143787398934365, + "num_tokens": 10973469.0, + "step": 9130 + }, + { + "entropy": 1.8391277641057968, + "epoch": 0.028333200295421662, + "grad_norm": 6.049315452575684, + "learning_rate": 4.42652329749104e-06, + "loss": 0.5842, + "mean_token_accuracy": 0.8160114631056785, + "num_tokens": 10986193.0, + "step": 9140 + }, + { + "entropy": 1.8236407771706582, + "epoch": 0.02836419942047136, + "grad_norm": 12.77992057800293, + "learning_rate": 4.43136685072169e-06, + "loss": 0.6036, + "mean_token_accuracy": 0.8185855254530907, + "num_tokens": 10998743.0, + "step": 9150 + }, + { + "entropy": 1.9133103117346764, + "epoch": 0.028395198545521052, + "grad_norm": 11.950321197509766, + "learning_rate": 4.436210403952339e-06, + "loss": 0.6247, + "mean_token_accuracy": 0.8190466240048409, + "num_tokens": 11010167.0, + "step": 9160 + }, + { + "entropy": 1.8094365164637565, + "epoch": 0.02842619767057075, + "grad_norm": 15.926004409790039, + "learning_rate": 4.44105395718299e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8278137043118476, + "num_tokens": 11023270.0, + "step": 9170 + }, + { + "entropy": 1.9534984081983566, + "epoch": 0.028457196795620445, + "grad_norm": 13.150014877319336, + "learning_rate": 4.44589751041364e-06, + "loss": 0.6237, + "mean_token_accuracy": 0.8099380254745483, + "num_tokens": 11034084.0, + "step": 9180 + }, + { + "entropy": 1.9308686077594757, + "epoch": 0.028488195920670138, + "grad_norm": 13.831839561462402, + "learning_rate": 4.45074106364429e-06, + "loss": 0.6109, + "mean_token_accuracy": 0.8114223212003708, + "num_tokens": 11045463.0, + "step": 9190 + }, + { + "entropy": 1.922127565741539, + "epoch": 0.028519195045719835, + "grad_norm": 14.833643913269043, + "learning_rate": 4.45558461687494e-06, + "loss": 0.6848, + "mean_token_accuracy": 0.8016584351658821, + "num_tokens": 11057209.0, + "step": 9200 + }, + { + "entropy": 1.9555729806423188, + "epoch": 0.02855019417076953, + "grad_norm": 12.715826988220215, + "learning_rate": 4.4604281701055894e-06, + "loss": 0.6802, + "mean_token_accuracy": 0.805061075091362, + "num_tokens": 11068291.0, + "step": 9210 + }, + { + "entropy": 1.942775259912014, + "epoch": 0.028581193295819225, + "grad_norm": 7.836392879486084, + "learning_rate": 4.465271723336239e-06, + "loss": 0.6386, + "mean_token_accuracy": 0.8060253381729126, + "num_tokens": 11079075.0, + "step": 9220 + }, + { + "entropy": 1.9078367695212364, + "epoch": 0.02861219242086892, + "grad_norm": 13.057887077331543, + "learning_rate": 4.47011527656689e-06, + "loss": 0.6303, + "mean_token_accuracy": 0.806829422712326, + "num_tokens": 11090899.0, + "step": 9230 + }, + { + "entropy": 1.9537988051772117, + "epoch": 0.028643191545918618, + "grad_norm": 13.034477233886719, + "learning_rate": 4.47495882979754e-06, + "loss": 0.643, + "mean_token_accuracy": 0.8042517855763436, + "num_tokens": 11102296.0, + "step": 9240 + }, + { + "entropy": 1.8851880788803101, + "epoch": 0.02867419067096831, + "grad_norm": 15.656882286071777, + "learning_rate": 4.47980238302819e-06, + "loss": 0.5952, + "mean_token_accuracy": 0.8147235870361328, + "num_tokens": 11113748.0, + "step": 9250 + }, + { + "entropy": 1.770183938741684, + "epoch": 0.028705189796018007, + "grad_norm": 12.287094116210938, + "learning_rate": 4.48464593625884e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.8376102089881897, + "num_tokens": 11126535.0, + "step": 9260 + }, + { + "entropy": 1.8187786787748337, + "epoch": 0.028736188921067704, + "grad_norm": 12.566884994506836, + "learning_rate": 4.48948948948949e-06, + "loss": 0.5549, + "mean_token_accuracy": 0.8142774716019631, + "num_tokens": 11139226.0, + "step": 9270 + }, + { + "entropy": 1.9643137067556382, + "epoch": 0.028767188046117397, + "grad_norm": 13.199538230895996, + "learning_rate": 4.4943330427201395e-06, + "loss": 0.7005, + "mean_token_accuracy": 0.7958864018321037, + "num_tokens": 11150412.0, + "step": 9280 + }, + { + "entropy": 1.7707602977752686, + "epoch": 0.028798187171167094, + "grad_norm": 11.749283790588379, + "learning_rate": 4.49917659595079e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.826721802353859, + "num_tokens": 11164245.0, + "step": 9290 + }, + { + "entropy": 1.8696741789579392, + "epoch": 0.02882918629621679, + "grad_norm": 13.251580238342285, + "learning_rate": 4.50402014918144e-06, + "loss": 0.5609, + "mean_token_accuracy": 0.819263182580471, + "num_tokens": 11176738.0, + "step": 9300 + }, + { + "entropy": 1.9785585284233094, + "epoch": 0.028860185421266484, + "grad_norm": 14.014039039611816, + "learning_rate": 4.50886370241209e-06, + "loss": 0.629, + "mean_token_accuracy": 0.8204264283180237, + "num_tokens": 11187416.0, + "step": 9310 + }, + { + "entropy": 1.9258405417203903, + "epoch": 0.02889118454631618, + "grad_norm": 12.051921844482422, + "learning_rate": 4.51370725564274e-06, + "loss": 0.6911, + "mean_token_accuracy": 0.8011237904429436, + "num_tokens": 11198731.0, + "step": 9320 + }, + { + "entropy": 1.7640791162848473, + "epoch": 0.028922183671365877, + "grad_norm": 13.392481803894043, + "learning_rate": 4.51855080887339e-06, + "loss": 0.5373, + "mean_token_accuracy": 0.8237902507185936, + "num_tokens": 11212726.0, + "step": 9330 + }, + { + "entropy": 1.9153805747628212, + "epoch": 0.02895318279641557, + "grad_norm": 12.259576797485352, + "learning_rate": 4.52339436210404e-06, + "loss": 0.5935, + "mean_token_accuracy": 0.8111542999744416, + "num_tokens": 11225651.0, + "step": 9340 + }, + { + "entropy": 2.023898732662201, + "epoch": 0.028984181921465266, + "grad_norm": 15.207432746887207, + "learning_rate": 4.52823791533469e-06, + "loss": 0.7204, + "mean_token_accuracy": 0.7969193458557129, + "num_tokens": 11236751.0, + "step": 9350 + }, + { + "entropy": 1.9569285050034524, + "epoch": 0.029015181046514963, + "grad_norm": 15.066774368286133, + "learning_rate": 4.53308146856534e-06, + "loss": 0.6648, + "mean_token_accuracy": 0.8044527977705002, + "num_tokens": 11248128.0, + "step": 9360 + }, + { + "entropy": 1.8909387812018394, + "epoch": 0.029046180171564656, + "grad_norm": 12.793632507324219, + "learning_rate": 4.537925021795989e-06, + "loss": 0.6201, + "mean_token_accuracy": 0.8120290517807007, + "num_tokens": 11260048.0, + "step": 9370 + }, + { + "entropy": 1.9061389192938805, + "epoch": 0.029077179296614353, + "grad_norm": 10.562239646911621, + "learning_rate": 4.54276857502664e-06, + "loss": 0.6658, + "mean_token_accuracy": 0.7987107247114181, + "num_tokens": 11271960.0, + "step": 9380 + }, + { + "entropy": 1.9409636542201043, + "epoch": 0.02910817842166405, + "grad_norm": 12.370716094970703, + "learning_rate": 4.54761212825729e-06, + "loss": 0.641, + "mean_token_accuracy": 0.8062451392412185, + "num_tokens": 11283082.0, + "step": 9390 + }, + { + "entropy": 1.9235450312495233, + "epoch": 0.029139177546713742, + "grad_norm": 11.286612510681152, + "learning_rate": 4.55245568148794e-06, + "loss": 0.6299, + "mean_token_accuracy": 0.8160214677453042, + "num_tokens": 11294418.0, + "step": 9400 + }, + { + "entropy": 1.9037536427378654, + "epoch": 0.02917017667176344, + "grad_norm": 12.946687698364258, + "learning_rate": 4.5572992347185905e-06, + "loss": 0.6663, + "mean_token_accuracy": 0.7993274956941605, + "num_tokens": 11306264.0, + "step": 9410 + }, + { + "entropy": 1.9348094776272773, + "epoch": 0.029201175796813136, + "grad_norm": 11.020223617553711, + "learning_rate": 4.5621427879492395e-06, + "loss": 0.647, + "mean_token_accuracy": 0.8057587504386902, + "num_tokens": 11316953.0, + "step": 9420 + }, + { + "entropy": 1.8925198674201966, + "epoch": 0.02923217492186283, + "grad_norm": 12.39678955078125, + "learning_rate": 4.5669863411798894e-06, + "loss": 0.6654, + "mean_token_accuracy": 0.8044493556022644, + "num_tokens": 11328541.0, + "step": 9430 + }, + { + "entropy": 1.871733333170414, + "epoch": 0.029263174046912525, + "grad_norm": 4.86548376083374, + "learning_rate": 4.57182989441054e-06, + "loss": 0.6326, + "mean_token_accuracy": 0.806520962715149, + "num_tokens": 11341221.0, + "step": 9440 + }, + { + "entropy": 1.9219319432973863, + "epoch": 0.029294173171962222, + "grad_norm": 13.70630168914795, + "learning_rate": 4.57667344764119e-06, + "loss": 0.6243, + "mean_token_accuracy": 0.8117819979786873, + "num_tokens": 11352400.0, + "step": 9450 + }, + { + "entropy": 1.9954150676727296, + "epoch": 0.029325172297011915, + "grad_norm": 16.177236557006836, + "learning_rate": 4.58151700087184e-06, + "loss": 0.6861, + "mean_token_accuracy": 0.7988776013255119, + "num_tokens": 11363461.0, + "step": 9460 + }, + { + "entropy": 1.960495764017105, + "epoch": 0.02935617142206161, + "grad_norm": 13.248772621154785, + "learning_rate": 4.58636055410249e-06, + "loss": 0.6945, + "mean_token_accuracy": 0.799818865954876, + "num_tokens": 11375111.0, + "step": 9470 + }, + { + "entropy": 1.9463176384568215, + "epoch": 0.02938717054711131, + "grad_norm": 13.98116683959961, + "learning_rate": 4.59120410733314e-06, + "loss": 0.6175, + "mean_token_accuracy": 0.814936289191246, + "num_tokens": 11386133.0, + "step": 9480 + }, + { + "entropy": 1.8891835004091262, + "epoch": 0.029418169672161005, + "grad_norm": 3.9220001697540283, + "learning_rate": 4.59604766056379e-06, + "loss": 0.58, + "mean_token_accuracy": 0.8183272242546081, + "num_tokens": 11399042.0, + "step": 9490 + }, + { + "entropy": 1.9922261223196984, + "epoch": 0.029449168797210698, + "grad_norm": 10.860396385192871, + "learning_rate": 4.60089121379444e-06, + "loss": 0.6858, + "mean_token_accuracy": 0.8004097312688827, + "num_tokens": 11410000.0, + "step": 9500 + }, + { + "entropy": 1.9224519044160844, + "epoch": 0.029480167922260395, + "grad_norm": 11.63853645324707, + "learning_rate": 4.60573476702509e-06, + "loss": 0.5717, + "mean_token_accuracy": 0.816666342318058, + "num_tokens": 11422038.0, + "step": 9510 + }, + { + "entropy": 1.8657364815473556, + "epoch": 0.02951116704731009, + "grad_norm": 13.743812561035156, + "learning_rate": 4.61057832025574e-06, + "loss": 0.5471, + "mean_token_accuracy": 0.8122471138834954, + "num_tokens": 11435434.0, + "step": 9520 + }, + { + "entropy": 1.925044772028923, + "epoch": 0.029542166172359784, + "grad_norm": 7.6943230628967285, + "learning_rate": 4.61542187348639e-06, + "loss": 0.6255, + "mean_token_accuracy": 0.8038001671433449, + "num_tokens": 11446785.0, + "step": 9530 + }, + { + "entropy": 1.9045713931322097, + "epoch": 0.02957316529740948, + "grad_norm": 14.207799911499023, + "learning_rate": 4.62026542671704e-06, + "loss": 0.6089, + "mean_token_accuracy": 0.8201724767684937, + "num_tokens": 11458736.0, + "step": 9540 + }, + { + "entropy": 1.9084357440471649, + "epoch": 0.029604164422459178, + "grad_norm": 13.012181282043457, + "learning_rate": 4.62510897994769e-06, + "loss": 0.6914, + "mean_token_accuracy": 0.8042386144399643, + "num_tokens": 11470670.0, + "step": 9550 + }, + { + "entropy": 1.8864588722586633, + "epoch": 0.02963516354750887, + "grad_norm": 11.62660026550293, + "learning_rate": 4.6299525331783405e-06, + "loss": 0.6257, + "mean_token_accuracy": 0.8103240177035331, + "num_tokens": 11482128.0, + "step": 9560 + }, + { + "entropy": 1.8707260951399802, + "epoch": 0.029666162672558567, + "grad_norm": 12.582703590393066, + "learning_rate": 4.63479608640899e-06, + "loss": 0.5899, + "mean_token_accuracy": 0.8164230138063431, + "num_tokens": 11494873.0, + "step": 9570 + }, + { + "entropy": 1.927544179558754, + "epoch": 0.029697161797608264, + "grad_norm": 14.461875915527344, + "learning_rate": 4.63963963963964e-06, + "loss": 0.624, + "mean_token_accuracy": 0.8096611618995666, + "num_tokens": 11506364.0, + "step": 9580 + }, + { + "entropy": 1.8048232197761536, + "epoch": 0.029728160922657957, + "grad_norm": 7.559422016143799, + "learning_rate": 4.64448319287029e-06, + "loss": 0.5429, + "mean_token_accuracy": 0.8254976123571396, + "num_tokens": 11519319.0, + "step": 9590 + }, + { + "entropy": 1.868215946853161, + "epoch": 0.029759160047707654, + "grad_norm": 7.954585075378418, + "learning_rate": 4.64932674610094e-06, + "loss": 0.5627, + "mean_token_accuracy": 0.8161717414855957, + "num_tokens": 11531172.0, + "step": 9600 + }, + { + "entropy": 1.9226632490754128, + "epoch": 0.02979015917275735, + "grad_norm": 12.952104568481445, + "learning_rate": 4.65417029933159e-06, + "loss": 0.6432, + "mean_token_accuracy": 0.7985331505537033, + "num_tokens": 11542291.0, + "step": 9610 + }, + { + "entropy": 1.763420394062996, + "epoch": 0.029821158297807043, + "grad_norm": 4.180752277374268, + "learning_rate": 4.659013852562241e-06, + "loss": 0.5277, + "mean_token_accuracy": 0.8286846920847892, + "num_tokens": 11555662.0, + "step": 9620 + }, + { + "entropy": 1.9089765697717667, + "epoch": 0.02985215742285674, + "grad_norm": 5.77734899520874, + "learning_rate": 4.66385740579289e-06, + "loss": 0.6166, + "mean_token_accuracy": 0.8161043807864189, + "num_tokens": 11566884.0, + "step": 9630 + }, + { + "entropy": 1.7613512337207795, + "epoch": 0.029883156547906436, + "grad_norm": 11.983185768127441, + "learning_rate": 4.6687009590235395e-06, + "loss": 0.535, + "mean_token_accuracy": 0.8203961864113808, + "num_tokens": 11580772.0, + "step": 9640 + }, + { + "entropy": 1.8508950725197792, + "epoch": 0.02991415567295613, + "grad_norm": 4.04287576675415, + "learning_rate": 4.67354451225419e-06, + "loss": 0.5947, + "mean_token_accuracy": 0.8185210764408112, + "num_tokens": 11593156.0, + "step": 9650 + }, + { + "entropy": 1.9007043689489365, + "epoch": 0.029945154798005826, + "grad_norm": 12.041136741638184, + "learning_rate": 4.67838806548484e-06, + "loss": 0.687, + "mean_token_accuracy": 0.7983652919530868, + "num_tokens": 11604534.0, + "step": 9660 + }, + { + "entropy": 1.8662898167967796, + "epoch": 0.029976153923055523, + "grad_norm": 5.588154315948486, + "learning_rate": 4.68323161871549e-06, + "loss": 0.5698, + "mean_token_accuracy": 0.8151004850864411, + "num_tokens": 11616960.0, + "step": 9670 + }, + { + "entropy": 1.8811151057481765, + "epoch": 0.030007153048105216, + "grad_norm": 15.325428009033203, + "learning_rate": 4.68807517194614e-06, + "loss": 0.6616, + "mean_token_accuracy": 0.7986567124724389, + "num_tokens": 11628059.0, + "step": 9680 + }, + { + "entropy": 1.8047630965709687, + "epoch": 0.030038152173154913, + "grad_norm": 5.080140590667725, + "learning_rate": 4.69291872517679e-06, + "loss": 0.5694, + "mean_token_accuracy": 0.8145760789513588, + "num_tokens": 11640534.0, + "step": 9690 + }, + { + "entropy": 1.9280565515160561, + "epoch": 0.03006915129820461, + "grad_norm": 6.16276216506958, + "learning_rate": 4.69776227840744e-06, + "loss": 0.6692, + "mean_token_accuracy": 0.8054024249315261, + "num_tokens": 11651985.0, + "step": 9700 + }, + { + "entropy": 1.9135492697358132, + "epoch": 0.030100150423254302, + "grad_norm": 13.727761268615723, + "learning_rate": 4.70260583163809e-06, + "loss": 0.5566, + "mean_token_accuracy": 0.8213163688778877, + "num_tokens": 11664292.0, + "step": 9710 + }, + { + "entropy": 1.8022937417030334, + "epoch": 0.030131149548304, + "grad_norm": 11.88475513458252, + "learning_rate": 4.70744938486874e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.8253460243344307, + "num_tokens": 11677213.0, + "step": 9720 + }, + { + "entropy": 1.9173430383205414, + "epoch": 0.030162148673353695, + "grad_norm": 10.792257308959961, + "learning_rate": 4.71229293809939e-06, + "loss": 0.5897, + "mean_token_accuracy": 0.8223668470978737, + "num_tokens": 11689083.0, + "step": 9730 + }, + { + "entropy": 1.8793220937252044, + "epoch": 0.03019314779840339, + "grad_norm": 13.49764347076416, + "learning_rate": 4.71713649133004e-06, + "loss": 0.6808, + "mean_token_accuracy": 0.7959660813212395, + "num_tokens": 11701156.0, + "step": 9740 + }, + { + "entropy": 1.7766539767384528, + "epoch": 0.030224146923453085, + "grad_norm": 12.855195045471191, + "learning_rate": 4.72198004456069e-06, + "loss": 0.506, + "mean_token_accuracy": 0.8286942020058632, + "num_tokens": 11715335.0, + "step": 9750 + }, + { + "entropy": 1.914339354634285, + "epoch": 0.030255146048502782, + "grad_norm": 12.800863265991211, + "learning_rate": 4.72682359779134e-06, + "loss": 0.6046, + "mean_token_accuracy": 0.8148805812001229, + "num_tokens": 11727922.0, + "step": 9760 + }, + { + "entropy": 1.910454373061657, + "epoch": 0.030286145173552475, + "grad_norm": 13.581644058227539, + "learning_rate": 4.7316671510219906e-06, + "loss": 0.5975, + "mean_token_accuracy": 0.8178346559405327, + "num_tokens": 11739422.0, + "step": 9770 + }, + { + "entropy": 1.8079843878746034, + "epoch": 0.03031714429860217, + "grad_norm": 14.549524307250977, + "learning_rate": 4.7365107042526405e-06, + "loss": 0.543, + "mean_token_accuracy": 0.8197317391633987, + "num_tokens": 11752317.0, + "step": 9780 + }, + { + "entropy": 1.9487209469079971, + "epoch": 0.030348143423651868, + "grad_norm": 12.98496150970459, + "learning_rate": 4.74135425748329e-06, + "loss": 0.6664, + "mean_token_accuracy": 0.8045255482196808, + "num_tokens": 11763631.0, + "step": 9790 + }, + { + "entropy": 1.9213872998952866, + "epoch": 0.03037914254870156, + "grad_norm": 14.27520751953125, + "learning_rate": 4.74619781071394e-06, + "loss": 0.6433, + "mean_token_accuracy": 0.8067422285676003, + "num_tokens": 11775491.0, + "step": 9800 + }, + { + "entropy": 1.8178927034139634, + "epoch": 0.030410141673751258, + "grad_norm": 14.970010757446289, + "learning_rate": 4.75104136394459e-06, + "loss": 0.5629, + "mean_token_accuracy": 0.8228003144264221, + "num_tokens": 11788717.0, + "step": 9810 + }, + { + "entropy": 1.9698729425668717, + "epoch": 0.030441140798800954, + "grad_norm": 12.28023624420166, + "learning_rate": 4.75588491717524e-06, + "loss": 0.6749, + "mean_token_accuracy": 0.8033818736672401, + "num_tokens": 11799616.0, + "step": 9820 + }, + { + "entropy": 1.8569468915462495, + "epoch": 0.03047213992385065, + "grad_norm": 12.524866104125977, + "learning_rate": 4.760728470405891e-06, + "loss": 0.5486, + "mean_token_accuracy": 0.8196278423070907, + "num_tokens": 11812567.0, + "step": 9830 + }, + { + "entropy": 1.9362822517752647, + "epoch": 0.030503139048900344, + "grad_norm": 12.16551685333252, + "learning_rate": 4.76557202363654e-06, + "loss": 0.6641, + "mean_token_accuracy": 0.802090086042881, + "num_tokens": 11824072.0, + "step": 9840 + }, + { + "entropy": 1.9034365549683572, + "epoch": 0.03053413817395004, + "grad_norm": 11.310152053833008, + "learning_rate": 4.77041557686719e-06, + "loss": 0.6016, + "mean_token_accuracy": 0.8146296158432961, + "num_tokens": 11835828.0, + "step": 9850 + }, + { + "entropy": 1.949262234568596, + "epoch": 0.030565137298999737, + "grad_norm": 13.651056289672852, + "learning_rate": 4.77525913009784e-06, + "loss": 0.6148, + "mean_token_accuracy": 0.8138030260801316, + "num_tokens": 11847123.0, + "step": 9860 + }, + { + "entropy": 1.8827594295144081, + "epoch": 0.03059613642404943, + "grad_norm": 11.080418586730957, + "learning_rate": 4.78010268332849e-06, + "loss": 0.597, + "mean_token_accuracy": 0.8088963508605957, + "num_tokens": 11859028.0, + "step": 9870 + }, + { + "entropy": 1.8874688804149629, + "epoch": 0.030627135549099127, + "grad_norm": 12.232693672180176, + "learning_rate": 4.78494623655914e-06, + "loss": 0.6113, + "mean_token_accuracy": 0.814998921751976, + "num_tokens": 11869876.0, + "step": 9880 + }, + { + "entropy": 1.8576917335391046, + "epoch": 0.030658134674148824, + "grad_norm": 11.815441131591797, + "learning_rate": 4.78978978978979e-06, + "loss": 0.6147, + "mean_token_accuracy": 0.8156881853938103, + "num_tokens": 11881144.0, + "step": 9890 + }, + { + "entropy": 1.8440957143902779, + "epoch": 0.030689133799198517, + "grad_norm": 12.98918628692627, + "learning_rate": 4.79463334302044e-06, + "loss": 0.6312, + "mean_token_accuracy": 0.8034987449645996, + "num_tokens": 11893147.0, + "step": 9900 + }, + { + "entropy": 1.88355852663517, + "epoch": 0.030720132924248213, + "grad_norm": 6.641689300537109, + "learning_rate": 4.79947689625109e-06, + "loss": 0.6617, + "mean_token_accuracy": 0.7953455910086632, + "num_tokens": 11905450.0, + "step": 9910 + }, + { + "entropy": 1.9107140555977822, + "epoch": 0.03075113204929791, + "grad_norm": 5.667116641998291, + "learning_rate": 4.8043204494817405e-06, + "loss": 0.6608, + "mean_token_accuracy": 0.8010135576128959, + "num_tokens": 11916859.0, + "step": 9920 + }, + { + "entropy": 1.9044810444116593, + "epoch": 0.030782131174347603, + "grad_norm": 13.490184783935547, + "learning_rate": 4.80916400271239e-06, + "loss": 0.6331, + "mean_token_accuracy": 0.8088266268372536, + "num_tokens": 11927564.0, + "step": 9930 + }, + { + "entropy": 1.899574799835682, + "epoch": 0.0308131302993973, + "grad_norm": 13.318133354187012, + "learning_rate": 4.81400755594304e-06, + "loss": 0.6828, + "mean_token_accuracy": 0.8059917777776718, + "num_tokens": 11938823.0, + "step": 9940 + }, + { + "entropy": 1.8431586802005768, + "epoch": 0.030844129424446996, + "grad_norm": 11.968719482421875, + "learning_rate": 4.81885110917369e-06, + "loss": 0.5588, + "mean_token_accuracy": 0.8303678393363952, + "num_tokens": 11950208.0, + "step": 9950 + }, + { + "entropy": 1.8151010930538178, + "epoch": 0.03087512854949669, + "grad_norm": 13.34570026397705, + "learning_rate": 4.82369466240434e-06, + "loss": 0.5367, + "mean_token_accuracy": 0.8233583375811577, + "num_tokens": 11963211.0, + "step": 9960 + }, + { + "entropy": 1.9202764973044395, + "epoch": 0.030906127674546386, + "grad_norm": 12.33815860748291, + "learning_rate": 4.82853821563499e-06, + "loss": 0.6453, + "mean_token_accuracy": 0.7984903916716576, + "num_tokens": 11974413.0, + "step": 9970 + }, + { + "entropy": 1.7811052426695824, + "epoch": 0.030937126799596083, + "grad_norm": 13.612791061401367, + "learning_rate": 4.833381768865641e-06, + "loss": 0.5295, + "mean_token_accuracy": 0.8154791623353959, + "num_tokens": 11986759.0, + "step": 9980 + }, + { + "entropy": 1.8961547821760179, + "epoch": 0.030968125924645776, + "grad_norm": 13.582847595214844, + "learning_rate": 4.8382253220962906e-06, + "loss": 0.6424, + "mean_token_accuracy": 0.8010322540998459, + "num_tokens": 11997882.0, + "step": 9990 + }, + { + "entropy": 1.7699449375271796, + "epoch": 0.030999125049695472, + "grad_norm": 12.8268461227417, + "learning_rate": 4.8430688753269404e-06, + "loss": 0.5349, + "mean_token_accuracy": 0.8104767709970474, + "num_tokens": 12011562.0, + "step": 10000 + }, + { + "entropy": 1.7704377323389053, + "epoch": 0.03103012417474517, + "grad_norm": 7.003698348999023, + "learning_rate": 4.84791242855759e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.817914342880249, + "num_tokens": 12024642.0, + "step": 10010 + }, + { + "entropy": 1.8417405053973197, + "epoch": 0.031061123299794862, + "grad_norm": 12.85013198852539, + "learning_rate": 4.85275598178824e-06, + "loss": 0.6207, + "mean_token_accuracy": 0.8156400442123413, + "num_tokens": 12035688.0, + "step": 10020 + }, + { + "entropy": 1.871736840903759, + "epoch": 0.03109212242484456, + "grad_norm": 10.676023483276367, + "learning_rate": 4.85759953501889e-06, + "loss": 0.6359, + "mean_token_accuracy": 0.8091729044914245, + "num_tokens": 12046810.0, + "step": 10030 + }, + { + "entropy": 1.8386752873659133, + "epoch": 0.031123121549894255, + "grad_norm": 5.60599422454834, + "learning_rate": 4.862443088249541e-06, + "loss": 0.5317, + "mean_token_accuracy": 0.8189519822597504, + "num_tokens": 12059567.0, + "step": 10040 + }, + { + "entropy": 1.9133656844496727, + "epoch": 0.03115412067494395, + "grad_norm": 13.941995620727539, + "learning_rate": 4.86728664148019e-06, + "loss": 0.6437, + "mean_token_accuracy": 0.8070616811513901, + "num_tokens": 12070777.0, + "step": 10050 + }, + { + "entropy": 1.859761357307434, + "epoch": 0.031185119799993645, + "grad_norm": 11.398521423339844, + "learning_rate": 4.87213019471084e-06, + "loss": 0.557, + "mean_token_accuracy": 0.8148921579122543, + "num_tokens": 12082916.0, + "step": 10060 + }, + { + "entropy": 1.8685668006539344, + "epoch": 0.03121611892504334, + "grad_norm": 11.567840576171875, + "learning_rate": 4.8769737479414905e-06, + "loss": 0.6081, + "mean_token_accuracy": 0.812491662800312, + "num_tokens": 12094481.0, + "step": 10070 + }, + { + "entropy": 1.7921167001128198, + "epoch": 0.031247118050093035, + "grad_norm": 11.81413745880127, + "learning_rate": 4.88181730117214e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8256913438439369, + "num_tokens": 12107579.0, + "step": 10080 + }, + { + "entropy": 1.9026924163103103, + "epoch": 0.031278117175142735, + "grad_norm": 5.831927299499512, + "learning_rate": 4.88666085440279e-06, + "loss": 0.5999, + "mean_token_accuracy": 0.8175748229026795, + "num_tokens": 12119504.0, + "step": 10090 + }, + { + "entropy": 1.8218435019254684, + "epoch": 0.031309116300192424, + "grad_norm": 6.674549579620361, + "learning_rate": 4.89150440763344e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.8210835456848145, + "num_tokens": 12132997.0, + "step": 10100 + }, + { + "entropy": 1.7946079865098, + "epoch": 0.03134011542524212, + "grad_norm": 6.814689636230469, + "learning_rate": 4.89634796086409e-06, + "loss": 0.5653, + "mean_token_accuracy": 0.8189253076910973, + "num_tokens": 12145732.0, + "step": 10110 + }, + { + "entropy": 1.8335962176322937, + "epoch": 0.03137111455029182, + "grad_norm": 12.774432182312012, + "learning_rate": 4.90119151409474e-06, + "loss": 0.5843, + "mean_token_accuracy": 0.8219740748405456, + "num_tokens": 12157527.0, + "step": 10120 + }, + { + "entropy": 1.9156937181949616, + "epoch": 0.031402113675341514, + "grad_norm": 13.017987251281738, + "learning_rate": 4.906035067325391e-06, + "loss": 0.6375, + "mean_token_accuracy": 0.8114208281040192, + "num_tokens": 12168529.0, + "step": 10130 + }, + { + "entropy": 1.929344841837883, + "epoch": 0.03143311280039121, + "grad_norm": 12.184325218200684, + "learning_rate": 4.9108786205560405e-06, + "loss": 0.6351, + "mean_token_accuracy": 0.816659900546074, + "num_tokens": 12179791.0, + "step": 10140 + }, + { + "entropy": 1.8231835559010505, + "epoch": 0.03146411192544091, + "grad_norm": 13.120546340942383, + "learning_rate": 4.91572217378669e-06, + "loss": 0.5967, + "mean_token_accuracy": 0.8101472899317741, + "num_tokens": 12192369.0, + "step": 10150 + }, + { + "entropy": 1.7879482999444007, + "epoch": 0.0314951110504906, + "grad_norm": 14.349749565124512, + "learning_rate": 4.92056572701734e-06, + "loss": 0.5482, + "mean_token_accuracy": 0.8103778302669525, + "num_tokens": 12205341.0, + "step": 10160 + }, + { + "entropy": 1.9244187206029892, + "epoch": 0.031526110175540294, + "grad_norm": 11.694113731384277, + "learning_rate": 4.92540928024799e-06, + "loss": 0.643, + "mean_token_accuracy": 0.8135433197021484, + "num_tokens": 12217222.0, + "step": 10170 + }, + { + "entropy": 1.9573982939124108, + "epoch": 0.03155710930058999, + "grad_norm": 12.177897453308105, + "learning_rate": 4.93025283347864e-06, + "loss": 0.6126, + "mean_token_accuracy": 0.809203888475895, + "num_tokens": 12228264.0, + "step": 10180 + }, + { + "entropy": 1.9127047300338744, + "epoch": 0.03158810842563969, + "grad_norm": 13.4846830368042, + "learning_rate": 4.935096386709291e-06, + "loss": 0.6306, + "mean_token_accuracy": 0.8137389361858368, + "num_tokens": 12240050.0, + "step": 10190 + }, + { + "entropy": 1.910162153840065, + "epoch": 0.03161910755068938, + "grad_norm": 14.54655647277832, + "learning_rate": 4.939939939939941e-06, + "loss": 0.6165, + "mean_token_accuracy": 0.8089611247181893, + "num_tokens": 12251372.0, + "step": 10200 + }, + { + "entropy": 1.9011714354157447, + "epoch": 0.03165010667573908, + "grad_norm": 12.256708145141602, + "learning_rate": 4.9447834931705905e-06, + "loss": 0.5892, + "mean_token_accuracy": 0.8112087488174439, + "num_tokens": 12263039.0, + "step": 10210 + }, + { + "entropy": 1.8994778007268907, + "epoch": 0.03168110580078877, + "grad_norm": 14.00627613067627, + "learning_rate": 4.9496270464012404e-06, + "loss": 0.732, + "mean_token_accuracy": 0.7959692940115929, + "num_tokens": 12275633.0, + "step": 10220 + }, + { + "entropy": 1.7719287261366845, + "epoch": 0.031712104925838466, + "grad_norm": 10.754083633422852, + "learning_rate": 4.95447059963189e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.8201853647828102, + "num_tokens": 12288123.0, + "step": 10230 + }, + { + "entropy": 1.9072040289640426, + "epoch": 0.03174310405088816, + "grad_norm": 10.166359901428223, + "learning_rate": 4.95931415286254e-06, + "loss": 0.6276, + "mean_token_accuracy": 0.8175885871052742, + "num_tokens": 12299187.0, + "step": 10240 + }, + { + "entropy": 1.897184392809868, + "epoch": 0.03177410317593786, + "grad_norm": 10.87751293182373, + "learning_rate": 4.964157706093191e-06, + "loss": 0.6334, + "mean_token_accuracy": 0.8149495676159859, + "num_tokens": 12309767.0, + "step": 10250 + }, + { + "entropy": 1.8759140372276306, + "epoch": 0.031805102300987556, + "grad_norm": 10.203163146972656, + "learning_rate": 4.96900125932384e-06, + "loss": 0.634, + "mean_token_accuracy": 0.8065581321716309, + "num_tokens": 12320842.0, + "step": 10260 + }, + { + "entropy": 1.8192242681980133, + "epoch": 0.03183610142603725, + "grad_norm": 13.203461647033691, + "learning_rate": 4.97384481255449e-06, + "loss": 0.5818, + "mean_token_accuracy": 0.8157040163874626, + "num_tokens": 12332641.0, + "step": 10270 + }, + { + "entropy": 1.9448095709085464, + "epoch": 0.03186710055108694, + "grad_norm": 16.432714462280273, + "learning_rate": 4.978688365785141e-06, + "loss": 0.7231, + "mean_token_accuracy": 0.7967922583222389, + "num_tokens": 12343283.0, + "step": 10280 + }, + { + "entropy": 1.8554027765989303, + "epoch": 0.03189809967613664, + "grad_norm": 10.973164558410645, + "learning_rate": 4.9835319190157905e-06, + "loss": 0.6028, + "mean_token_accuracy": 0.8122664794325829, + "num_tokens": 12355277.0, + "step": 10290 + }, + { + "entropy": 1.8099492847919465, + "epoch": 0.031929098801186336, + "grad_norm": 13.487873077392578, + "learning_rate": 4.98837547224644e-06, + "loss": 0.5666, + "mean_token_accuracy": 0.8163850530982018, + "num_tokens": 12367374.0, + "step": 10300 + }, + { + "entropy": 1.907564203441143, + "epoch": 0.03196009792623603, + "grad_norm": 14.73357105255127, + "learning_rate": 4.99321902547709e-06, + "loss": 0.6864, + "mean_token_accuracy": 0.804863877594471, + "num_tokens": 12379185.0, + "step": 10310 + }, + { + "entropy": 1.9018947511911393, + "epoch": 0.03199109705128573, + "grad_norm": 12.761754035949707, + "learning_rate": 4.99806257870774e-06, + "loss": 0.6285, + "mean_token_accuracy": 0.7974288746714592, + "num_tokens": 12390639.0, + "step": 10320 + }, + { + "entropy": 1.8945544630289077, + "epoch": 0.032022096176335425, + "grad_norm": 12.64550495147705, + "learning_rate": 5.002906131938391e-06, + "loss": 0.696, + "mean_token_accuracy": 0.7960980877280235, + "num_tokens": 12401445.0, + "step": 10330 + }, + { + "entropy": 1.8769317850470544, + "epoch": 0.032053095301385115, + "grad_norm": 10.872472763061523, + "learning_rate": 5.00774968516904e-06, + "loss": 0.6736, + "mean_token_accuracy": 0.8056196987628936, + "num_tokens": 12412842.0, + "step": 10340 + }, + { + "entropy": 1.874910195171833, + "epoch": 0.03208409442643481, + "grad_norm": 11.449685096740723, + "learning_rate": 5.01259323839969e-06, + "loss": 0.6081, + "mean_token_accuracy": 0.8147278919816017, + "num_tokens": 12424094.0, + "step": 10350 + }, + { + "entropy": 1.8590925335884094, + "epoch": 0.03211509355148451, + "grad_norm": 13.95566177368164, + "learning_rate": 5.0174367916303405e-06, + "loss": 0.5992, + "mean_token_accuracy": 0.8060554280877114, + "num_tokens": 12436102.0, + "step": 10360 + }, + { + "entropy": 1.9232334434986114, + "epoch": 0.032146092676534205, + "grad_norm": 13.31452751159668, + "learning_rate": 5.02228034486099e-06, + "loss": 0.6143, + "mean_token_accuracy": 0.819976469874382, + "num_tokens": 12447424.0, + "step": 10370 + }, + { + "entropy": 1.860893575847149, + "epoch": 0.0321770918015839, + "grad_norm": 11.070972442626953, + "learning_rate": 5.02712389809164e-06, + "loss": 0.6446, + "mean_token_accuracy": 0.8143087536096573, + "num_tokens": 12459620.0, + "step": 10380 + }, + { + "entropy": 1.8405973672866822, + "epoch": 0.0322080909266336, + "grad_norm": 11.35168743133545, + "learning_rate": 5.03196745132229e-06, + "loss": 0.5767, + "mean_token_accuracy": 0.8185856312513351, + "num_tokens": 12471581.0, + "step": 10390 + }, + { + "entropy": 1.8092712104320525, + "epoch": 0.032239090051683295, + "grad_norm": 12.700512886047363, + "learning_rate": 5.036811004552941e-06, + "loss": 0.5496, + "mean_token_accuracy": 0.8279015332460403, + "num_tokens": 12483574.0, + "step": 10400 + }, + { + "entropy": 1.772294245660305, + "epoch": 0.032270089176732984, + "grad_norm": 11.940170288085938, + "learning_rate": 5.041654557783591e-06, + "loss": 0.5448, + "mean_token_accuracy": 0.8129552945494651, + "num_tokens": 12496082.0, + "step": 10410 + }, + { + "entropy": 1.7716655775904655, + "epoch": 0.03230108830178268, + "grad_norm": 5.408534049987793, + "learning_rate": 5.046498111014241e-06, + "loss": 0.5785, + "mean_token_accuracy": 0.8172632664442062, + "num_tokens": 12508722.0, + "step": 10420 + }, + { + "entropy": 1.7848970398306847, + "epoch": 0.03233208742683238, + "grad_norm": 10.916991233825684, + "learning_rate": 5.0513416642448905e-06, + "loss": 0.5381, + "mean_token_accuracy": 0.822851151227951, + "num_tokens": 12521638.0, + "step": 10430 + }, + { + "entropy": 1.88555389046669, + "epoch": 0.032363086551882074, + "grad_norm": 16.887577056884766, + "learning_rate": 5.0561852174755396e-06, + "loss": 0.6788, + "mean_token_accuracy": 0.7913921490311623, + "num_tokens": 12533306.0, + "step": 10440 + }, + { + "entropy": 1.8448584645986557, + "epoch": 0.03239408567693177, + "grad_norm": 5.770571708679199, + "learning_rate": 5.06102877070619e-06, + "loss": 0.545, + "mean_token_accuracy": 0.8273654997348785, + "num_tokens": 12544900.0, + "step": 10450 + }, + { + "entropy": 1.8120036974549294, + "epoch": 0.03242508480198147, + "grad_norm": 12.20251178741455, + "learning_rate": 5.06587232393684e-06, + "loss": 0.5249, + "mean_token_accuracy": 0.8255019530653953, + "num_tokens": 12557961.0, + "step": 10460 + }, + { + "entropy": 1.8894508898258209, + "epoch": 0.03245608392703116, + "grad_norm": 12.401219367980957, + "learning_rate": 5.07071587716749e-06, + "loss": 0.6334, + "mean_token_accuracy": 0.8066725865006447, + "num_tokens": 12569409.0, + "step": 10470 + }, + { + "entropy": 1.8568513855338096, + "epoch": 0.03248708305208085, + "grad_norm": 15.682320594787598, + "learning_rate": 5.07555943039814e-06, + "loss": 0.5906, + "mean_token_accuracy": 0.8160646855831146, + "num_tokens": 12581146.0, + "step": 10480 + }, + { + "entropy": 1.806802648305893, + "epoch": 0.03251808217713055, + "grad_norm": 13.31312370300293, + "learning_rate": 5.080402983628791e-06, + "loss": 0.5478, + "mean_token_accuracy": 0.8170752301812172, + "num_tokens": 12593979.0, + "step": 10490 + }, + { + "entropy": 1.8914892196655273, + "epoch": 0.03254908130218025, + "grad_norm": 13.41655445098877, + "learning_rate": 5.0852465368594406e-06, + "loss": 0.6523, + "mean_token_accuracy": 0.8074697732925415, + "num_tokens": 12604872.0, + "step": 10500 + }, + { + "entropy": 1.8397974416613578, + "epoch": 0.03258008042722994, + "grad_norm": 13.245908737182617, + "learning_rate": 5.0900900900900905e-06, + "loss": 0.6411, + "mean_token_accuracy": 0.8051371321082115, + "num_tokens": 12616464.0, + "step": 10510 + }, + { + "entropy": 1.7762023329734802, + "epoch": 0.03261107955227964, + "grad_norm": 12.972115516662598, + "learning_rate": 5.094933643320741e-06, + "loss": 0.5699, + "mean_token_accuracy": 0.8134799718856811, + "num_tokens": 12628431.0, + "step": 10520 + }, + { + "entropy": 1.7372042164206505, + "epoch": 0.03264207867732933, + "grad_norm": 14.975220680236816, + "learning_rate": 5.099777196551391e-06, + "loss": 0.5565, + "mean_token_accuracy": 0.8230763703584671, + "num_tokens": 12641420.0, + "step": 10530 + }, + { + "entropy": 1.8806333974003793, + "epoch": 0.032673077802379026, + "grad_norm": 10.917582511901855, + "learning_rate": 5.104620749782041e-06, + "loss": 0.6079, + "mean_token_accuracy": 0.8124522119760513, + "num_tokens": 12652755.0, + "step": 10540 + }, + { + "entropy": 1.8468591958284377, + "epoch": 0.03270407692742872, + "grad_norm": 15.210384368896484, + "learning_rate": 5.10946430301269e-06, + "loss": 0.6178, + "mean_token_accuracy": 0.8065985247492791, + "num_tokens": 12664093.0, + "step": 10550 + }, + { + "entropy": 1.7937588766217232, + "epoch": 0.03273507605247842, + "grad_norm": 11.96823787689209, + "learning_rate": 5.11430785624334e-06, + "loss": 0.5948, + "mean_token_accuracy": 0.8129746183753014, + "num_tokens": 12676810.0, + "step": 10560 + }, + { + "entropy": 1.8590254932641983, + "epoch": 0.032766075177528116, + "grad_norm": 13.634770393371582, + "learning_rate": 5.119151409473991e-06, + "loss": 0.622, + "mean_token_accuracy": 0.812176737189293, + "num_tokens": 12688390.0, + "step": 10570 + }, + { + "entropy": 1.8288069173693657, + "epoch": 0.03279707430257781, + "grad_norm": 19.997926712036133, + "learning_rate": 5.1239949627046405e-06, + "loss": 0.6365, + "mean_token_accuracy": 0.8077499896287919, + "num_tokens": 12700682.0, + "step": 10580 + }, + { + "entropy": 1.9130954205989839, + "epoch": 0.0328280734276275, + "grad_norm": 13.037125587463379, + "learning_rate": 5.12883851593529e-06, + "loss": 0.7036, + "mean_token_accuracy": 0.8007219597697258, + "num_tokens": 12711160.0, + "step": 10590 + }, + { + "entropy": 1.908418272435665, + "epoch": 0.0328590725526772, + "grad_norm": 9.9063720703125, + "learning_rate": 5.13368206916594e-06, + "loss": 0.6536, + "mean_token_accuracy": 0.8085066750645638, + "num_tokens": 12722404.0, + "step": 10600 + }, + { + "entropy": 1.88576088398695, + "epoch": 0.032890071677726895, + "grad_norm": 15.735233306884766, + "learning_rate": 5.138525622396591e-06, + "loss": 0.6415, + "mean_token_accuracy": 0.8067146822810173, + "num_tokens": 12735219.0, + "step": 10610 + }, + { + "entropy": 1.8483037024736404, + "epoch": 0.03292107080277659, + "grad_norm": 16.17683982849121, + "learning_rate": 5.143369175627241e-06, + "loss": 0.5999, + "mean_token_accuracy": 0.809206846356392, + "num_tokens": 12747804.0, + "step": 10620 + }, + { + "entropy": 1.7985512629151343, + "epoch": 0.03295206992782629, + "grad_norm": 5.797553062438965, + "learning_rate": 5.148212728857891e-06, + "loss": 0.5436, + "mean_token_accuracy": 0.8201219871640205, + "num_tokens": 12760529.0, + "step": 10630 + }, + { + "entropy": 1.829072842001915, + "epoch": 0.032983069052875985, + "grad_norm": 11.334760665893555, + "learning_rate": 5.153056282088541e-06, + "loss": 0.5998, + "mean_token_accuracy": 0.8162604227662087, + "num_tokens": 12772598.0, + "step": 10640 + }, + { + "entropy": 1.8073836967349053, + "epoch": 0.033014068177925675, + "grad_norm": 11.55426025390625, + "learning_rate": 5.15789983531919e-06, + "loss": 0.5323, + "mean_token_accuracy": 0.8164139956235885, + "num_tokens": 12784907.0, + "step": 10650 + }, + { + "entropy": 1.8018400803208352, + "epoch": 0.03304506730297537, + "grad_norm": 11.91412353515625, + "learning_rate": 5.16274338854984e-06, + "loss": 0.5519, + "mean_token_accuracy": 0.8153271213173866, + "num_tokens": 12797883.0, + "step": 10660 + }, + { + "entropy": 1.7438832193613052, + "epoch": 0.03307606642802507, + "grad_norm": 13.903759956359863, + "learning_rate": 5.16758694178049e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8333475485444068, + "num_tokens": 12810429.0, + "step": 10670 + }, + { + "entropy": 1.8345250859856606, + "epoch": 0.033107065553074765, + "grad_norm": 12.650176048278809, + "learning_rate": 5.17243049501114e-06, + "loss": 0.5793, + "mean_token_accuracy": 0.8180657878518105, + "num_tokens": 12822129.0, + "step": 10680 + }, + { + "entropy": 1.8904076486825943, + "epoch": 0.03313806467812446, + "grad_norm": 10.247991561889648, + "learning_rate": 5.17727404824179e-06, + "loss": 0.6336, + "mean_token_accuracy": 0.8089143499732018, + "num_tokens": 12833756.0, + "step": 10690 + }, + { + "entropy": 1.8453685097396373, + "epoch": 0.03316906380317416, + "grad_norm": 6.841571807861328, + "learning_rate": 5.182117601472441e-06, + "loss": 0.5723, + "mean_token_accuracy": 0.8071049571037292, + "num_tokens": 12846112.0, + "step": 10700 + }, + { + "entropy": 1.8263072147965431, + "epoch": 0.03320006292822385, + "grad_norm": 10.83479118347168, + "learning_rate": 5.186961154703091e-06, + "loss": 0.5561, + "mean_token_accuracy": 0.8183331623673439, + "num_tokens": 12858793.0, + "step": 10710 + }, + { + "entropy": 1.9237968116998672, + "epoch": 0.033231062053273544, + "grad_norm": 14.300333023071289, + "learning_rate": 5.1918047079337406e-06, + "loss": 0.6285, + "mean_token_accuracy": 0.8162373065948486, + "num_tokens": 12869520.0, + "step": 10720 + }, + { + "entropy": 1.804058338701725, + "epoch": 0.03326206117832324, + "grad_norm": 12.697318077087402, + "learning_rate": 5.196648261164391e-06, + "loss": 0.5877, + "mean_token_accuracy": 0.8053817078471184, + "num_tokens": 12883158.0, + "step": 10730 + }, + { + "entropy": 1.858067548274994, + "epoch": 0.03329306030337294, + "grad_norm": 12.211063385009766, + "learning_rate": 5.201491814395041e-06, + "loss": 0.5606, + "mean_token_accuracy": 0.8255058348178863, + "num_tokens": 12894190.0, + "step": 10740 + }, + { + "entropy": 1.8079882711172104, + "epoch": 0.033324059428422634, + "grad_norm": 6.42866325378418, + "learning_rate": 5.206335367625691e-06, + "loss": 0.5676, + "mean_token_accuracy": 0.8178721934556961, + "num_tokens": 12907324.0, + "step": 10750 + }, + { + "entropy": 1.851043240725994, + "epoch": 0.03335505855347233, + "grad_norm": 13.750490188598633, + "learning_rate": 5.21117892085634e-06, + "loss": 0.6343, + "mean_token_accuracy": 0.8060193166136742, + "num_tokens": 12918683.0, + "step": 10760 + }, + { + "entropy": 1.8697371006011962, + "epoch": 0.03338605767852203, + "grad_norm": 6.069192409515381, + "learning_rate": 5.21602247408699e-06, + "loss": 0.6548, + "mean_token_accuracy": 0.8067969933152199, + "num_tokens": 12930503.0, + "step": 10770 + }, + { + "entropy": 1.833288662135601, + "epoch": 0.03341705680357172, + "grad_norm": 11.35355281829834, + "learning_rate": 5.220866027317641e-06, + "loss": 0.575, + "mean_token_accuracy": 0.8082702040672303, + "num_tokens": 12943156.0, + "step": 10780 + }, + { + "entropy": 1.895593549311161, + "epoch": 0.03344805592862141, + "grad_norm": 6.248627662658691, + "learning_rate": 5.225709580548291e-06, + "loss": 0.629, + "mean_token_accuracy": 0.8071183800697327, + "num_tokens": 12955988.0, + "step": 10790 + }, + { + "entropy": 1.8191024243831635, + "epoch": 0.03347905505367111, + "grad_norm": 4.646948337554932, + "learning_rate": 5.2305531337789405e-06, + "loss": 0.6002, + "mean_token_accuracy": 0.81056018024683, + "num_tokens": 12969403.0, + "step": 10800 + }, + { + "entropy": 1.8694421991705894, + "epoch": 0.033510054178720806, + "grad_norm": 6.7921977043151855, + "learning_rate": 5.23539668700959e-06, + "loss": 0.5993, + "mean_token_accuracy": 0.8087976023554801, + "num_tokens": 12981414.0, + "step": 10810 + }, + { + "entropy": 1.8624958485364913, + "epoch": 0.0335410533037705, + "grad_norm": 14.213687896728516, + "learning_rate": 5.240240240240241e-06, + "loss": 0.6583, + "mean_token_accuracy": 0.8021715626120567, + "num_tokens": 12992353.0, + "step": 10820 + }, + { + "entropy": 1.7845357745885848, + "epoch": 0.0335720524288202, + "grad_norm": 5.307274341583252, + "learning_rate": 5.245083793470891e-06, + "loss": 0.5681, + "mean_token_accuracy": 0.8313144221901894, + "num_tokens": 13004955.0, + "step": 10830 + }, + { + "entropy": 1.8418287009000778, + "epoch": 0.03360305155386989, + "grad_norm": 8.079380989074707, + "learning_rate": 5.249927346701541e-06, + "loss": 0.5771, + "mean_token_accuracy": 0.8063233241438865, + "num_tokens": 13017444.0, + "step": 10840 + }, + { + "entropy": 1.8194942593574523, + "epoch": 0.033634050678919586, + "grad_norm": 10.57718563079834, + "learning_rate": 5.254770899932191e-06, + "loss": 0.5911, + "mean_token_accuracy": 0.8096415162086487, + "num_tokens": 13029772.0, + "step": 10850 + }, + { + "entropy": 1.910832443833351, + "epoch": 0.03366504980396928, + "grad_norm": 11.854331970214844, + "learning_rate": 5.25961445316284e-06, + "loss": 0.6181, + "mean_token_accuracy": 0.8215459808707237, + "num_tokens": 13041096.0, + "step": 10860 + }, + { + "entropy": 1.8814880549907684, + "epoch": 0.03369604892901898, + "grad_norm": 12.540098190307617, + "learning_rate": 5.2644580063934905e-06, + "loss": 0.6347, + "mean_token_accuracy": 0.805778457224369, + "num_tokens": 13052804.0, + "step": 10870 + }, + { + "entropy": 1.8503670692443848, + "epoch": 0.033727048054068676, + "grad_norm": 5.048069477081299, + "learning_rate": 5.26930155962414e-06, + "loss": 0.5956, + "mean_token_accuracy": 0.8164945662021637, + "num_tokens": 13064995.0, + "step": 10880 + }, + { + "entropy": 1.871761327981949, + "epoch": 0.03375804717911837, + "grad_norm": 17.457483291625977, + "learning_rate": 5.27414511285479e-06, + "loss": 0.5965, + "mean_token_accuracy": 0.8171331033110618, + "num_tokens": 13076407.0, + "step": 10890 + }, + { + "entropy": 1.8655700251460074, + "epoch": 0.03378904630416806, + "grad_norm": 5.408127307891846, + "learning_rate": 5.27898866608544e-06, + "loss": 0.6076, + "mean_token_accuracy": 0.8107570946216583, + "num_tokens": 13088254.0, + "step": 10900 + }, + { + "entropy": 1.9251561522483827, + "epoch": 0.03382004542921776, + "grad_norm": 11.156283378601074, + "learning_rate": 5.283832219316091e-06, + "loss": 0.708, + "mean_token_accuracy": 0.7975843846797943, + "num_tokens": 13099187.0, + "step": 10910 + }, + { + "entropy": 1.9327981039881705, + "epoch": 0.033851044554267455, + "grad_norm": 15.034012794494629, + "learning_rate": 5.288675772546741e-06, + "loss": 0.6567, + "mean_token_accuracy": 0.8078183338046074, + "num_tokens": 13110633.0, + "step": 10920 + }, + { + "entropy": 1.8207261368632317, + "epoch": 0.03388204367931715, + "grad_norm": 13.088176727294922, + "learning_rate": 5.293519325777391e-06, + "loss": 0.5434, + "mean_token_accuracy": 0.8240138441324234, + "num_tokens": 13122812.0, + "step": 10930 + }, + { + "entropy": 1.914224511384964, + "epoch": 0.03391304280436685, + "grad_norm": 12.61460018157959, + "learning_rate": 5.298362879008041e-06, + "loss": 0.6495, + "mean_token_accuracy": 0.8125144988298416, + "num_tokens": 13133966.0, + "step": 10940 + }, + { + "entropy": 1.8054762348532676, + "epoch": 0.033944041929416545, + "grad_norm": 12.354558944702148, + "learning_rate": 5.303206432238691e-06, + "loss": 0.5174, + "mean_token_accuracy": 0.8258527040481567, + "num_tokens": 13146315.0, + "step": 10950 + }, + { + "entropy": 1.7843435242772103, + "epoch": 0.033975041054466235, + "grad_norm": 11.364967346191406, + "learning_rate": 5.308049985469341e-06, + "loss": 0.5668, + "mean_token_accuracy": 0.8167797103524208, + "num_tokens": 13158286.0, + "step": 10960 + }, + { + "entropy": 1.8914837822318078, + "epoch": 0.03400604017951593, + "grad_norm": 12.683460235595703, + "learning_rate": 5.31289353869999e-06, + "loss": 0.6436, + "mean_token_accuracy": 0.8010558471083641, + "num_tokens": 13170443.0, + "step": 10970 + }, + { + "entropy": 1.8247480258345603, + "epoch": 0.03403703930456563, + "grad_norm": 5.69301176071167, + "learning_rate": 5.31773709193064e-06, + "loss": 0.5587, + "mean_token_accuracy": 0.8227360025048256, + "num_tokens": 13181845.0, + "step": 10980 + }, + { + "entropy": 1.8641392514109612, + "epoch": 0.034068038429615324, + "grad_norm": 12.605916023254395, + "learning_rate": 5.322580645161291e-06, + "loss": 0.6679, + "mean_token_accuracy": 0.80641338378191, + "num_tokens": 13193899.0, + "step": 10990 + }, + { + "entropy": 1.8444667972624302, + "epoch": 0.03409903755466502, + "grad_norm": 5.727068901062012, + "learning_rate": 5.327424198391941e-06, + "loss": 0.5699, + "mean_token_accuracy": 0.8202215030789375, + "num_tokens": 13206043.0, + "step": 11000 + }, + { + "entropy": 1.773200060427189, + "epoch": 0.03413003667971472, + "grad_norm": 13.433945655822754, + "learning_rate": 5.332267751622591e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.827538113296032, + "num_tokens": 13218691.0, + "step": 11010 + }, + { + "entropy": 1.9272786289453507, + "epoch": 0.03416103580476441, + "grad_norm": 11.8448486328125, + "learning_rate": 5.3371113048532405e-06, + "loss": 0.6449, + "mean_token_accuracy": 0.8075118377804756, + "num_tokens": 13229756.0, + "step": 11020 + }, + { + "entropy": 1.8923597291111947, + "epoch": 0.034192034929814104, + "grad_norm": 5.769096374511719, + "learning_rate": 5.341954858083891e-06, + "loss": 0.5911, + "mean_token_accuracy": 0.8067701622843743, + "num_tokens": 13241705.0, + "step": 11030 + }, + { + "entropy": 1.8403789684176446, + "epoch": 0.0342230340548638, + "grad_norm": 12.353031158447266, + "learning_rate": 5.346798411314541e-06, + "loss": 0.5409, + "mean_token_accuracy": 0.8218411967158318, + "num_tokens": 13254778.0, + "step": 11040 + }, + { + "entropy": 1.8932681947946548, + "epoch": 0.0342540331799135, + "grad_norm": 5.458434581756592, + "learning_rate": 5.351641964545191e-06, + "loss": 0.5972, + "mean_token_accuracy": 0.8183223947882652, + "num_tokens": 13266049.0, + "step": 11050 + }, + { + "entropy": 1.86180839240551, + "epoch": 0.034285032304963194, + "grad_norm": 8.72637939453125, + "learning_rate": 5.356485517775841e-06, + "loss": 0.5903, + "mean_token_accuracy": 0.8129496097564697, + "num_tokens": 13277888.0, + "step": 11060 + }, + { + "entropy": 1.9068016976118087, + "epoch": 0.03431603143001289, + "grad_norm": 12.052847862243652, + "learning_rate": 5.36132907100649e-06, + "loss": 0.6476, + "mean_token_accuracy": 0.8061621204018593, + "num_tokens": 13289432.0, + "step": 11070 + }, + { + "entropy": 1.8573195546865464, + "epoch": 0.03434703055506259, + "grad_norm": 10.996872901916504, + "learning_rate": 5.366172624237141e-06, + "loss": 0.5506, + "mean_token_accuracy": 0.818072022497654, + "num_tokens": 13301147.0, + "step": 11080 + }, + { + "entropy": 1.8704863846302033, + "epoch": 0.034378029680112276, + "grad_norm": 13.163795471191406, + "learning_rate": 5.3710161774677905e-06, + "loss": 0.5488, + "mean_token_accuracy": 0.816085159778595, + "num_tokens": 13312829.0, + "step": 11090 + }, + { + "entropy": 1.8130246475338936, + "epoch": 0.03440902880516197, + "grad_norm": 14.593989372253418, + "learning_rate": 5.37585973069844e-06, + "loss": 0.585, + "mean_token_accuracy": 0.8232684478163719, + "num_tokens": 13325336.0, + "step": 11100 + }, + { + "entropy": 1.835708625614643, + "epoch": 0.03444002793021167, + "grad_norm": 12.702786445617676, + "learning_rate": 5.380703283929091e-06, + "loss": 0.5843, + "mean_token_accuracy": 0.8147074475884437, + "num_tokens": 13337824.0, + "step": 11110 + }, + { + "entropy": 1.8307800009846686, + "epoch": 0.034471027055261366, + "grad_norm": 11.551653861999512, + "learning_rate": 5.385546837159741e-06, + "loss": 0.594, + "mean_token_accuracy": 0.811168585717678, + "num_tokens": 13349534.0, + "step": 11120 + }, + { + "entropy": 1.8517664805054665, + "epoch": 0.03450202618031106, + "grad_norm": 2.7256433963775635, + "learning_rate": 5.390390390390391e-06, + "loss": 0.602, + "mean_token_accuracy": 0.8006679996848106, + "num_tokens": 13363408.0, + "step": 11130 + }, + { + "entropy": 1.8399043202400207, + "epoch": 0.03453302530536076, + "grad_norm": 7.206852912902832, + "learning_rate": 5.395233943621041e-06, + "loss": 0.6157, + "mean_token_accuracy": 0.8117770627140999, + "num_tokens": 13376213.0, + "step": 11140 + }, + { + "entropy": 1.9139886111021043, + "epoch": 0.03456402443041045, + "grad_norm": 12.817607879638672, + "learning_rate": 5.4000774968516915e-06, + "loss": 0.6346, + "mean_token_accuracy": 0.8084447503089904, + "num_tokens": 13387413.0, + "step": 11150 + }, + { + "entropy": 1.8213655844330787, + "epoch": 0.034595023555460146, + "grad_norm": 11.663803100585938, + "learning_rate": 5.404921050082341e-06, + "loss": 0.6088, + "mean_token_accuracy": 0.8164772510528564, + "num_tokens": 13399017.0, + "step": 11160 + }, + { + "entropy": 1.8936782956123352, + "epoch": 0.03462602268050984, + "grad_norm": 10.823102951049805, + "learning_rate": 5.409764603312991e-06, + "loss": 0.6693, + "mean_token_accuracy": 0.810118442773819, + "num_tokens": 13410180.0, + "step": 11170 + }, + { + "entropy": 1.8289520889520645, + "epoch": 0.03465702180555954, + "grad_norm": 12.268133163452148, + "learning_rate": 5.41460815654364e-06, + "loss": 0.5937, + "mean_token_accuracy": 0.812929916381836, + "num_tokens": 13422880.0, + "step": 11180 + }, + { + "entropy": 1.7611708968877793, + "epoch": 0.034688020930609235, + "grad_norm": 5.681921482086182, + "learning_rate": 5.41945170977429e-06, + "loss": 0.5208, + "mean_token_accuracy": 0.8295134902000427, + "num_tokens": 13436525.0, + "step": 11190 + }, + { + "entropy": 1.894238282740116, + "epoch": 0.03471902005565893, + "grad_norm": 9.887272834777832, + "learning_rate": 5.424295263004941e-06, + "loss": 0.6165, + "mean_token_accuracy": 0.8203635275363922, + "num_tokens": 13447769.0, + "step": 11200 + }, + { + "entropy": 1.8931616693735123, + "epoch": 0.03475001918070862, + "grad_norm": 10.567079544067383, + "learning_rate": 5.429138816235591e-06, + "loss": 0.6151, + "mean_token_accuracy": 0.813125379383564, + "num_tokens": 13459142.0, + "step": 11210 + }, + { + "entropy": 1.8201383396983146, + "epoch": 0.03478101830575832, + "grad_norm": 6.484344482421875, + "learning_rate": 5.433982369466241e-06, + "loss": 0.5731, + "mean_token_accuracy": 0.8103004142642021, + "num_tokens": 13471313.0, + "step": 11220 + }, + { + "entropy": 1.8642778664827346, + "epoch": 0.034812017430808015, + "grad_norm": 13.053911209106445, + "learning_rate": 5.4388259226968906e-06, + "loss": 0.5937, + "mean_token_accuracy": 0.8092061296105385, + "num_tokens": 13483091.0, + "step": 11230 + }, + { + "entropy": 1.8669605866074561, + "epoch": 0.03484301655585771, + "grad_norm": 11.9703369140625, + "learning_rate": 5.443669475927541e-06, + "loss": 0.6023, + "mean_token_accuracy": 0.8229537546634674, + "num_tokens": 13494177.0, + "step": 11240 + }, + { + "entropy": 1.85545664280653, + "epoch": 0.03487401568090741, + "grad_norm": 6.333286762237549, + "learning_rate": 5.448513029158191e-06, + "loss": 0.6025, + "mean_token_accuracy": 0.810517068207264, + "num_tokens": 13507247.0, + "step": 11250 + }, + { + "entropy": 1.7715984418988229, + "epoch": 0.034905014805957105, + "grad_norm": 10.145940780639648, + "learning_rate": 5.453356582388841e-06, + "loss": 0.5342, + "mean_token_accuracy": 0.825646486878395, + "num_tokens": 13520781.0, + "step": 11260 + }, + { + "entropy": 1.9231941044330596, + "epoch": 0.034936013931006794, + "grad_norm": 12.940445899963379, + "learning_rate": 5.458200135619491e-06, + "loss": 0.6732, + "mean_token_accuracy": 0.799634762108326, + "num_tokens": 13531942.0, + "step": 11270 + }, + { + "entropy": 1.8580936834216117, + "epoch": 0.03496701305605649, + "grad_norm": 13.450532913208008, + "learning_rate": 5.46304368885014e-06, + "loss": 0.6076, + "mean_token_accuracy": 0.8047621414065361, + "num_tokens": 13543831.0, + "step": 11280 + }, + { + "entropy": 1.8055393621325493, + "epoch": 0.03499801218110619, + "grad_norm": 12.781180381774902, + "learning_rate": 5.467887242080791e-06, + "loss": 0.5527, + "mean_token_accuracy": 0.8207426607608795, + "num_tokens": 13555967.0, + "step": 11290 + }, + { + "entropy": 1.804980905354023, + "epoch": 0.035029011306155884, + "grad_norm": 12.926373481750488, + "learning_rate": 5.472730795311441e-06, + "loss": 0.5702, + "mean_token_accuracy": 0.8110515549778938, + "num_tokens": 13569161.0, + "step": 11300 + }, + { + "entropy": 1.9503383368253708, + "epoch": 0.03506001043120558, + "grad_norm": 12.764983177185059, + "learning_rate": 5.4775743485420905e-06, + "loss": 0.6721, + "mean_token_accuracy": 0.7947013214230537, + "num_tokens": 13580275.0, + "step": 11310 + }, + { + "entropy": 1.88107870221138, + "epoch": 0.03509100955625528, + "grad_norm": 14.808971405029297, + "learning_rate": 5.482417901772741e-06, + "loss": 0.6075, + "mean_token_accuracy": 0.8077093288302422, + "num_tokens": 13592745.0, + "step": 11320 + }, + { + "entropy": 1.8953585937619208, + "epoch": 0.03512200868130497, + "grad_norm": 12.039188385009766, + "learning_rate": 5.487261455003391e-06, + "loss": 0.6186, + "mean_token_accuracy": 0.8103330656886101, + "num_tokens": 13604350.0, + "step": 11330 + }, + { + "entropy": 1.8628807738423347, + "epoch": 0.035153007806354664, + "grad_norm": 11.431326866149902, + "learning_rate": 5.492105008234041e-06, + "loss": 0.5886, + "mean_token_accuracy": 0.8245170086622238, + "num_tokens": 13616428.0, + "step": 11340 + }, + { + "entropy": 1.9492103517055512, + "epoch": 0.03518400693140436, + "grad_norm": 13.414464950561523, + "learning_rate": 5.496948561464691e-06, + "loss": 0.6753, + "mean_token_accuracy": 0.7961807489395142, + "num_tokens": 13627865.0, + "step": 11350 + }, + { + "entropy": 1.9296086087822915, + "epoch": 0.03521500605645406, + "grad_norm": 12.078662872314453, + "learning_rate": 5.501792114695342e-06, + "loss": 0.7047, + "mean_token_accuracy": 0.7942573204636574, + "num_tokens": 13639469.0, + "step": 11360 + }, + { + "entropy": 1.9266355335712433, + "epoch": 0.03524600518150375, + "grad_norm": 10.940107345581055, + "learning_rate": 5.5066356679259915e-06, + "loss": 0.6479, + "mean_token_accuracy": 0.8066198214888572, + "num_tokens": 13650098.0, + "step": 11370 + }, + { + "entropy": 1.7104891866445542, + "epoch": 0.03527700430655345, + "grad_norm": 13.101654052734375, + "learning_rate": 5.511479221156641e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.8279589235782623, + "num_tokens": 13664250.0, + "step": 11380 + }, + { + "entropy": 1.8383988574147225, + "epoch": 0.03530800343160314, + "grad_norm": 12.127103805541992, + "learning_rate": 5.51632277438729e-06, + "loss": 0.6044, + "mean_token_accuracy": 0.8270195707678795, + "num_tokens": 13675591.0, + "step": 11390 + }, + { + "entropy": 1.8982412829995154, + "epoch": 0.035339002556652836, + "grad_norm": 13.012894630432129, + "learning_rate": 5.52116632761794e-06, + "loss": 0.6433, + "mean_token_accuracy": 0.8037685632705689, + "num_tokens": 13687364.0, + "step": 11400 + }, + { + "entropy": 1.85855975151062, + "epoch": 0.03537000168170253, + "grad_norm": 5.7059454917907715, + "learning_rate": 5.526009880848591e-06, + "loss": 0.5678, + "mean_token_accuracy": 0.8210862413048744, + "num_tokens": 13699442.0, + "step": 11410 + }, + { + "entropy": 1.8508766368031502, + "epoch": 0.03540100080675223, + "grad_norm": 12.565545082092285, + "learning_rate": 5.530853434079241e-06, + "loss": 0.6132, + "mean_token_accuracy": 0.8048885554075241, + "num_tokens": 13711862.0, + "step": 11420 + }, + { + "entropy": 1.8528811484575272, + "epoch": 0.035431999931801926, + "grad_norm": 10.851259231567383, + "learning_rate": 5.535696987309891e-06, + "loss": 0.6248, + "mean_token_accuracy": 0.8083219036459923, + "num_tokens": 13723049.0, + "step": 11430 + }, + { + "entropy": 1.7467531949281692, + "epoch": 0.03546299905685162, + "grad_norm": 12.76328182220459, + "learning_rate": 5.540540540540541e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8341274619102478, + "num_tokens": 13736433.0, + "step": 11440 + }, + { + "entropy": 1.9269695818424224, + "epoch": 0.03549399818190132, + "grad_norm": 13.179561614990234, + "learning_rate": 5.545384093771191e-06, + "loss": 0.6138, + "mean_token_accuracy": 0.8090446889400482, + "num_tokens": 13748433.0, + "step": 11450 + }, + { + "entropy": 1.871768619120121, + "epoch": 0.03552499730695101, + "grad_norm": 11.79694652557373, + "learning_rate": 5.550227647001841e-06, + "loss": 0.6131, + "mean_token_accuracy": 0.8084805279970169, + "num_tokens": 13760554.0, + "step": 11460 + }, + { + "entropy": 1.899412962794304, + "epoch": 0.035555996432000705, + "grad_norm": 10.14828872680664, + "learning_rate": 5.555071200232491e-06, + "loss": 0.6452, + "mean_token_accuracy": 0.8113683596253395, + "num_tokens": 13771849.0, + "step": 11470 + }, + { + "entropy": 1.8810555890202523, + "epoch": 0.0355869955570504, + "grad_norm": 13.629297256469727, + "learning_rate": 5.559914753463141e-06, + "loss": 0.6262, + "mean_token_accuracy": 0.8057652726769448, + "num_tokens": 13783349.0, + "step": 11480 + }, + { + "entropy": 1.8832147046923637, + "epoch": 0.0356179946821001, + "grad_norm": 11.237863540649414, + "learning_rate": 5.56475830669379e-06, + "loss": 0.6097, + "mean_token_accuracy": 0.80062695145607, + "num_tokens": 13794988.0, + "step": 11490 + }, + { + "entropy": 1.734711329638958, + "epoch": 0.035648993807149795, + "grad_norm": 15.71545696258545, + "learning_rate": 5.569601859924441e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8244102329015732, + "num_tokens": 13808196.0, + "step": 11500 + }, + { + "entropy": 1.8736824676394463, + "epoch": 0.03567999293219949, + "grad_norm": 10.090490341186523, + "learning_rate": 5.574445413155091e-06, + "loss": 0.5995, + "mean_token_accuracy": 0.8152802541851998, + "num_tokens": 13820833.0, + "step": 11510 + }, + { + "entropy": 1.7825218871235848, + "epoch": 0.03571099205724918, + "grad_norm": 10.388526916503906, + "learning_rate": 5.579288966385741e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8239463314414024, + "num_tokens": 13834118.0, + "step": 11520 + }, + { + "entropy": 1.9106709718704225, + "epoch": 0.03574199118229888, + "grad_norm": 5.574368000030518, + "learning_rate": 5.584132519616391e-06, + "loss": 0.6794, + "mean_token_accuracy": 0.8046201914548874, + "num_tokens": 13845774.0, + "step": 11530 + }, + { + "entropy": 1.9362187922000884, + "epoch": 0.035772990307348575, + "grad_norm": 11.167245864868164, + "learning_rate": 5.588976072847041e-06, + "loss": 0.6901, + "mean_token_accuracy": 0.7971135929226876, + "num_tokens": 13856205.0, + "step": 11540 + }, + { + "entropy": 1.792908415198326, + "epoch": 0.03580398943239827, + "grad_norm": 5.944072723388672, + "learning_rate": 5.593819626077691e-06, + "loss": 0.5636, + "mean_token_accuracy": 0.8229343697428704, + "num_tokens": 13868827.0, + "step": 11550 + }, + { + "entropy": 1.7896666795015335, + "epoch": 0.03583498855744797, + "grad_norm": 5.675596237182617, + "learning_rate": 5.598663179308341e-06, + "loss": 0.5246, + "mean_token_accuracy": 0.8251826211810112, + "num_tokens": 13881592.0, + "step": 11560 + }, + { + "entropy": 1.9190635159611702, + "epoch": 0.035865987682497665, + "grad_norm": 14.54418659210205, + "learning_rate": 5.603506732538992e-06, + "loss": 0.6535, + "mean_token_accuracy": 0.8147982686758042, + "num_tokens": 13892831.0, + "step": 11570 + }, + { + "entropy": 1.9398990795016289, + "epoch": 0.035896986807547354, + "grad_norm": 5.379489898681641, + "learning_rate": 5.608350285769642e-06, + "loss": 0.6346, + "mean_token_accuracy": 0.807078929245472, + "num_tokens": 13903824.0, + "step": 11580 + }, + { + "entropy": 1.8159923285245896, + "epoch": 0.03592798593259705, + "grad_norm": 12.649768829345703, + "learning_rate": 5.6131938390002915e-06, + "loss": 0.5691, + "mean_token_accuracy": 0.8225948542356492, + "num_tokens": 13916211.0, + "step": 11590 + }, + { + "entropy": 1.8722815930843353, + "epoch": 0.03595898505764675, + "grad_norm": 11.376723289489746, + "learning_rate": 5.6180373922309405e-06, + "loss": 0.5969, + "mean_token_accuracy": 0.8152380838990212, + "num_tokens": 13927975.0, + "step": 11600 + }, + { + "entropy": 1.8421933129429817, + "epoch": 0.035989984182696444, + "grad_norm": 11.949515342712402, + "learning_rate": 5.62288094546159e-06, + "loss": 0.6167, + "mean_token_accuracy": 0.8134393319487572, + "num_tokens": 13939474.0, + "step": 11610 + }, + { + "entropy": 1.888566829264164, + "epoch": 0.03602098330774614, + "grad_norm": 17.51249885559082, + "learning_rate": 5.627724498692241e-06, + "loss": 0.624, + "mean_token_accuracy": 0.817312179505825, + "num_tokens": 13951148.0, + "step": 11620 + }, + { + "entropy": 1.8805680245161056, + "epoch": 0.03605198243279584, + "grad_norm": 11.506142616271973, + "learning_rate": 5.632568051922891e-06, + "loss": 0.6413, + "mean_token_accuracy": 0.8139393076300621, + "num_tokens": 13962535.0, + "step": 11630 + }, + { + "entropy": 1.939769622683525, + "epoch": 0.03608298155784553, + "grad_norm": 11.291607856750488, + "learning_rate": 5.637411605153541e-06, + "loss": 0.6347, + "mean_token_accuracy": 0.81319679915905, + "num_tokens": 13973297.0, + "step": 11640 + }, + { + "entropy": 1.871855989843607, + "epoch": 0.03611398068289522, + "grad_norm": 13.56961727142334, + "learning_rate": 5.642255158384191e-06, + "loss": 0.5749, + "mean_token_accuracy": 0.8270270243287087, + "num_tokens": 13985337.0, + "step": 11650 + }, + { + "entropy": 1.807421001791954, + "epoch": 0.03614497980794492, + "grad_norm": 4.713283538818359, + "learning_rate": 5.6470987116148415e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8386715143918991, + "num_tokens": 13998425.0, + "step": 11660 + }, + { + "entropy": 1.784872618317604, + "epoch": 0.03617597893299462, + "grad_norm": 11.134720802307129, + "learning_rate": 5.651942264845491e-06, + "loss": 0.5639, + "mean_token_accuracy": 0.8182505384087563, + "num_tokens": 14011957.0, + "step": 11670 + }, + { + "entropy": 1.842606595158577, + "epoch": 0.03620697805804431, + "grad_norm": 5.860245227813721, + "learning_rate": 5.656785818076141e-06, + "loss": 0.5969, + "mean_token_accuracy": 0.8185199961066246, + "num_tokens": 14023989.0, + "step": 11680 + }, + { + "entropy": 1.7569019049406052, + "epoch": 0.03623797718309401, + "grad_norm": 11.391021728515625, + "learning_rate": 5.661629371306791e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8376433789730072, + "num_tokens": 14037138.0, + "step": 11690 + }, + { + "entropy": 1.8167656242847443, + "epoch": 0.0362689763081437, + "grad_norm": 14.077984809875488, + "learning_rate": 5.66647292453744e-06, + "loss": 0.5651, + "mean_token_accuracy": 0.8293104261159897, + "num_tokens": 14049119.0, + "step": 11700 + }, + { + "entropy": 1.8543547958135604, + "epoch": 0.036299975433193396, + "grad_norm": 9.835662841796875, + "learning_rate": 5.671316477768091e-06, + "loss": 0.598, + "mean_token_accuracy": 0.8196697190403939, + "num_tokens": 14061251.0, + "step": 11710 + }, + { + "entropy": 1.8527500122785567, + "epoch": 0.03633097455824309, + "grad_norm": 13.09237289428711, + "learning_rate": 5.676160030998741e-06, + "loss": 0.5879, + "mean_token_accuracy": 0.8196158647537232, + "num_tokens": 14073877.0, + "step": 11720 + }, + { + "entropy": 1.870137917995453, + "epoch": 0.03636197368329279, + "grad_norm": 12.378225326538086, + "learning_rate": 5.681003584229391e-06, + "loss": 0.6102, + "mean_token_accuracy": 0.81648840457201, + "num_tokens": 14086089.0, + "step": 11730 + }, + { + "entropy": 1.8904090464115142, + "epoch": 0.036392972808342486, + "grad_norm": 10.42041301727295, + "learning_rate": 5.6858471374600414e-06, + "loss": 0.5896, + "mean_token_accuracy": 0.8135670512914658, + "num_tokens": 14098409.0, + "step": 11740 + }, + { + "entropy": 1.9056991636753082, + "epoch": 0.03642397193339218, + "grad_norm": 5.637151718139648, + "learning_rate": 5.690690690690691e-06, + "loss": 0.6525, + "mean_token_accuracy": 0.8007242888212204, + "num_tokens": 14110078.0, + "step": 11750 + }, + { + "entropy": 1.7788226932287217, + "epoch": 0.03645497105844187, + "grad_norm": 6.211390018463135, + "learning_rate": 5.695534243921341e-06, + "loss": 0.5271, + "mean_token_accuracy": 0.8264580845832825, + "num_tokens": 14122941.0, + "step": 11760 + }, + { + "entropy": 1.8122435554862022, + "epoch": 0.03648597018349157, + "grad_norm": 10.39504623413086, + "learning_rate": 5.700377797151991e-06, + "loss": 0.5682, + "mean_token_accuracy": 0.8231259554624557, + "num_tokens": 14135164.0, + "step": 11770 + }, + { + "entropy": 1.9115518778562546, + "epoch": 0.036516969308541265, + "grad_norm": 11.225345611572266, + "learning_rate": 5.705221350382642e-06, + "loss": 0.6966, + "mean_token_accuracy": 0.8027231857180596, + "num_tokens": 14145993.0, + "step": 11780 + }, + { + "entropy": 1.833278726041317, + "epoch": 0.03654796843359096, + "grad_norm": 9.642135620117188, + "learning_rate": 5.710064903613292e-06, + "loss": 0.6039, + "mean_token_accuracy": 0.8039411261677742, + "num_tokens": 14159775.0, + "step": 11790 + }, + { + "entropy": 1.7943380519747734, + "epoch": 0.03657896755864066, + "grad_norm": 11.112443923950195, + "learning_rate": 5.714908456843942e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.8308914095163346, + "num_tokens": 14172584.0, + "step": 11800 + }, + { + "entropy": 1.7970113635063172, + "epoch": 0.036609966683690355, + "grad_norm": 12.6159029006958, + "learning_rate": 5.719752010074591e-06, + "loss": 0.5638, + "mean_token_accuracy": 0.8075689300894737, + "num_tokens": 14184681.0, + "step": 11810 + }, + { + "entropy": 1.7890537694096564, + "epoch": 0.03664096580874005, + "grad_norm": 4.384820461273193, + "learning_rate": 5.7245955633052405e-06, + "loss": 0.6038, + "mean_token_accuracy": 0.8110948413610458, + "num_tokens": 14198022.0, + "step": 11820 + }, + { + "entropy": 1.814190225303173, + "epoch": 0.03667196493378974, + "grad_norm": 4.793267726898193, + "learning_rate": 5.729439116535891e-06, + "loss": 0.5878, + "mean_token_accuracy": 0.8212701037526131, + "num_tokens": 14209842.0, + "step": 11830 + }, + { + "entropy": 1.7815122097730636, + "epoch": 0.03670296405883944, + "grad_norm": 10.278016090393066, + "learning_rate": 5.734282669766541e-06, + "loss": 0.5281, + "mean_token_accuracy": 0.8179376661777497, + "num_tokens": 14222224.0, + "step": 11840 + }, + { + "entropy": 1.8016631290316583, + "epoch": 0.036733963183889135, + "grad_norm": 11.094133377075195, + "learning_rate": 5.739126222997191e-06, + "loss": 0.5766, + "mean_token_accuracy": 0.8138196378946304, + "num_tokens": 14234855.0, + "step": 11850 + }, + { + "entropy": 1.8083413437008857, + "epoch": 0.03676496230893883, + "grad_norm": 6.810488700866699, + "learning_rate": 5.743969776227841e-06, + "loss": 0.6051, + "mean_token_accuracy": 0.8194642826914788, + "num_tokens": 14246248.0, + "step": 11860 + }, + { + "entropy": 1.754711863398552, + "epoch": 0.03679596143398853, + "grad_norm": 5.887208938598633, + "learning_rate": 5.748813329458492e-06, + "loss": 0.6317, + "mean_token_accuracy": 0.8068835064768791, + "num_tokens": 14258816.0, + "step": 11870 + }, + { + "entropy": 1.7588348999619483, + "epoch": 0.036826960559038224, + "grad_norm": 12.953771591186523, + "learning_rate": 5.7536568826891415e-06, + "loss": 0.5202, + "mean_token_accuracy": 0.8235802292823792, + "num_tokens": 14271525.0, + "step": 11880 + }, + { + "entropy": 1.8742596834897995, + "epoch": 0.036857959684087914, + "grad_norm": 11.466156959533691, + "learning_rate": 5.758500435919791e-06, + "loss": 0.6855, + "mean_token_accuracy": 0.8022027894854545, + "num_tokens": 14282330.0, + "step": 11890 + }, + { + "entropy": 1.7715501874685287, + "epoch": 0.03688895880913761, + "grad_norm": 10.178691864013672, + "learning_rate": 5.763343989150441e-06, + "loss": 0.5384, + "mean_token_accuracy": 0.8171212136745453, + "num_tokens": 14294446.0, + "step": 11900 + }, + { + "entropy": 1.8660178661346436, + "epoch": 0.03691995793418731, + "grad_norm": 11.348404884338379, + "learning_rate": 5.76818754238109e-06, + "loss": 0.6431, + "mean_token_accuracy": 0.8038701593875885, + "num_tokens": 14305947.0, + "step": 11910 + }, + { + "entropy": 1.7659137800335885, + "epoch": 0.036950957059237004, + "grad_norm": 11.47120475769043, + "learning_rate": 5.773031095611741e-06, + "loss": 0.5643, + "mean_token_accuracy": 0.8206716164946556, + "num_tokens": 14318249.0, + "step": 11920 + }, + { + "entropy": 1.747839505970478, + "epoch": 0.0369819561842867, + "grad_norm": 5.5586018562316895, + "learning_rate": 5.777874648842391e-06, + "loss": 0.559, + "mean_token_accuracy": 0.8257301524281502, + "num_tokens": 14330993.0, + "step": 11930 + }, + { + "entropy": 1.8217350393533707, + "epoch": 0.0370129553093364, + "grad_norm": 10.629807472229004, + "learning_rate": 5.782718202073041e-06, + "loss": 0.6345, + "mean_token_accuracy": 0.8065755069255829, + "num_tokens": 14343198.0, + "step": 11940 + }, + { + "entropy": 1.8037606567144393, + "epoch": 0.03704395443438609, + "grad_norm": 8.647960662841797, + "learning_rate": 5.7875617553036915e-06, + "loss": 0.598, + "mean_token_accuracy": 0.8171042606234551, + "num_tokens": 14355609.0, + "step": 11950 + }, + { + "entropy": 1.8690994665026666, + "epoch": 0.03707495355943578, + "grad_norm": 13.143199920654297, + "learning_rate": 5.792405308534341e-06, + "loss": 0.6191, + "mean_token_accuracy": 0.8137925118207932, + "num_tokens": 14367235.0, + "step": 11960 + }, + { + "entropy": 1.7714310929179191, + "epoch": 0.03710595268448548, + "grad_norm": 11.381326675415039, + "learning_rate": 5.797248861764991e-06, + "loss": 0.5207, + "mean_token_accuracy": 0.8231312274932862, + "num_tokens": 14379885.0, + "step": 11970 + }, + { + "entropy": 1.8847584262490273, + "epoch": 0.037136951809535176, + "grad_norm": 12.403789520263672, + "learning_rate": 5.802092414995641e-06, + "loss": 0.6177, + "mean_token_accuracy": 0.8122402995824813, + "num_tokens": 14390884.0, + "step": 11980 + }, + { + "entropy": 1.7906603574752809, + "epoch": 0.03716795093458487, + "grad_norm": 12.341434478759766, + "learning_rate": 5.806935968226292e-06, + "loss": 0.6091, + "mean_token_accuracy": 0.8056104972958564, + "num_tokens": 14402659.0, + "step": 11990 + }, + { + "entropy": 1.8461511224508285, + "epoch": 0.03719895005963457, + "grad_norm": 11.438586235046387, + "learning_rate": 5.811779521456942e-06, + "loss": 0.6092, + "mean_token_accuracy": 0.8133256167173386, + "num_tokens": 14413793.0, + "step": 12000 + }, + { + "entropy": 1.797146451473236, + "epoch": 0.03722994918468426, + "grad_norm": 12.915786743164062, + "learning_rate": 5.816623074687592e-06, + "loss": 0.5399, + "mean_token_accuracy": 0.826125793159008, + "num_tokens": 14426225.0, + "step": 12010 + }, + { + "entropy": 1.7861742541193961, + "epoch": 0.037260948309733956, + "grad_norm": 11.406455993652344, + "learning_rate": 5.821466627918241e-06, + "loss": 0.5892, + "mean_token_accuracy": 0.8146935313940048, + "num_tokens": 14438143.0, + "step": 12020 + }, + { + "entropy": 1.8457993239164352, + "epoch": 0.03729194743478365, + "grad_norm": 10.185896873474121, + "learning_rate": 5.826310181148891e-06, + "loss": 0.6127, + "mean_token_accuracy": 0.8021126002073288, + "num_tokens": 14451306.0, + "step": 12030 + }, + { + "entropy": 1.8906755059957505, + "epoch": 0.03732294655983335, + "grad_norm": 12.862022399902344, + "learning_rate": 5.831153734379541e-06, + "loss": 0.722, + "mean_token_accuracy": 0.7925963416695595, + "num_tokens": 14463297.0, + "step": 12040 + }, + { + "entropy": 1.760866042971611, + "epoch": 0.037353945684883046, + "grad_norm": 12.60151195526123, + "learning_rate": 5.835997287610191e-06, + "loss": 0.5435, + "mean_token_accuracy": 0.8173602819442749, + "num_tokens": 14476740.0, + "step": 12050 + }, + { + "entropy": 1.8725519806146622, + "epoch": 0.03738494480993274, + "grad_norm": 13.23414134979248, + "learning_rate": 5.840840840840841e-06, + "loss": 0.6585, + "mean_token_accuracy": 0.804969422519207, + "num_tokens": 14488274.0, + "step": 12060 + }, + { + "entropy": 1.821217942237854, + "epoch": 0.03741594393498243, + "grad_norm": 12.871102333068848, + "learning_rate": 5.845684394071491e-06, + "loss": 0.5877, + "mean_token_accuracy": 0.8180323630571366, + "num_tokens": 14499989.0, + "step": 12070 + }, + { + "entropy": 1.8127268552780151, + "epoch": 0.03744694306003213, + "grad_norm": 12.796869277954102, + "learning_rate": 5.850527947302142e-06, + "loss": 0.5265, + "mean_token_accuracy": 0.8186282083392143, + "num_tokens": 14512825.0, + "step": 12080 + }, + { + "entropy": 1.8107111915946006, + "epoch": 0.037477942185081825, + "grad_norm": 5.6175360679626465, + "learning_rate": 5.855371500532792e-06, + "loss": 0.5838, + "mean_token_accuracy": 0.809958079457283, + "num_tokens": 14525109.0, + "step": 12090 + }, + { + "entropy": 1.8637285217642785, + "epoch": 0.03750894131013152, + "grad_norm": 12.704855918884277, + "learning_rate": 5.8602150537634415e-06, + "loss": 0.6368, + "mean_token_accuracy": 0.8088982105255127, + "num_tokens": 14536754.0, + "step": 12100 + }, + { + "entropy": 1.7325578138232232, + "epoch": 0.03753994043518122, + "grad_norm": 6.377108573913574, + "learning_rate": 5.865058606994092e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8252245962619782, + "num_tokens": 14551185.0, + "step": 12110 + }, + { + "entropy": 1.802834041416645, + "epoch": 0.037570939560230915, + "grad_norm": 12.303117752075195, + "learning_rate": 5.86990216022474e-06, + "loss": 0.5427, + "mean_token_accuracy": 0.8231405153870582, + "num_tokens": 14564613.0, + "step": 12120 + }, + { + "entropy": 1.8981367230415345, + "epoch": 0.037601938685280605, + "grad_norm": 12.030719757080078, + "learning_rate": 5.874745713455391e-06, + "loss": 0.6299, + "mean_token_accuracy": 0.8138510972261429, + "num_tokens": 14575459.0, + "step": 12130 + }, + { + "entropy": 1.8835298061370849, + "epoch": 0.0376329378103303, + "grad_norm": 11.876147270202637, + "learning_rate": 5.879589266686041e-06, + "loss": 0.624, + "mean_token_accuracy": 0.8129880890250206, + "num_tokens": 14587190.0, + "step": 12140 + }, + { + "entropy": 1.8323535963892936, + "epoch": 0.03766393693538, + "grad_norm": 13.480051040649414, + "learning_rate": 5.884432819916691e-06, + "loss": 0.5953, + "mean_token_accuracy": 0.819337697327137, + "num_tokens": 14598092.0, + "step": 12150 + }, + { + "entropy": 1.874643650650978, + "epoch": 0.037694936060429694, + "grad_norm": 10.773601531982422, + "learning_rate": 5.889276373147342e-06, + "loss": 0.6391, + "mean_token_accuracy": 0.8008844360709191, + "num_tokens": 14609253.0, + "step": 12160 + }, + { + "entropy": 1.864101167023182, + "epoch": 0.03772593518547939, + "grad_norm": 5.442148208618164, + "learning_rate": 5.8941199263779915e-06, + "loss": 0.6138, + "mean_token_accuracy": 0.8117442667484284, + "num_tokens": 14620881.0, + "step": 12170 + }, + { + "entropy": 1.9181242629885673, + "epoch": 0.03775693431052909, + "grad_norm": 10.096672058105469, + "learning_rate": 5.898963479608641e-06, + "loss": 0.6121, + "mean_token_accuracy": 0.8077910885214805, + "num_tokens": 14632799.0, + "step": 12180 + }, + { + "entropy": 1.870473875105381, + "epoch": 0.037787933435578784, + "grad_norm": 12.328435897827148, + "learning_rate": 5.903807032839291e-06, + "loss": 0.5875, + "mean_token_accuracy": 0.8157344311475754, + "num_tokens": 14643832.0, + "step": 12190 + }, + { + "entropy": 1.8657541304826737, + "epoch": 0.037818932560628474, + "grad_norm": 14.332444190979004, + "learning_rate": 5.908650586069942e-06, + "loss": 0.62, + "mean_token_accuracy": 0.8118143856525422, + "num_tokens": 14655521.0, + "step": 12200 + }, + { + "entropy": 1.8177946463227272, + "epoch": 0.03784993168567817, + "grad_norm": 11.780436515808105, + "learning_rate": 5.913494139300592e-06, + "loss": 0.6046, + "mean_token_accuracy": 0.8122022807598114, + "num_tokens": 14668492.0, + "step": 12210 + }, + { + "entropy": 1.8311690405011176, + "epoch": 0.03788093081072787, + "grad_norm": 10.684754371643066, + "learning_rate": 5.918337692531242e-06, + "loss": 0.5527, + "mean_token_accuracy": 0.8185856059193611, + "num_tokens": 14681336.0, + "step": 12220 + }, + { + "entropy": 1.8790747031569481, + "epoch": 0.037911929935777564, + "grad_norm": 12.979804039001465, + "learning_rate": 5.923181245761891e-06, + "loss": 0.6315, + "mean_token_accuracy": 0.8123493686318397, + "num_tokens": 14693888.0, + "step": 12230 + }, + { + "entropy": 1.8931312650442123, + "epoch": 0.03794292906082726, + "grad_norm": 12.229876518249512, + "learning_rate": 5.928024798992541e-06, + "loss": 0.625, + "mean_token_accuracy": 0.8202015697956085, + "num_tokens": 14704937.0, + "step": 12240 + }, + { + "entropy": 1.8407990396022798, + "epoch": 0.03797392818587696, + "grad_norm": 11.653843879699707, + "learning_rate": 5.9328683522231914e-06, + "loss": 0.5902, + "mean_token_accuracy": 0.8176357612013817, + "num_tokens": 14717674.0, + "step": 12250 + }, + { + "entropy": 1.8561912134289742, + "epoch": 0.038004927310926646, + "grad_norm": 13.023904800415039, + "learning_rate": 5.937711905453841e-06, + "loss": 0.5849, + "mean_token_accuracy": 0.8190959095954895, + "num_tokens": 14729749.0, + "step": 12260 + }, + { + "entropy": 1.9160822182893753, + "epoch": 0.03803592643597634, + "grad_norm": 12.721014976501465, + "learning_rate": 5.942555458684491e-06, + "loss": 0.6519, + "mean_token_accuracy": 0.8113419517874718, + "num_tokens": 14741605.0, + "step": 12270 + }, + { + "entropy": 1.8194750413298606, + "epoch": 0.03806692556102604, + "grad_norm": 11.955613136291504, + "learning_rate": 5.947399011915141e-06, + "loss": 0.5518, + "mean_token_accuracy": 0.8214196056127548, + "num_tokens": 14754303.0, + "step": 12280 + }, + { + "entropy": 1.8162646040320396, + "epoch": 0.038097924686075736, + "grad_norm": 5.1032328605651855, + "learning_rate": 5.952242565145792e-06, + "loss": 0.5327, + "mean_token_accuracy": 0.818191859126091, + "num_tokens": 14766657.0, + "step": 12290 + }, + { + "entropy": 1.846658205986023, + "epoch": 0.03812892381112543, + "grad_norm": 5.808051109313965, + "learning_rate": 5.957086118376442e-06, + "loss": 0.63, + "mean_token_accuracy": 0.8142410039901733, + "num_tokens": 14778196.0, + "step": 12300 + }, + { + "entropy": 1.7994149655103684, + "epoch": 0.03815992293617513, + "grad_norm": 12.859893798828125, + "learning_rate": 5.961929671607092e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.8262584999203682, + "num_tokens": 14790293.0, + "step": 12310 + }, + { + "entropy": 1.951202955842018, + "epoch": 0.03819092206122482, + "grad_norm": 9.63222599029541, + "learning_rate": 5.966773224837742e-06, + "loss": 0.6364, + "mean_token_accuracy": 0.8155479997396469, + "num_tokens": 14801257.0, + "step": 12320 + }, + { + "entropy": 1.8327611729502677, + "epoch": 0.038221921186274516, + "grad_norm": 12.394364356994629, + "learning_rate": 5.9716167780683905e-06, + "loss": 0.5406, + "mean_token_accuracy": 0.8244035348296166, + "num_tokens": 14813680.0, + "step": 12330 + }, + { + "entropy": 1.8879570379853248, + "epoch": 0.03825292031132421, + "grad_norm": 9.891463279724121, + "learning_rate": 5.976460331299041e-06, + "loss": 0.5948, + "mean_token_accuracy": 0.815042594075203, + "num_tokens": 14825260.0, + "step": 12340 + }, + { + "entropy": 1.8341692447662354, + "epoch": 0.03828391943637391, + "grad_norm": 14.30816650390625, + "learning_rate": 5.981303884529691e-06, + "loss": 0.5992, + "mean_token_accuracy": 0.8117454037070274, + "num_tokens": 14837969.0, + "step": 12350 + }, + { + "entropy": 1.8351563602685927, + "epoch": 0.038314918561423605, + "grad_norm": 12.036502838134766, + "learning_rate": 5.986147437760341e-06, + "loss": 0.5528, + "mean_token_accuracy": 0.820985272526741, + "num_tokens": 14850504.0, + "step": 12360 + }, + { + "entropy": 1.8652123495936395, + "epoch": 0.0383459176864733, + "grad_norm": 5.860367298126221, + "learning_rate": 5.990990990990992e-06, + "loss": 0.5289, + "mean_token_accuracy": 0.8303888604044914, + "num_tokens": 14862430.0, + "step": 12370 + }, + { + "entropy": 1.8906607389450074, + "epoch": 0.03837691681152299, + "grad_norm": 11.164518356323242, + "learning_rate": 5.995834544221642e-06, + "loss": 0.5727, + "mean_token_accuracy": 0.8233511716127395, + "num_tokens": 14874608.0, + "step": 12380 + }, + { + "entropy": 1.9229727298021317, + "epoch": 0.03840791593657269, + "grad_norm": 10.484548568725586, + "learning_rate": 6.0006780974522915e-06, + "loss": 0.617, + "mean_token_accuracy": 0.8203805893659591, + "num_tokens": 14885555.0, + "step": 12390 + }, + { + "entropy": 1.858140294253826, + "epoch": 0.038438915061622385, + "grad_norm": 3.475583791732788, + "learning_rate": 6.005521650682941e-06, + "loss": 0.5348, + "mean_token_accuracy": 0.8278280302882195, + "num_tokens": 14899067.0, + "step": 12400 + }, + { + "entropy": 1.9508850559592248, + "epoch": 0.03846991418667208, + "grad_norm": 14.349974632263184, + "learning_rate": 6.010365203913592e-06, + "loss": 0.6307, + "mean_token_accuracy": 0.8059279710054398, + "num_tokens": 14910681.0, + "step": 12410 + }, + { + "entropy": 1.9063424944877625, + "epoch": 0.03850091331172178, + "grad_norm": 12.377765655517578, + "learning_rate": 6.015208757144242e-06, + "loss": 0.5669, + "mean_token_accuracy": 0.8251767575740814, + "num_tokens": 14922446.0, + "step": 12420 + }, + { + "entropy": 1.8672158405184747, + "epoch": 0.038531912436771475, + "grad_norm": 14.53426456451416, + "learning_rate": 6.020052310374892e-06, + "loss": 0.5257, + "mean_token_accuracy": 0.8267401665449142, + "num_tokens": 14934912.0, + "step": 12430 + }, + { + "entropy": 1.946044033765793, + "epoch": 0.038562911561821164, + "grad_norm": 12.349529266357422, + "learning_rate": 6.024895863605541e-06, + "loss": 0.7026, + "mean_token_accuracy": 0.7958286583423615, + "num_tokens": 14946270.0, + "step": 12440 + }, + { + "entropy": 1.9352963030338288, + "epoch": 0.03859391068687086, + "grad_norm": 15.071447372436523, + "learning_rate": 6.029739416836191e-06, + "loss": 0.6828, + "mean_token_accuracy": 0.8007722824811936, + "num_tokens": 14957285.0, + "step": 12450 + }, + { + "entropy": 1.9621846169233321, + "epoch": 0.03862490981192056, + "grad_norm": 11.935574531555176, + "learning_rate": 6.0345829700668415e-06, + "loss": 0.7034, + "mean_token_accuracy": 0.7971277639269829, + "num_tokens": 14968080.0, + "step": 12460 + }, + { + "entropy": 1.8536121487617492, + "epoch": 0.038655908936970254, + "grad_norm": 12.425955772399902, + "learning_rate": 6.0394265232974914e-06, + "loss": 0.571, + "mean_token_accuracy": 0.8072732269763947, + "num_tokens": 14980128.0, + "step": 12470 + }, + { + "entropy": 1.853446225821972, + "epoch": 0.03868690806201995, + "grad_norm": 5.7373366355896, + "learning_rate": 6.044270076528141e-06, + "loss": 0.6053, + "mean_token_accuracy": 0.8060764700174332, + "num_tokens": 14991962.0, + "step": 12480 + }, + { + "entropy": 1.792759819328785, + "epoch": 0.03871790718706965, + "grad_norm": 11.446489334106445, + "learning_rate": 6.049113629758791e-06, + "loss": 0.5256, + "mean_token_accuracy": 0.8205834746360778, + "num_tokens": 15004797.0, + "step": 12490 + }, + { + "entropy": 1.8071394935250282, + "epoch": 0.038748906312119344, + "grad_norm": 10.317464828491211, + "learning_rate": 6.053957182989442e-06, + "loss": 0.5875, + "mean_token_accuracy": 0.817318132519722, + "num_tokens": 15017391.0, + "step": 12500 + }, + { + "entropy": 1.8499668538570404, + "epoch": 0.038779905437169034, + "grad_norm": 11.95541763305664, + "learning_rate": 6.058800736220092e-06, + "loss": 0.6207, + "mean_token_accuracy": 0.8172942087054252, + "num_tokens": 15028762.0, + "step": 12510 + }, + { + "entropy": 1.8761356472969055, + "epoch": 0.03881090456221873, + "grad_norm": 11.275162696838379, + "learning_rate": 6.063644289450742e-06, + "loss": 0.615, + "mean_token_accuracy": 0.8089762479066849, + "num_tokens": 15040328.0, + "step": 12520 + }, + { + "entropy": 1.8349191531538964, + "epoch": 0.03884190368726843, + "grad_norm": 12.247875213623047, + "learning_rate": 6.0684878426813924e-06, + "loss": 0.628, + "mean_token_accuracy": 0.8030536040663719, + "num_tokens": 15052829.0, + "step": 12530 + }, + { + "entropy": 1.8827327758073806, + "epoch": 0.03887290281231812, + "grad_norm": 13.34996509552002, + "learning_rate": 6.073331395912041e-06, + "loss": 0.5946, + "mean_token_accuracy": 0.8162320896983146, + "num_tokens": 15064110.0, + "step": 12540 + }, + { + "entropy": 1.825656296312809, + "epoch": 0.03890390193736782, + "grad_norm": 9.6633939743042, + "learning_rate": 6.078174949142691e-06, + "loss": 0.5161, + "mean_token_accuracy": 0.8281683087348938, + "num_tokens": 15076335.0, + "step": 12550 + }, + { + "entropy": 1.8427977129817008, + "epoch": 0.038934901062417517, + "grad_norm": 7.553597927093506, + "learning_rate": 6.083018502373341e-06, + "loss": 0.6407, + "mean_token_accuracy": 0.8068282768130303, + "num_tokens": 15088344.0, + "step": 12560 + }, + { + "entropy": 1.7809947788715363, + "epoch": 0.038965900187467206, + "grad_norm": 5.229321002960205, + "learning_rate": 6.087862055603991e-06, + "loss": 0.5344, + "mean_token_accuracy": 0.8179076954722404, + "num_tokens": 15101575.0, + "step": 12570 + }, + { + "entropy": 1.9495843350887299, + "epoch": 0.0389968993125169, + "grad_norm": 10.395573616027832, + "learning_rate": 6.092705608834642e-06, + "loss": 0.6253, + "mean_token_accuracy": 0.8120292127132416, + "num_tokens": 15112551.0, + "step": 12580 + }, + { + "entropy": 1.8639352589845657, + "epoch": 0.0390278984375666, + "grad_norm": 14.738471984863281, + "learning_rate": 6.097549162065292e-06, + "loss": 0.6255, + "mean_token_accuracy": 0.8092112436890602, + "num_tokens": 15124473.0, + "step": 12590 + }, + { + "entropy": 1.8332767322659493, + "epoch": 0.039058897562616296, + "grad_norm": 6.293068885803223, + "learning_rate": 6.102392715295942e-06, + "loss": 0.6048, + "mean_token_accuracy": 0.8051409214735031, + "num_tokens": 15136268.0, + "step": 12600 + }, + { + "entropy": 1.8370594762265682, + "epoch": 0.03908989668766599, + "grad_norm": 11.538333892822266, + "learning_rate": 6.1072362685265915e-06, + "loss": 0.5955, + "mean_token_accuracy": 0.8151108309626579, + "num_tokens": 15147270.0, + "step": 12610 + }, + { + "entropy": 1.8287363216280936, + "epoch": 0.03912089581271569, + "grad_norm": 12.497879981994629, + "learning_rate": 6.112079821757242e-06, + "loss": 0.5737, + "mean_token_accuracy": 0.8114735826849937, + "num_tokens": 15159305.0, + "step": 12620 + }, + { + "entropy": 1.841526921093464, + "epoch": 0.03915189493776538, + "grad_norm": 15.22536563873291, + "learning_rate": 6.116923374987892e-06, + "loss": 0.6205, + "mean_token_accuracy": 0.8125358670949936, + "num_tokens": 15170956.0, + "step": 12630 + }, + { + "entropy": 1.8685049675405025, + "epoch": 0.039182894062815075, + "grad_norm": 10.67180347442627, + "learning_rate": 6.121766928218542e-06, + "loss": 0.6239, + "mean_token_accuracy": 0.8018307209014892, + "num_tokens": 15183129.0, + "step": 12640 + }, + { + "entropy": 1.8348088413476944, + "epoch": 0.03921389318786477, + "grad_norm": 6.374260902404785, + "learning_rate": 6.126610481449191e-06, + "loss": 0.5658, + "mean_token_accuracy": 0.8142863139510155, + "num_tokens": 15195411.0, + "step": 12650 + }, + { + "entropy": 1.9104970484972, + "epoch": 0.03924489231291447, + "grad_norm": 11.84320068359375, + "learning_rate": 6.131454034679841e-06, + "loss": 0.7097, + "mean_token_accuracy": 0.7972751423716545, + "num_tokens": 15206633.0, + "step": 12660 + }, + { + "entropy": 1.8191149190068245, + "epoch": 0.039275891437964165, + "grad_norm": 14.073987007141113, + "learning_rate": 6.136297587910492e-06, + "loss": 0.6266, + "mean_token_accuracy": 0.8092659756541252, + "num_tokens": 15218732.0, + "step": 12670 + }, + { + "entropy": 1.839244581758976, + "epoch": 0.03930689056301386, + "grad_norm": 9.864472389221191, + "learning_rate": 6.1411411411411415e-06, + "loss": 0.5842, + "mean_token_accuracy": 0.8092280417680741, + "num_tokens": 15231394.0, + "step": 12680 + }, + { + "entropy": 1.8042413115501403, + "epoch": 0.03933788968806355, + "grad_norm": 12.334712982177734, + "learning_rate": 6.145984694371791e-06, + "loss": 0.6022, + "mean_token_accuracy": 0.8171849220991134, + "num_tokens": 15242791.0, + "step": 12690 + }, + { + "entropy": 1.8627389699220658, + "epoch": 0.03936888881311325, + "grad_norm": 13.441896438598633, + "learning_rate": 6.150828247602441e-06, + "loss": 0.5941, + "mean_token_accuracy": 0.8153049975633622, + "num_tokens": 15254161.0, + "step": 12700 + }, + { + "entropy": 1.8331183463335037, + "epoch": 0.039399887938162945, + "grad_norm": 12.164008140563965, + "learning_rate": 6.155671800833092e-06, + "loss": 0.5919, + "mean_token_accuracy": 0.8250399813055992, + "num_tokens": 15265756.0, + "step": 12710 + }, + { + "entropy": 1.8537740007042884, + "epoch": 0.03943088706321264, + "grad_norm": 11.0955810546875, + "learning_rate": 6.160515354063742e-06, + "loss": 0.5697, + "mean_token_accuracy": 0.8286507219076157, + "num_tokens": 15277513.0, + "step": 12720 + }, + { + "entropy": 1.8841774478554725, + "epoch": 0.03946188618826234, + "grad_norm": 6.4962663650512695, + "learning_rate": 6.165358907294392e-06, + "loss": 0.6102, + "mean_token_accuracy": 0.8118945837020874, + "num_tokens": 15288556.0, + "step": 12730 + }, + { + "entropy": 1.8026639834046363, + "epoch": 0.039492885313312034, + "grad_norm": 12.76574420928955, + "learning_rate": 6.1702024605250425e-06, + "loss": 0.5569, + "mean_token_accuracy": 0.8168751522898674, + "num_tokens": 15301444.0, + "step": 12740 + }, + { + "entropy": 1.8253360256552695, + "epoch": 0.039523884438361724, + "grad_norm": 5.973870754241943, + "learning_rate": 6.175046013755691e-06, + "loss": 0.5451, + "mean_token_accuracy": 0.8278811991214752, + "num_tokens": 15313764.0, + "step": 12750 + }, + { + "entropy": 1.7969791740179062, + "epoch": 0.03955488356341142, + "grad_norm": 14.835697174072266, + "learning_rate": 6.1798895669863415e-06, + "loss": 0.5442, + "mean_token_accuracy": 0.8214199602603912, + "num_tokens": 15326494.0, + "step": 12760 + }, + { + "entropy": 1.8358357205986977, + "epoch": 0.03958588268846112, + "grad_norm": 6.574784278869629, + "learning_rate": 6.184733120216991e-06, + "loss": 0.6013, + "mean_token_accuracy": 0.8155452191829682, + "num_tokens": 15338454.0, + "step": 12770 + }, + { + "entropy": 1.7818957820534707, + "epoch": 0.039616881813510814, + "grad_norm": 5.757773399353027, + "learning_rate": 6.189576673447641e-06, + "loss": 0.5374, + "mean_token_accuracy": 0.8222944095730782, + "num_tokens": 15351199.0, + "step": 12780 + }, + { + "entropy": 1.7962487503886222, + "epoch": 0.03964788093856051, + "grad_norm": 11.691226959228516, + "learning_rate": 6.194420226678292e-06, + "loss": 0.5549, + "mean_token_accuracy": 0.8140451073646545, + "num_tokens": 15363661.0, + "step": 12790 + }, + { + "entropy": 1.869248776137829, + "epoch": 0.03967888006361021, + "grad_norm": 13.061457633972168, + "learning_rate": 6.199263779908942e-06, + "loss": 0.6036, + "mean_token_accuracy": 0.8111810341477395, + "num_tokens": 15375191.0, + "step": 12800 + }, + { + "entropy": 1.9534028589725494, + "epoch": 0.0397098791886599, + "grad_norm": 12.596964836120605, + "learning_rate": 6.204107333139592e-06, + "loss": 0.7028, + "mean_token_accuracy": 0.7962926715612412, + "num_tokens": 15386223.0, + "step": 12810 + }, + { + "entropy": 1.8273673072457313, + "epoch": 0.03974087831370959, + "grad_norm": 6.692715644836426, + "learning_rate": 6.208950886370242e-06, + "loss": 0.6008, + "mean_token_accuracy": 0.8225413948297501, + "num_tokens": 15398818.0, + "step": 12820 + }, + { + "entropy": 1.8166001297533512, + "epoch": 0.03977187743875929, + "grad_norm": 13.053438186645508, + "learning_rate": 6.213794439600892e-06, + "loss": 0.5571, + "mean_token_accuracy": 0.825442411005497, + "num_tokens": 15411042.0, + "step": 12830 + }, + { + "entropy": 1.8038362354040145, + "epoch": 0.03980287656380899, + "grad_norm": 14.547704696655273, + "learning_rate": 6.218637992831542e-06, + "loss": 0.5879, + "mean_token_accuracy": 0.8112800106406212, + "num_tokens": 15423517.0, + "step": 12840 + }, + { + "entropy": 1.8522241950035094, + "epoch": 0.03983387568885868, + "grad_norm": 13.58086109161377, + "learning_rate": 6.223481546062192e-06, + "loss": 0.6283, + "mean_token_accuracy": 0.8121964901685714, + "num_tokens": 15434996.0, + "step": 12850 + }, + { + "entropy": 1.8666557878255845, + "epoch": 0.03986487481390838, + "grad_norm": 12.256643295288086, + "learning_rate": 6.228325099292841e-06, + "loss": 0.6245, + "mean_token_accuracy": 0.7993756279349327, + "num_tokens": 15446746.0, + "step": 12860 + }, + { + "entropy": 1.8094900250434875, + "epoch": 0.039895873938958076, + "grad_norm": 12.334588050842285, + "learning_rate": 6.233168652523491e-06, + "loss": 0.5687, + "mean_token_accuracy": 0.8260209634900093, + "num_tokens": 15459054.0, + "step": 12870 + }, + { + "entropy": 1.8770345002412796, + "epoch": 0.039926873064007766, + "grad_norm": 12.718692779541016, + "learning_rate": 6.238012205754142e-06, + "loss": 0.6855, + "mean_token_accuracy": 0.7952335923910141, + "num_tokens": 15469948.0, + "step": 12880 + }, + { + "entropy": 1.8727562785148621, + "epoch": 0.03995787218905746, + "grad_norm": 9.794669151306152, + "learning_rate": 6.242855758984792e-06, + "loss": 0.6304, + "mean_token_accuracy": 0.816286937892437, + "num_tokens": 15482541.0, + "step": 12890 + }, + { + "entropy": 1.8690064042806624, + "epoch": 0.03998887131410716, + "grad_norm": 11.124152183532715, + "learning_rate": 6.2476993122154415e-06, + "loss": 0.589, + "mean_token_accuracy": 0.8171432822942734, + "num_tokens": 15494339.0, + "step": 12900 + }, + { + "entropy": 1.815145094692707, + "epoch": 0.040019870439156856, + "grad_norm": 11.725676536560059, + "learning_rate": 6.252542865446091e-06, + "loss": 0.5313, + "mean_token_accuracy": 0.822598272562027, + "num_tokens": 15506838.0, + "step": 12910 + }, + { + "entropy": 1.7810951352119446, + "epoch": 0.04005086956420655, + "grad_norm": 6.2188520431518555, + "learning_rate": 6.257386418676742e-06, + "loss": 0.5694, + "mean_token_accuracy": 0.8137658298015594, + "num_tokens": 15520324.0, + "step": 12920 + }, + { + "entropy": 1.8406692005693912, + "epoch": 0.04008186868925625, + "grad_norm": 10.676637649536133, + "learning_rate": 6.262229971907392e-06, + "loss": 0.5757, + "mean_token_accuracy": 0.8103611707687378, + "num_tokens": 15533133.0, + "step": 12930 + }, + { + "entropy": 1.9034065037965775, + "epoch": 0.04011286781430594, + "grad_norm": 12.021625518798828, + "learning_rate": 6.267073525138042e-06, + "loss": 0.6135, + "mean_token_accuracy": 0.8219456240534783, + "num_tokens": 15543784.0, + "step": 12940 + }, + { + "entropy": 1.859899564087391, + "epoch": 0.040143866939355635, + "grad_norm": 5.559539794921875, + "learning_rate": 6.271917078368693e-06, + "loss": 0.6117, + "mean_token_accuracy": 0.8060070484876632, + "num_tokens": 15555512.0, + "step": 12950 + }, + { + "entropy": 1.8671554252505302, + "epoch": 0.04017486606440533, + "grad_norm": 4.944991111755371, + "learning_rate": 6.276760631599341e-06, + "loss": 0.6074, + "mean_token_accuracy": 0.8140358075499534, + "num_tokens": 15567319.0, + "step": 12960 + }, + { + "entropy": 1.8478776171803475, + "epoch": 0.04020586518945503, + "grad_norm": 6.253074645996094, + "learning_rate": 6.2816041848299916e-06, + "loss": 0.6098, + "mean_token_accuracy": 0.8053146034479142, + "num_tokens": 15579682.0, + "step": 12970 + }, + { + "entropy": 1.9027994275093079, + "epoch": 0.040236864314504725, + "grad_norm": 11.593799591064453, + "learning_rate": 6.2864477380606414e-06, + "loss": 0.6247, + "mean_token_accuracy": 0.8165087446570396, + "num_tokens": 15590423.0, + "step": 12980 + }, + { + "entropy": 1.8880388498306275, + "epoch": 0.04026786343955442, + "grad_norm": 12.472247123718262, + "learning_rate": 6.291291291291291e-06, + "loss": 0.6243, + "mean_token_accuracy": 0.8147116348147392, + "num_tokens": 15601491.0, + "step": 12990 + }, + { + "entropy": 1.8754856497049333, + "epoch": 0.04029886256460411, + "grad_norm": 10.099672317504883, + "learning_rate": 6.296134844521942e-06, + "loss": 0.615, + "mean_token_accuracy": 0.8218295946717262, + "num_tokens": 15611664.0, + "step": 13000 + }, + { + "entropy": 1.8529302537441255, + "epoch": 0.04032986168965381, + "grad_norm": 11.330911636352539, + "learning_rate": 6.300978397752592e-06, + "loss": 0.6227, + "mean_token_accuracy": 0.8136036321520805, + "num_tokens": 15623729.0, + "step": 13010 + }, + { + "entropy": 1.8294827699661256, + "epoch": 0.040360860814703504, + "grad_norm": 12.999713897705078, + "learning_rate": 6.305821950983242e-06, + "loss": 0.6562, + "mean_token_accuracy": 0.8110322475433349, + "num_tokens": 15634983.0, + "step": 13020 + }, + { + "entropy": 1.8562739863991737, + "epoch": 0.0403918599397532, + "grad_norm": 9.199260711669922, + "learning_rate": 6.310665504213892e-06, + "loss": 0.5929, + "mean_token_accuracy": 0.8223294109106064, + "num_tokens": 15646086.0, + "step": 13030 + }, + { + "entropy": 1.8410490170121192, + "epoch": 0.0404228590648029, + "grad_norm": 12.867775917053223, + "learning_rate": 6.3155090574445424e-06, + "loss": 0.6028, + "mean_token_accuracy": 0.8097691163420677, + "num_tokens": 15658322.0, + "step": 13040 + }, + { + "entropy": 1.770823860168457, + "epoch": 0.040453858189852594, + "grad_norm": 5.460647106170654, + "learning_rate": 6.320352610675192e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8246093928813935, + "num_tokens": 15671766.0, + "step": 13050 + }, + { + "entropy": 1.856894339621067, + "epoch": 0.040484857314902284, + "grad_norm": 12.062054634094238, + "learning_rate": 6.325196163905842e-06, + "loss": 0.5404, + "mean_token_accuracy": 0.8271098077297211, + "num_tokens": 15683812.0, + "step": 13060 + }, + { + "entropy": 1.8751189470291139, + "epoch": 0.04051585643995198, + "grad_norm": 6.459671974182129, + "learning_rate": 6.330039717136491e-06, + "loss": 0.6067, + "mean_token_accuracy": 0.8083498194813729, + "num_tokens": 15695486.0, + "step": 13070 + }, + { + "entropy": 1.8417337238788605, + "epoch": 0.04054685556500168, + "grad_norm": 6.831528663635254, + "learning_rate": 6.334883270367141e-06, + "loss": 0.6252, + "mean_token_accuracy": 0.8124615296721458, + "num_tokens": 15707915.0, + "step": 13080 + }, + { + "entropy": 1.854557254910469, + "epoch": 0.040577854690051374, + "grad_norm": 10.786755561828613, + "learning_rate": 6.339726823597792e-06, + "loss": 0.5652, + "mean_token_accuracy": 0.8192732855677605, + "num_tokens": 15720535.0, + "step": 13090 + }, + { + "entropy": 1.9224106088280677, + "epoch": 0.04060885381510107, + "grad_norm": 12.00683879852295, + "learning_rate": 6.344570376828442e-06, + "loss": 0.698, + "mean_token_accuracy": 0.7983192473649978, + "num_tokens": 15732872.0, + "step": 13100 + }, + { + "entropy": 1.8551457852125168, + "epoch": 0.04063985294015077, + "grad_norm": 7.283522605895996, + "learning_rate": 6.349413930059092e-06, + "loss": 0.6393, + "mean_token_accuracy": 0.8065184399485588, + "num_tokens": 15745391.0, + "step": 13110 + }, + { + "entropy": 1.8762536928057671, + "epoch": 0.04067085206520046, + "grad_norm": 5.411208629608154, + "learning_rate": 6.3542574832897415e-06, + "loss": 0.5594, + "mean_token_accuracy": 0.8252108618617058, + "num_tokens": 15756896.0, + "step": 13120 + }, + { + "entropy": 1.9300520285964011, + "epoch": 0.04070185119025015, + "grad_norm": 11.322684288024902, + "learning_rate": 6.359101036520392e-06, + "loss": 0.6418, + "mean_token_accuracy": 0.8126945987343788, + "num_tokens": 15768386.0, + "step": 13130 + }, + { + "entropy": 1.7394558861851692, + "epoch": 0.04073285031529985, + "grad_norm": 3.4418604373931885, + "learning_rate": 6.363944589751042e-06, + "loss": 0.5333, + "mean_token_accuracy": 0.8297542706131935, + "num_tokens": 15782338.0, + "step": 13140 + }, + { + "entropy": 1.886605440080166, + "epoch": 0.040763849440349546, + "grad_norm": 10.883418083190918, + "learning_rate": 6.368788142981692e-06, + "loss": 0.5851, + "mean_token_accuracy": 0.8118449002504349, + "num_tokens": 15794126.0, + "step": 13150 + }, + { + "entropy": 1.833412842452526, + "epoch": 0.04079484856539924, + "grad_norm": 5.032846450805664, + "learning_rate": 6.373631696212343e-06, + "loss": 0.5319, + "mean_token_accuracy": 0.8236513167619706, + "num_tokens": 15806365.0, + "step": 13160 + }, + { + "entropy": 1.8451514735817909, + "epoch": 0.04082584769044894, + "grad_norm": 12.876490592956543, + "learning_rate": 6.378475249442991e-06, + "loss": 0.6205, + "mean_token_accuracy": 0.8131675496697426, + "num_tokens": 15817789.0, + "step": 13170 + }, + { + "entropy": 1.9239716470241546, + "epoch": 0.04085684681549863, + "grad_norm": 10.991585731506348, + "learning_rate": 6.383318802673642e-06, + "loss": 0.6288, + "mean_token_accuracy": 0.8010996967554093, + "num_tokens": 15828472.0, + "step": 13180 + }, + { + "entropy": 1.861264744400978, + "epoch": 0.040887845940548326, + "grad_norm": 12.282683372497559, + "learning_rate": 6.3881623559042915e-06, + "loss": 0.5954, + "mean_token_accuracy": 0.8243414610624313, + "num_tokens": 15840805.0, + "step": 13190 + }, + { + "entropy": 1.7732505962252616, + "epoch": 0.04091884506559802, + "grad_norm": 10.902586936950684, + "learning_rate": 6.3930059091349414e-06, + "loss": 0.5532, + "mean_token_accuracy": 0.8143455445766449, + "num_tokens": 15854669.0, + "step": 13200 + }, + { + "entropy": 1.8966855704784393, + "epoch": 0.04094984419064772, + "grad_norm": 15.284767150878906, + "learning_rate": 6.397849462365592e-06, + "loss": 0.6365, + "mean_token_accuracy": 0.8180679067969322, + "num_tokens": 15866135.0, + "step": 13210 + }, + { + "entropy": 1.8465805247426033, + "epoch": 0.040980843315697416, + "grad_norm": 10.624847412109375, + "learning_rate": 6.402693015596242e-06, + "loss": 0.5707, + "mean_token_accuracy": 0.8213504150509834, + "num_tokens": 15878224.0, + "step": 13220 + }, + { + "entropy": 1.9168522462248803, + "epoch": 0.04101184244074711, + "grad_norm": 10.14426326751709, + "learning_rate": 6.407536568826892e-06, + "loss": 0.6252, + "mean_token_accuracy": 0.8033525243401527, + "num_tokens": 15891104.0, + "step": 13230 + }, + { + "entropy": 1.902791763842106, + "epoch": 0.04104284156579681, + "grad_norm": 12.496599197387695, + "learning_rate": 6.412380122057542e-06, + "loss": 0.5903, + "mean_token_accuracy": 0.809710368514061, + "num_tokens": 15903392.0, + "step": 13240 + }, + { + "entropy": 1.834274485707283, + "epoch": 0.0410738406908465, + "grad_norm": 10.14233684539795, + "learning_rate": 6.4172236752881925e-06, + "loss": 0.5469, + "mean_token_accuracy": 0.8176365941762924, + "num_tokens": 15916354.0, + "step": 13250 + }, + { + "entropy": 1.7371825829148293, + "epoch": 0.041104839815896195, + "grad_norm": 4.078171730041504, + "learning_rate": 6.4220672285188424e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8276883006095886, + "num_tokens": 15929845.0, + "step": 13260 + }, + { + "entropy": 1.906320759654045, + "epoch": 0.04113583894094589, + "grad_norm": 13.177295684814453, + "learning_rate": 6.426910781749492e-06, + "loss": 0.6577, + "mean_token_accuracy": 0.803479115664959, + "num_tokens": 15941475.0, + "step": 13270 + }, + { + "entropy": 1.86391938328743, + "epoch": 0.04116683806599559, + "grad_norm": 12.480244636535645, + "learning_rate": 6.431754334980141e-06, + "loss": 0.6242, + "mean_token_accuracy": 0.8072491884231567, + "num_tokens": 15953798.0, + "step": 13280 + }, + { + "entropy": 1.8636515244841576, + "epoch": 0.041197837191045285, + "grad_norm": 11.828653335571289, + "learning_rate": 6.436597888210791e-06, + "loss": 0.6048, + "mean_token_accuracy": 0.8113080978393554, + "num_tokens": 15965803.0, + "step": 13290 + }, + { + "entropy": 1.8319952994585038, + "epoch": 0.04122883631609498, + "grad_norm": 10.738685607910156, + "learning_rate": 6.441441441441442e-06, + "loss": 0.6163, + "mean_token_accuracy": 0.8101550653576851, + "num_tokens": 15978557.0, + "step": 13300 + }, + { + "entropy": 1.8741496190428735, + "epoch": 0.04125983544114467, + "grad_norm": 11.56949234008789, + "learning_rate": 6.446284994672092e-06, + "loss": 0.5879, + "mean_token_accuracy": 0.811277537047863, + "num_tokens": 15990542.0, + "step": 13310 + }, + { + "entropy": 1.9263570591807366, + "epoch": 0.04129083456619437, + "grad_norm": 5.565122127532959, + "learning_rate": 6.451128547902742e-06, + "loss": 0.6913, + "mean_token_accuracy": 0.7948109433054924, + "num_tokens": 16001976.0, + "step": 13320 + }, + { + "entropy": 1.9291939318180085, + "epoch": 0.041321833691244064, + "grad_norm": 11.844070434570312, + "learning_rate": 6.455972101133392e-06, + "loss": 0.6205, + "mean_token_accuracy": 0.8096133157610893, + "num_tokens": 16012910.0, + "step": 13330 + }, + { + "entropy": 1.8042861357331277, + "epoch": 0.04135283281629376, + "grad_norm": 10.28139877319336, + "learning_rate": 6.460815654364042e-06, + "loss": 0.5329, + "mean_token_accuracy": 0.825696873664856, + "num_tokens": 16025765.0, + "step": 13340 + }, + { + "entropy": 1.9436326138675213, + "epoch": 0.04138383194134346, + "grad_norm": 12.31657886505127, + "learning_rate": 6.465659207594692e-06, + "loss": 0.6457, + "mean_token_accuracy": 0.7972278758883476, + "num_tokens": 16037425.0, + "step": 13350 + }, + { + "entropy": 1.8341301783919335, + "epoch": 0.041414831066393154, + "grad_norm": 11.382768630981445, + "learning_rate": 6.470502760825342e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8359068840742111, + "num_tokens": 16050108.0, + "step": 13360 + }, + { + "entropy": 1.9616862878203392, + "epoch": 0.041445830191442844, + "grad_norm": 11.222855567932129, + "learning_rate": 6.475346314055993e-06, + "loss": 0.6498, + "mean_token_accuracy": 0.8021759241819382, + "num_tokens": 16061548.0, + "step": 13370 + }, + { + "entropy": 1.9346465274691582, + "epoch": 0.04147682931649254, + "grad_norm": 10.253416061401367, + "learning_rate": 6.480189867286641e-06, + "loss": 0.6582, + "mean_token_accuracy": 0.8080699786543846, + "num_tokens": 16072805.0, + "step": 13380 + }, + { + "entropy": 1.8889733031392097, + "epoch": 0.04150782844154224, + "grad_norm": 10.119233131408691, + "learning_rate": 6.485033420517292e-06, + "loss": 0.6144, + "mean_token_accuracy": 0.8134203433990479, + "num_tokens": 16084811.0, + "step": 13390 + }, + { + "entropy": 1.935821218788624, + "epoch": 0.041538827566591933, + "grad_norm": 11.066147804260254, + "learning_rate": 6.489876973747942e-06, + "loss": 0.6034, + "mean_token_accuracy": 0.8229247480630875, + "num_tokens": 16095975.0, + "step": 13400 + }, + { + "entropy": 1.8281036272644997, + "epoch": 0.04156982669164163, + "grad_norm": 12.909806251525879, + "learning_rate": 6.4947205269785915e-06, + "loss": 0.5791, + "mean_token_accuracy": 0.8127323508262634, + "num_tokens": 16108978.0, + "step": 13410 + }, + { + "entropy": 1.915538875758648, + "epoch": 0.04160082581669133, + "grad_norm": 11.429049491882324, + "learning_rate": 6.499564080209242e-06, + "loss": 0.6331, + "mean_token_accuracy": 0.8143791824579238, + "num_tokens": 16121160.0, + "step": 13420 + }, + { + "entropy": 1.8657421082258225, + "epoch": 0.041631824941741016, + "grad_norm": 5.312261581420898, + "learning_rate": 6.504407633439892e-06, + "loss": 0.5418, + "mean_token_accuracy": 0.8279088050127029, + "num_tokens": 16133755.0, + "step": 13430 + }, + { + "entropy": 1.9448801666498183, + "epoch": 0.04166282406679071, + "grad_norm": 11.335988998413086, + "learning_rate": 6.509251186670542e-06, + "loss": 0.6206, + "mean_token_accuracy": 0.8046508118510246, + "num_tokens": 16145359.0, + "step": 13440 + }, + { + "entropy": 1.837200105190277, + "epoch": 0.04169382319184041, + "grad_norm": 8.97873592376709, + "learning_rate": 6.514094739901192e-06, + "loss": 0.577, + "mean_token_accuracy": 0.823949719965458, + "num_tokens": 16157112.0, + "step": 13450 + }, + { + "entropy": 1.8169792860746383, + "epoch": 0.041724822316890106, + "grad_norm": 13.009424209594727, + "learning_rate": 6.518938293131843e-06, + "loss": 0.5284, + "mean_token_accuracy": 0.8228383541107178, + "num_tokens": 16169936.0, + "step": 13460 + }, + { + "entropy": 1.7715341180562973, + "epoch": 0.0417558214419398, + "grad_norm": 4.483689308166504, + "learning_rate": 6.5237818463624925e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8237165853381156, + "num_tokens": 16184134.0, + "step": 13470 + }, + { + "entropy": 1.9055643543601035, + "epoch": 0.0417868205669895, + "grad_norm": 11.176976203918457, + "learning_rate": 6.528625399593142e-06, + "loss": 0.616, + "mean_token_accuracy": 0.807232391834259, + "num_tokens": 16195929.0, + "step": 13480 + }, + { + "entropy": 1.7436404943466186, + "epoch": 0.04181781969203919, + "grad_norm": 11.869129180908203, + "learning_rate": 6.5334689528237915e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8363062143325806, + "num_tokens": 16210306.0, + "step": 13490 + }, + { + "entropy": 1.8251071318984031, + "epoch": 0.041848818817088886, + "grad_norm": 6.912018775939941, + "learning_rate": 6.538312506054441e-06, + "loss": 0.5753, + "mean_token_accuracy": 0.8134068369865417, + "num_tokens": 16222376.0, + "step": 13500 + }, + { + "entropy": 1.8436161801218987, + "epoch": 0.04187981794213858, + "grad_norm": 10.873340606689453, + "learning_rate": 6.543156059285092e-06, + "loss": 0.5726, + "mean_token_accuracy": 0.8138954237103462, + "num_tokens": 16235581.0, + "step": 13510 + }, + { + "entropy": 1.9201088815927505, + "epoch": 0.04191081706718828, + "grad_norm": 12.203166007995605, + "learning_rate": 6.547999612515742e-06, + "loss": 0.6399, + "mean_token_accuracy": 0.809160690009594, + "num_tokens": 16246598.0, + "step": 13520 + }, + { + "entropy": 1.9009541541337966, + "epoch": 0.041941816192237975, + "grad_norm": 12.666475296020508, + "learning_rate": 6.552843165746392e-06, + "loss": 0.6352, + "mean_token_accuracy": 0.7980454340577126, + "num_tokens": 16258072.0, + "step": 13530 + }, + { + "entropy": 1.8785468250513078, + "epoch": 0.04197281531728767, + "grad_norm": 10.72907829284668, + "learning_rate": 6.557686718977042e-06, + "loss": 0.6576, + "mean_token_accuracy": 0.8070585578680038, + "num_tokens": 16269003.0, + "step": 13540 + }, + { + "entropy": 1.7919475421309472, + "epoch": 0.04200381444233736, + "grad_norm": 12.568443298339844, + "learning_rate": 6.5625302722076924e-06, + "loss": 0.5273, + "mean_token_accuracy": 0.8193079605698586, + "num_tokens": 16281900.0, + "step": 13550 + }, + { + "entropy": 1.865149575471878, + "epoch": 0.04203481356738706, + "grad_norm": 11.324207305908203, + "learning_rate": 6.567373825438342e-06, + "loss": 0.6203, + "mean_token_accuracy": 0.810130500793457, + "num_tokens": 16293834.0, + "step": 13560 + }, + { + "entropy": 1.9343768432736397, + "epoch": 0.042065812692436755, + "grad_norm": 13.249279975891113, + "learning_rate": 6.572217378668992e-06, + "loss": 0.6251, + "mean_token_accuracy": 0.8070745259523392, + "num_tokens": 16305965.0, + "step": 13570 + }, + { + "entropy": 1.916192325949669, + "epoch": 0.04209681181748645, + "grad_norm": 12.691089630126953, + "learning_rate": 6.577060931899643e-06, + "loss": 0.5989, + "mean_token_accuracy": 0.8105158194899559, + "num_tokens": 16317096.0, + "step": 13580 + }, + { + "entropy": 1.888793683052063, + "epoch": 0.04212781094253615, + "grad_norm": 9.996535301208496, + "learning_rate": 6.581904485130292e-06, + "loss": 0.6723, + "mean_token_accuracy": 0.8087509065866471, + "num_tokens": 16329359.0, + "step": 13590 + }, + { + "entropy": 1.8468269050121306, + "epoch": 0.042158810067585845, + "grad_norm": 10.06303596496582, + "learning_rate": 6.586748038360942e-06, + "loss": 0.5719, + "mean_token_accuracy": 0.8199773550033569, + "num_tokens": 16341773.0, + "step": 13600 + }, + { + "entropy": 1.8991917297244072, + "epoch": 0.04218980919263554, + "grad_norm": 3.874143362045288, + "learning_rate": 6.591591591591592e-06, + "loss": 0.6202, + "mean_token_accuracy": 0.8165913313627243, + "num_tokens": 16353377.0, + "step": 13610 + }, + { + "entropy": 1.9188966274261474, + "epoch": 0.04222080831768523, + "grad_norm": 11.466548919677734, + "learning_rate": 6.596435144822242e-06, + "loss": 0.6185, + "mean_token_accuracy": 0.8085856437683105, + "num_tokens": 16365134.0, + "step": 13620 + }, + { + "entropy": 1.9046609073877334, + "epoch": 0.04225180744273493, + "grad_norm": 10.0979585647583, + "learning_rate": 6.601278698052892e-06, + "loss": 0.5963, + "mean_token_accuracy": 0.8179086208343506, + "num_tokens": 16376299.0, + "step": 13630 + }, + { + "entropy": 1.828147941827774, + "epoch": 0.042282806567784624, + "grad_norm": 10.194396018981934, + "learning_rate": 6.606122251283542e-06, + "loss": 0.5521, + "mean_token_accuracy": 0.8191608220338822, + "num_tokens": 16388768.0, + "step": 13640 + }, + { + "entropy": 1.8713184520602226, + "epoch": 0.04231380569283432, + "grad_norm": 12.538405418395996, + "learning_rate": 6.610965804514192e-06, + "loss": 0.6416, + "mean_token_accuracy": 0.8056328803300857, + "num_tokens": 16400797.0, + "step": 13650 + }, + { + "entropy": 1.8713348254561424, + "epoch": 0.04234480481788402, + "grad_norm": 12.18179988861084, + "learning_rate": 6.615809357744842e-06, + "loss": 0.5399, + "mean_token_accuracy": 0.8290207415819169, + "num_tokens": 16412316.0, + "step": 13660 + }, + { + "entropy": 1.7908138126134872, + "epoch": 0.042375803942933714, + "grad_norm": 11.162075996398926, + "learning_rate": 6.620652910975493e-06, + "loss": 0.548, + "mean_token_accuracy": 0.8253503978252411, + "num_tokens": 16426142.0, + "step": 13670 + }, + { + "entropy": 1.8941476494073868, + "epoch": 0.042406803067983403, + "grad_norm": 15.437644958496094, + "learning_rate": 6.625496464206143e-06, + "loss": 0.6019, + "mean_token_accuracy": 0.8184331357479095, + "num_tokens": 16438321.0, + "step": 13680 + }, + { + "entropy": 1.9231675088405609, + "epoch": 0.0424378021930331, + "grad_norm": 12.137004852294922, + "learning_rate": 6.6303400174367925e-06, + "loss": 0.6361, + "mean_token_accuracy": 0.814939396083355, + "num_tokens": 16449946.0, + "step": 13690 + }, + { + "entropy": 1.8700689136981965, + "epoch": 0.0424688013180828, + "grad_norm": 10.420080184936523, + "learning_rate": 6.6351835706674416e-06, + "loss": 0.5667, + "mean_token_accuracy": 0.8213317602872848, + "num_tokens": 16461823.0, + "step": 13700 + }, + { + "entropy": 1.9616285428404807, + "epoch": 0.04249980044313249, + "grad_norm": 11.999855995178223, + "learning_rate": 6.6400271238980914e-06, + "loss": 0.6274, + "mean_token_accuracy": 0.8095705136656761, + "num_tokens": 16473326.0, + "step": 13710 + }, + { + "entropy": 1.8622044518589973, + "epoch": 0.04253079956818219, + "grad_norm": 6.192050457000732, + "learning_rate": 6.644870677128742e-06, + "loss": 0.5515, + "mean_token_accuracy": 0.8142581716179847, + "num_tokens": 16486260.0, + "step": 13720 + }, + { + "entropy": 1.9137049853801726, + "epoch": 0.042561798693231886, + "grad_norm": 10.177970886230469, + "learning_rate": 6.649714230359392e-06, + "loss": 0.6307, + "mean_token_accuracy": 0.8191399827599526, + "num_tokens": 16497734.0, + "step": 13730 + }, + { + "entropy": 1.8745167449116706, + "epoch": 0.042592797818281576, + "grad_norm": 10.241938591003418, + "learning_rate": 6.654557783590042e-06, + "loss": 0.5971, + "mean_token_accuracy": 0.8165997639298439, + "num_tokens": 16509288.0, + "step": 13740 + }, + { + "entropy": 1.93837161809206, + "epoch": 0.04262379694333127, + "grad_norm": 12.580158233642578, + "learning_rate": 6.659401336820692e-06, + "loss": 0.6656, + "mean_token_accuracy": 0.7997888430953026, + "num_tokens": 16520584.0, + "step": 13750 + }, + { + "entropy": 1.9007338181138038, + "epoch": 0.04265479606838097, + "grad_norm": 10.395405769348145, + "learning_rate": 6.6642448900513426e-06, + "loss": 0.6244, + "mean_token_accuracy": 0.8126766815781593, + "num_tokens": 16532328.0, + "step": 13760 + }, + { + "entropy": 1.9072567522525787, + "epoch": 0.042685795193430666, + "grad_norm": 12.813956260681152, + "learning_rate": 6.6690884432819924e-06, + "loss": 0.6346, + "mean_token_accuracy": 0.81207554936409, + "num_tokens": 16543530.0, + "step": 13770 + }, + { + "entropy": 1.9019085496664048, + "epoch": 0.04271679431848036, + "grad_norm": 12.529980659484863, + "learning_rate": 6.673931996512642e-06, + "loss": 0.66, + "mean_token_accuracy": 0.8073429599404335, + "num_tokens": 16554610.0, + "step": 13780 + }, + { + "entropy": 1.915639691054821, + "epoch": 0.04274779344353006, + "grad_norm": 6.579958438873291, + "learning_rate": 6.678775549743293e-06, + "loss": 0.6188, + "mean_token_accuracy": 0.8055771961808205, + "num_tokens": 16566534.0, + "step": 13790 + }, + { + "entropy": 1.897673524916172, + "epoch": 0.04277879256857975, + "grad_norm": 13.530618667602539, + "learning_rate": 6.683619102973943e-06, + "loss": 0.5869, + "mean_token_accuracy": 0.821336168050766, + "num_tokens": 16577725.0, + "step": 13800 + }, + { + "entropy": 1.8759017661213875, + "epoch": 0.042809791693629445, + "grad_norm": 10.226716995239258, + "learning_rate": 6.688462656204592e-06, + "loss": 0.5829, + "mean_token_accuracy": 0.823052391409874, + "num_tokens": 16589628.0, + "step": 13810 + }, + { + "entropy": 1.8439259082078934, + "epoch": 0.04284079081867914, + "grad_norm": 10.295611381530762, + "learning_rate": 6.693306209435242e-06, + "loss": 0.581, + "mean_token_accuracy": 0.8230698376893997, + "num_tokens": 16601483.0, + "step": 13820 + }, + { + "entropy": 1.9123721539974212, + "epoch": 0.04287178994372884, + "grad_norm": 4.241229057312012, + "learning_rate": 6.698149762665892e-06, + "loss": 0.5893, + "mean_token_accuracy": 0.8144331857562065, + "num_tokens": 16613580.0, + "step": 13830 + }, + { + "entropy": 1.8960613742470742, + "epoch": 0.042902789068778535, + "grad_norm": 6.314639091491699, + "learning_rate": 6.7029933158965425e-06, + "loss": 0.591, + "mean_token_accuracy": 0.8104770466685295, + "num_tokens": 16626132.0, + "step": 13840 + }, + { + "entropy": 1.9086899921298026, + "epoch": 0.04293378819382823, + "grad_norm": 5.750073432922363, + "learning_rate": 6.707836869127192e-06, + "loss": 0.6769, + "mean_token_accuracy": 0.8074093982577324, + "num_tokens": 16637715.0, + "step": 13850 + }, + { + "entropy": 1.9291119009256363, + "epoch": 0.04296478731887792, + "grad_norm": 10.997711181640625, + "learning_rate": 6.712680422357842e-06, + "loss": 0.64, + "mean_token_accuracy": 0.8164422243833542, + "num_tokens": 16648790.0, + "step": 13860 + }, + { + "entropy": 1.7718395471572876, + "epoch": 0.04299578644392762, + "grad_norm": 5.029774188995361, + "learning_rate": 6.717523975588492e-06, + "loss": 0.5216, + "mean_token_accuracy": 0.824312150478363, + "num_tokens": 16661991.0, + "step": 13870 + }, + { + "entropy": 1.897067406773567, + "epoch": 0.043026785568977315, + "grad_norm": 13.85363483428955, + "learning_rate": 6.722367528819143e-06, + "loss": 0.6711, + "mean_token_accuracy": 0.8090870007872581, + "num_tokens": 16672565.0, + "step": 13880 + }, + { + "entropy": 1.8265616819262505, + "epoch": 0.04305778469402701, + "grad_norm": 13.524333000183105, + "learning_rate": 6.727211082049793e-06, + "loss": 0.6015, + "mean_token_accuracy": 0.8150227144360542, + "num_tokens": 16685456.0, + "step": 13890 + }, + { + "entropy": 1.8813450008630752, + "epoch": 0.04308878381907671, + "grad_norm": 12.762773513793945, + "learning_rate": 6.732054635280443e-06, + "loss": 0.5886, + "mean_token_accuracy": 0.8163325443863869, + "num_tokens": 16697034.0, + "step": 13900 + }, + { + "entropy": 1.8345113858580588, + "epoch": 0.043119782944126404, + "grad_norm": 12.19513988494873, + "learning_rate": 6.736898188511092e-06, + "loss": 0.5777, + "mean_token_accuracy": 0.8157405987381935, + "num_tokens": 16708871.0, + "step": 13910 + }, + { + "entropy": 1.8650273010134697, + "epoch": 0.043150782069176094, + "grad_norm": 12.387640953063965, + "learning_rate": 6.7417417417417415e-06, + "loss": 0.6953, + "mean_token_accuracy": 0.8010308906435967, + "num_tokens": 16720631.0, + "step": 13920 + }, + { + "entropy": 1.8238304048776626, + "epoch": 0.04318178119422579, + "grad_norm": 10.031027793884277, + "learning_rate": 6.746585294972392e-06, + "loss": 0.5536, + "mean_token_accuracy": 0.8242588087916374, + "num_tokens": 16733234.0, + "step": 13930 + }, + { + "entropy": 1.8230559036135674, + "epoch": 0.04321278031927549, + "grad_norm": 12.307608604431152, + "learning_rate": 6.751428848203042e-06, + "loss": 0.5682, + "mean_token_accuracy": 0.8173372864723205, + "num_tokens": 16745599.0, + "step": 13940 + }, + { + "entropy": 1.9341980874538423, + "epoch": 0.043243779444325184, + "grad_norm": 12.861936569213867, + "learning_rate": 6.756272401433692e-06, + "loss": 0.6583, + "mean_token_accuracy": 0.8072870954871177, + "num_tokens": 16756209.0, + "step": 13950 + }, + { + "entropy": 1.8573587149381638, + "epoch": 0.04327477856937488, + "grad_norm": 11.555964469909668, + "learning_rate": 6.761115954664342e-06, + "loss": 0.5883, + "mean_token_accuracy": 0.8267058104276657, + "num_tokens": 16767433.0, + "step": 13960 + }, + { + "entropy": 1.8955969214439392, + "epoch": 0.04330577769442458, + "grad_norm": 11.410296440124512, + "learning_rate": 6.765959507894993e-06, + "loss": 0.5956, + "mean_token_accuracy": 0.8207617849111557, + "num_tokens": 16778097.0, + "step": 13970 + }, + { + "entropy": 1.876904509961605, + "epoch": 0.043336776819474274, + "grad_norm": 10.337623596191406, + "learning_rate": 6.7708030611256425e-06, + "loss": 0.636, + "mean_token_accuracy": 0.8040330380201339, + "num_tokens": 16789800.0, + "step": 13980 + }, + { + "entropy": 1.9401722326874733, + "epoch": 0.04336777594452396, + "grad_norm": 11.589249610900879, + "learning_rate": 6.7756466143562924e-06, + "loss": 0.6594, + "mean_token_accuracy": 0.8035330668091774, + "num_tokens": 16801605.0, + "step": 13990 + }, + { + "entropy": 1.890137755870819, + "epoch": 0.04339877506957366, + "grad_norm": 10.558548927307129, + "learning_rate": 6.780490167586943e-06, + "loss": 0.6248, + "mean_token_accuracy": 0.8102419808506965, + "num_tokens": 16813740.0, + "step": 14000 + }, + { + "entropy": 1.888036273419857, + "epoch": 0.043429774194623356, + "grad_norm": 6.69431734085083, + "learning_rate": 6.785333720817593e-06, + "loss": 0.6036, + "mean_token_accuracy": 0.81003537774086, + "num_tokens": 16826251.0, + "step": 14010 + }, + { + "entropy": 1.949825246632099, + "epoch": 0.04346077331967305, + "grad_norm": 11.190848350524902, + "learning_rate": 6.790177274048242e-06, + "loss": 0.6626, + "mean_token_accuracy": 0.8077096566557884, + "num_tokens": 16837543.0, + "step": 14020 + }, + { + "entropy": 1.8271329566836356, + "epoch": 0.04349177244472275, + "grad_norm": 5.520564556121826, + "learning_rate": 6.795020827278892e-06, + "loss": 0.5847, + "mean_token_accuracy": 0.8192524507641792, + "num_tokens": 16850237.0, + "step": 14030 + }, + { + "entropy": 1.8742077186703683, + "epoch": 0.043522771569772446, + "grad_norm": 12.861546516418457, + "learning_rate": 6.799864380509542e-06, + "loss": 0.5947, + "mean_token_accuracy": 0.8112540423870087, + "num_tokens": 16861883.0, + "step": 14040 + }, + { + "entropy": 1.813956792652607, + "epoch": 0.043553770694822136, + "grad_norm": 13.800911903381348, + "learning_rate": 6.8047079337401926e-06, + "loss": 0.5804, + "mean_token_accuracy": 0.8147607937455177, + "num_tokens": 16874756.0, + "step": 14050 + }, + { + "entropy": 1.9326902404427528, + "epoch": 0.04358476981987183, + "grad_norm": 11.105195045471191, + "learning_rate": 6.8095514869708425e-06, + "loss": 0.6847, + "mean_token_accuracy": 0.793427674472332, + "num_tokens": 16885758.0, + "step": 14060 + }, + { + "entropy": 1.9657369270920753, + "epoch": 0.04361576894492153, + "grad_norm": 11.325508117675781, + "learning_rate": 6.814395040201492e-06, + "loss": 0.6517, + "mean_token_accuracy": 0.7974928095936775, + "num_tokens": 16896989.0, + "step": 14070 + }, + { + "entropy": 1.7869946122169496, + "epoch": 0.043646768069971226, + "grad_norm": 10.723480224609375, + "learning_rate": 6.819238593432142e-06, + "loss": 0.5484, + "mean_token_accuracy": 0.8241583168506622, + "num_tokens": 16910317.0, + "step": 14080 + }, + { + "entropy": 1.886980764567852, + "epoch": 0.04367776719502092, + "grad_norm": 10.611824989318848, + "learning_rate": 6.824082146662793e-06, + "loss": 0.5972, + "mean_token_accuracy": 0.821302755177021, + "num_tokens": 16921546.0, + "step": 14090 + }, + { + "entropy": 1.795514563471079, + "epoch": 0.04370876632007062, + "grad_norm": 3.9574859142303467, + "learning_rate": 6.828925699893443e-06, + "loss": 0.5667, + "mean_token_accuracy": 0.8096185430884362, + "num_tokens": 16934810.0, + "step": 14100 + }, + { + "entropy": 1.8915703102946282, + "epoch": 0.04373976544512031, + "grad_norm": 12.432307243347168, + "learning_rate": 6.833769253124093e-06, + "loss": 0.6694, + "mean_token_accuracy": 0.7994834899902343, + "num_tokens": 16946379.0, + "step": 14110 + }, + { + "entropy": 1.872760045528412, + "epoch": 0.043770764570170005, + "grad_norm": 12.616002082824707, + "learning_rate": 6.838612806354742e-06, + "loss": 0.6253, + "mean_token_accuracy": 0.8020053207874298, + "num_tokens": 16958537.0, + "step": 14120 + }, + { + "entropy": 1.8190848156809807, + "epoch": 0.0438017636952197, + "grad_norm": 6.749497890472412, + "learning_rate": 6.843456359585392e-06, + "loss": 0.566, + "mean_token_accuracy": 0.8224000081419944, + "num_tokens": 16971281.0, + "step": 14130 + }, + { + "entropy": 1.7950861573219299, + "epoch": 0.0438327628202694, + "grad_norm": 5.5427021980285645, + "learning_rate": 6.848299912816042e-06, + "loss": 0.523, + "mean_token_accuracy": 0.821921581029892, + "num_tokens": 16983378.0, + "step": 14140 + }, + { + "entropy": 1.7081642150878906, + "epoch": 0.043863761945319095, + "grad_norm": 14.56338882446289, + "learning_rate": 6.853143466046692e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8312652423977852, + "num_tokens": 16996461.0, + "step": 14150 + }, + { + "entropy": 1.7555221557617187, + "epoch": 0.04389476107036879, + "grad_norm": 5.769707679748535, + "learning_rate": 6.857987019277342e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.81901466101408, + "num_tokens": 17009223.0, + "step": 14160 + }, + { + "entropy": 1.8228839874267577, + "epoch": 0.04392576019541848, + "grad_norm": 14.009598731994629, + "learning_rate": 6.862830572507992e-06, + "loss": 0.5565, + "mean_token_accuracy": 0.8186228111386299, + "num_tokens": 17021369.0, + "step": 14170 + }, + { + "entropy": 1.8004897370934487, + "epoch": 0.04395675932046818, + "grad_norm": 13.954216957092285, + "learning_rate": 6.867674125738643e-06, + "loss": 0.6168, + "mean_token_accuracy": 0.8179115906357766, + "num_tokens": 17033773.0, + "step": 14180 + }, + { + "entropy": 1.8727715358138084, + "epoch": 0.043987758445517874, + "grad_norm": 5.836693286895752, + "learning_rate": 6.872517678969293e-06, + "loss": 0.63, + "mean_token_accuracy": 0.8152304857969284, + "num_tokens": 17045247.0, + "step": 14190 + }, + { + "entropy": 1.9000848352909088, + "epoch": 0.04401875757056757, + "grad_norm": 13.797229766845703, + "learning_rate": 6.8773612321999425e-06, + "loss": 0.6389, + "mean_token_accuracy": 0.8070854544639587, + "num_tokens": 17056322.0, + "step": 14200 + }, + { + "entropy": 1.8419507443904877, + "epoch": 0.04404975669561727, + "grad_norm": 10.07559585571289, + "learning_rate": 6.882204785430593e-06, + "loss": 0.607, + "mean_token_accuracy": 0.8120106473565102, + "num_tokens": 17068663.0, + "step": 14210 + }, + { + "entropy": 1.8738445043563843, + "epoch": 0.044080755820666964, + "grad_norm": 12.083559036254883, + "learning_rate": 6.887048338661243e-06, + "loss": 0.597, + "mean_token_accuracy": 0.8096369743347168, + "num_tokens": 17080468.0, + "step": 14220 + }, + { + "entropy": 1.8636516377329826, + "epoch": 0.044111754945716654, + "grad_norm": 10.079278945922852, + "learning_rate": 6.891891891891892e-06, + "loss": 0.6145, + "mean_token_accuracy": 0.8174061790108681, + "num_tokens": 17091833.0, + "step": 14230 + }, + { + "entropy": 1.8882641837000846, + "epoch": 0.04414275407076635, + "grad_norm": 13.434144973754883, + "learning_rate": 6.896735445122542e-06, + "loss": 0.6377, + "mean_token_accuracy": 0.8117697656154632, + "num_tokens": 17102951.0, + "step": 14240 + }, + { + "entropy": 1.9334753528237343, + "epoch": 0.04417375319581605, + "grad_norm": 9.094237327575684, + "learning_rate": 6.901578998353192e-06, + "loss": 0.6594, + "mean_token_accuracy": 0.8049651876091957, + "num_tokens": 17114582.0, + "step": 14250 + }, + { + "entropy": 1.8502332031726838, + "epoch": 0.044204752320865744, + "grad_norm": 9.20546817779541, + "learning_rate": 6.906422551583843e-06, + "loss": 0.6099, + "mean_token_accuracy": 0.8162536874413491, + "num_tokens": 17127285.0, + "step": 14260 + }, + { + "entropy": 1.8943143799901008, + "epoch": 0.04423575144591544, + "grad_norm": 10.137446403503418, + "learning_rate": 6.9112661048144926e-06, + "loss": 0.6203, + "mean_token_accuracy": 0.8087219893932343, + "num_tokens": 17139089.0, + "step": 14270 + }, + { + "entropy": 1.887183803319931, + "epoch": 0.04426675057096514, + "grad_norm": 10.977912902832031, + "learning_rate": 6.9161096580451424e-06, + "loss": 0.6091, + "mean_token_accuracy": 0.8069622635841369, + "num_tokens": 17151186.0, + "step": 14280 + }, + { + "entropy": 1.8306689888238907, + "epoch": 0.04429774969601483, + "grad_norm": 11.67708969116211, + "learning_rate": 6.920953211275792e-06, + "loss": 0.567, + "mean_token_accuracy": 0.8185241803526878, + "num_tokens": 17162472.0, + "step": 14290 + }, + { + "entropy": 1.9653134107589723, + "epoch": 0.04432874882106452, + "grad_norm": 12.62784481048584, + "learning_rate": 6.925796764506443e-06, + "loss": 0.7129, + "mean_token_accuracy": 0.7954209297895432, + "num_tokens": 17173491.0, + "step": 14300 + }, + { + "entropy": 1.8972666263580322, + "epoch": 0.04435974794611422, + "grad_norm": 10.280572891235352, + "learning_rate": 6.930640317737093e-06, + "loss": 0.6188, + "mean_token_accuracy": 0.8118947297334671, + "num_tokens": 17184219.0, + "step": 14310 + }, + { + "entropy": 1.8529438108205796, + "epoch": 0.044390747071163916, + "grad_norm": 13.168403625488281, + "learning_rate": 6.935483870967743e-06, + "loss": 0.5719, + "mean_token_accuracy": 0.8159810289740562, + "num_tokens": 17195740.0, + "step": 14320 + }, + { + "entropy": 1.8751512482762336, + "epoch": 0.04442174619621361, + "grad_norm": 5.939009666442871, + "learning_rate": 6.940327424198392e-06, + "loss": 0.6072, + "mean_token_accuracy": 0.8138178676366806, + "num_tokens": 17207488.0, + "step": 14330 + }, + { + "entropy": 1.9205424144864083, + "epoch": 0.04445274532126331, + "grad_norm": 9.95887565612793, + "learning_rate": 6.945170977429042e-06, + "loss": 0.6255, + "mean_token_accuracy": 0.8147617742419243, + "num_tokens": 17218552.0, + "step": 14340 + }, + { + "entropy": 1.9514837980270385, + "epoch": 0.044483744446313006, + "grad_norm": 11.541547775268555, + "learning_rate": 6.9500145306596925e-06, + "loss": 0.7025, + "mean_token_accuracy": 0.7957576259970665, + "num_tokens": 17229630.0, + "step": 14350 + }, + { + "entropy": 1.9413245290517807, + "epoch": 0.044514743571362696, + "grad_norm": 11.1705322265625, + "learning_rate": 6.954858083890342e-06, + "loss": 0.6673, + "mean_token_accuracy": 0.8048760443925858, + "num_tokens": 17240633.0, + "step": 14360 + }, + { + "entropy": 1.8450009673833847, + "epoch": 0.04454574269641239, + "grad_norm": 6.041028022766113, + "learning_rate": 6.959701637120992e-06, + "loss": 0.6093, + "mean_token_accuracy": 0.8054246112704277, + "num_tokens": 17253698.0, + "step": 14370 + }, + { + "entropy": 1.833503720164299, + "epoch": 0.04457674182146209, + "grad_norm": 4.289499759674072, + "learning_rate": 6.964545190351643e-06, + "loss": 0.5688, + "mean_token_accuracy": 0.8216146498918533, + "num_tokens": 17266350.0, + "step": 14380 + }, + { + "entropy": 1.7655683636665345, + "epoch": 0.044607740946511786, + "grad_norm": 12.362275123596191, + "learning_rate": 6.969388743582293e-06, + "loss": 0.5197, + "mean_token_accuracy": 0.8274104654788971, + "num_tokens": 17279397.0, + "step": 14390 + }, + { + "entropy": 1.8648739516735078, + "epoch": 0.04463874007156148, + "grad_norm": 11.831520080566406, + "learning_rate": 6.974232296812943e-06, + "loss": 0.5705, + "mean_token_accuracy": 0.8219448134303093, + "num_tokens": 17291500.0, + "step": 14400 + }, + { + "entropy": 1.8236894220113755, + "epoch": 0.04466973919661118, + "grad_norm": 10.356793403625488, + "learning_rate": 6.979075850043593e-06, + "loss": 0.5211, + "mean_token_accuracy": 0.8268671408295631, + "num_tokens": 17304068.0, + "step": 14410 + }, + { + "entropy": 1.8797018930315972, + "epoch": 0.04470073832166087, + "grad_norm": 9.828136444091797, + "learning_rate": 6.983919403274243e-06, + "loss": 0.5973, + "mean_token_accuracy": 0.8069930672645569, + "num_tokens": 17316443.0, + "step": 14420 + }, + { + "entropy": 1.9420302003622054, + "epoch": 0.044731737446710565, + "grad_norm": 12.722853660583496, + "learning_rate": 6.988762956504893e-06, + "loss": 0.6601, + "mean_token_accuracy": 0.8111462906002999, + "num_tokens": 17326906.0, + "step": 14430 + }, + { + "entropy": 1.8793850436806678, + "epoch": 0.04476273657176026, + "grad_norm": 13.556890487670898, + "learning_rate": 6.993606509735542e-06, + "loss": 0.5825, + "mean_token_accuracy": 0.8150814548134804, + "num_tokens": 17338259.0, + "step": 14440 + }, + { + "entropy": 1.8735877990722656, + "epoch": 0.04479373569680996, + "grad_norm": 10.880188941955566, + "learning_rate": 6.998450062966192e-06, + "loss": 0.5381, + "mean_token_accuracy": 0.8156931459903717, + "num_tokens": 17351172.0, + "step": 14450 + }, + { + "entropy": 1.8445069909095764, + "epoch": 0.044824734821859655, + "grad_norm": 11.748570442199707, + "learning_rate": 7.003293616196842e-06, + "loss": 0.5565, + "mean_token_accuracy": 0.8234645172953605, + "num_tokens": 17363083.0, + "step": 14460 + }, + { + "entropy": 1.8664287984371186, + "epoch": 0.04485573394690935, + "grad_norm": 11.027554512023926, + "learning_rate": 7.008137169427493e-06, + "loss": 0.6284, + "mean_token_accuracy": 0.8028878584504128, + "num_tokens": 17374802.0, + "step": 14470 + }, + { + "entropy": 1.8942121878266334, + "epoch": 0.04488673307195904, + "grad_norm": 12.925647735595703, + "learning_rate": 7.012980722658143e-06, + "loss": 0.6221, + "mean_token_accuracy": 0.8129858180880547, + "num_tokens": 17386778.0, + "step": 14480 + }, + { + "entropy": 1.8534621268510818, + "epoch": 0.04491773219700874, + "grad_norm": 11.046483039855957, + "learning_rate": 7.0178242758887926e-06, + "loss": 0.5628, + "mean_token_accuracy": 0.8115181505680085, + "num_tokens": 17399462.0, + "step": 14490 + }, + { + "entropy": 1.8930703341960906, + "epoch": 0.044948731322058434, + "grad_norm": 14.628288269042969, + "learning_rate": 7.0226678291194424e-06, + "loss": 0.6421, + "mean_token_accuracy": 0.8067412465810776, + "num_tokens": 17411740.0, + "step": 14500 + }, + { + "entropy": 1.9142782092094421, + "epoch": 0.04497973044710813, + "grad_norm": 13.652180671691895, + "learning_rate": 7.027511382350093e-06, + "loss": 0.6096, + "mean_token_accuracy": 0.8202868893742561, + "num_tokens": 17423288.0, + "step": 14510 + }, + { + "entropy": 1.9144923388957977, + "epoch": 0.04501072957215783, + "grad_norm": 9.822308540344238, + "learning_rate": 7.032354935580743e-06, + "loss": 0.5705, + "mean_token_accuracy": 0.8314526736736297, + "num_tokens": 17434455.0, + "step": 14520 + }, + { + "entropy": 1.926996847987175, + "epoch": 0.045041728697207524, + "grad_norm": 12.479947090148926, + "learning_rate": 7.037198488811393e-06, + "loss": 0.6008, + "mean_token_accuracy": 0.8161266520619392, + "num_tokens": 17445979.0, + "step": 14530 + }, + { + "entropy": 1.9374195352196693, + "epoch": 0.045072727822257214, + "grad_norm": 15.353309631347656, + "learning_rate": 7.042042042042042e-06, + "loss": 0.6577, + "mean_token_accuracy": 0.8026217222213745, + "num_tokens": 17457698.0, + "step": 14540 + }, + { + "entropy": 1.960123062133789, + "epoch": 0.04510372694730691, + "grad_norm": 13.066014289855957, + "learning_rate": 7.046885595272692e-06, + "loss": 0.664, + "mean_token_accuracy": 0.8060840681195259, + "num_tokens": 17468922.0, + "step": 14550 + }, + { + "entropy": 1.9222340703010559, + "epoch": 0.04513472607235661, + "grad_norm": 11.0694580078125, + "learning_rate": 7.051729148503343e-06, + "loss": 0.6043, + "mean_token_accuracy": 0.8135023102164268, + "num_tokens": 17480520.0, + "step": 14560 + }, + { + "entropy": 1.8666946336627006, + "epoch": 0.0451657251974063, + "grad_norm": 7.747961044311523, + "learning_rate": 7.0565727017339925e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.837678550183773, + "num_tokens": 17493323.0, + "step": 14570 + }, + { + "entropy": 1.8420935034751893, + "epoch": 0.045196724322456, + "grad_norm": 10.135159492492676, + "learning_rate": 7.061416254964642e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8299189880490303, + "num_tokens": 17506110.0, + "step": 14580 + }, + { + "entropy": 1.9041017875075341, + "epoch": 0.0452277234475057, + "grad_norm": 9.9268159866333, + "learning_rate": 7.066259808195293e-06, + "loss": 0.6489, + "mean_token_accuracy": 0.8125099197030068, + "num_tokens": 17517261.0, + "step": 14590 + }, + { + "entropy": 1.8802707374095917, + "epoch": 0.045258722572555386, + "grad_norm": 9.548727989196777, + "learning_rate": 7.071103361425943e-06, + "loss": 0.6023, + "mean_token_accuracy": 0.8187322363257408, + "num_tokens": 17529058.0, + "step": 14600 + }, + { + "entropy": 1.9222388476133347, + "epoch": 0.04528972169760508, + "grad_norm": 13.59824275970459, + "learning_rate": 7.075946914656593e-06, + "loss": 0.6298, + "mean_token_accuracy": 0.8107534229755402, + "num_tokens": 17540313.0, + "step": 14610 + }, + { + "entropy": 1.8212895065546035, + "epoch": 0.04532072082265478, + "grad_norm": 5.862724304199219, + "learning_rate": 7.080790467887243e-06, + "loss": 0.5766, + "mean_token_accuracy": 0.8119758501648903, + "num_tokens": 17553154.0, + "step": 14620 + }, + { + "entropy": 1.919022636115551, + "epoch": 0.045351719947704476, + "grad_norm": 11.380285263061523, + "learning_rate": 7.0856340211178935e-06, + "loss": 0.6269, + "mean_token_accuracy": 0.804240868985653, + "num_tokens": 17564581.0, + "step": 14630 + }, + { + "entropy": 1.8831742450594902, + "epoch": 0.04538271907275417, + "grad_norm": 12.382061004638672, + "learning_rate": 7.090477574348543e-06, + "loss": 0.5588, + "mean_token_accuracy": 0.8190920054912567, + "num_tokens": 17576403.0, + "step": 14640 + }, + { + "entropy": 1.8691495105624198, + "epoch": 0.04541371819780387, + "grad_norm": 5.398478984832764, + "learning_rate": 7.095321127579192e-06, + "loss": 0.5668, + "mean_token_accuracy": 0.8149119704961777, + "num_tokens": 17587726.0, + "step": 14650 + }, + { + "entropy": 1.7776469945907594, + "epoch": 0.045444717322853566, + "grad_norm": 11.435405731201172, + "learning_rate": 7.100164680809842e-06, + "loss": 0.5784, + "mean_token_accuracy": 0.8274336785078049, + "num_tokens": 17600178.0, + "step": 14660 + }, + { + "entropy": 1.8829714879393578, + "epoch": 0.045475716447903256, + "grad_norm": 6.995675563812256, + "learning_rate": 7.105008234040492e-06, + "loss": 0.6475, + "mean_token_accuracy": 0.8128258779644966, + "num_tokens": 17611382.0, + "step": 14670 + }, + { + "entropy": 1.906617347896099, + "epoch": 0.04550671557295295, + "grad_norm": 6.328146457672119, + "learning_rate": 7.109851787271143e-06, + "loss": 0.6191, + "mean_token_accuracy": 0.8149865731596947, + "num_tokens": 17623005.0, + "step": 14680 + }, + { + "entropy": 1.8847622662782668, + "epoch": 0.04553771469800265, + "grad_norm": 12.374866485595703, + "learning_rate": 7.114695340501793e-06, + "loss": 0.5861, + "mean_token_accuracy": 0.8213684529066085, + "num_tokens": 17634543.0, + "step": 14690 + }, + { + "entropy": 1.8927973687648774, + "epoch": 0.045568713823052345, + "grad_norm": 11.521233558654785, + "learning_rate": 7.119538893732443e-06, + "loss": 0.5843, + "mean_token_accuracy": 0.8219080328941345, + "num_tokens": 17645810.0, + "step": 14700 + }, + { + "entropy": 1.9088530361652374, + "epoch": 0.04559971294810204, + "grad_norm": 11.580865859985352, + "learning_rate": 7.1243824469630925e-06, + "loss": 0.6404, + "mean_token_accuracy": 0.8121732175350189, + "num_tokens": 17657230.0, + "step": 14710 + }, + { + "entropy": 1.906784760951996, + "epoch": 0.04563071207315174, + "grad_norm": 10.853860855102539, + "learning_rate": 7.129226000193743e-06, + "loss": 0.6028, + "mean_token_accuracy": 0.8171251997351646, + "num_tokens": 17669157.0, + "step": 14720 + }, + { + "entropy": 1.8272288024425507, + "epoch": 0.04566171119820143, + "grad_norm": 11.097689628601074, + "learning_rate": 7.134069553424393e-06, + "loss": 0.5436, + "mean_token_accuracy": 0.8214239537715912, + "num_tokens": 17682522.0, + "step": 14730 + }, + { + "entropy": 1.9602118730545044, + "epoch": 0.045692710323251125, + "grad_norm": 11.482585906982422, + "learning_rate": 7.138913106655043e-06, + "loss": 0.6694, + "mean_token_accuracy": 0.8125876560807228, + "num_tokens": 17693830.0, + "step": 14740 + }, + { + "entropy": 1.8845744907855988, + "epoch": 0.04572370944830082, + "grad_norm": 11.898690223693848, + "learning_rate": 7.143756659885692e-06, + "loss": 0.621, + "mean_token_accuracy": 0.8045976728200912, + "num_tokens": 17706045.0, + "step": 14750 + }, + { + "entropy": 1.8579593807458878, + "epoch": 0.04575470857335052, + "grad_norm": 5.022470474243164, + "learning_rate": 7.148600213116342e-06, + "loss": 0.6451, + "mean_token_accuracy": 0.804808932542801, + "num_tokens": 17718119.0, + "step": 14760 + }, + { + "entropy": 1.9066347777843475, + "epoch": 0.045785707698400215, + "grad_norm": 10.700624465942383, + "learning_rate": 7.153443766346993e-06, + "loss": 0.61, + "mean_token_accuracy": 0.8180598929524422, + "num_tokens": 17729029.0, + "step": 14770 + }, + { + "entropy": 1.8993641972541808, + "epoch": 0.04581670682344991, + "grad_norm": 10.812292098999023, + "learning_rate": 7.1582873195776426e-06, + "loss": 0.6628, + "mean_token_accuracy": 0.8115692853927612, + "num_tokens": 17740072.0, + "step": 14780 + }, + { + "entropy": 1.8603774085640907, + "epoch": 0.0458477059484996, + "grad_norm": 13.853246688842773, + "learning_rate": 7.1631308728082925e-06, + "loss": 0.609, + "mean_token_accuracy": 0.8194773256778717, + "num_tokens": 17751523.0, + "step": 14790 + }, + { + "entropy": 1.7647764384746552, + "epoch": 0.0458787050735493, + "grad_norm": 13.947328567504883, + "learning_rate": 7.167974426038943e-06, + "loss": 0.5377, + "mean_token_accuracy": 0.8233131125569344, + "num_tokens": 17763817.0, + "step": 14800 + }, + { + "entropy": 1.750529208779335, + "epoch": 0.045909704198598994, + "grad_norm": 5.576157569885254, + "learning_rate": 7.172817979269593e-06, + "loss": 0.5442, + "mean_token_accuracy": 0.8132794231176377, + "num_tokens": 17777124.0, + "step": 14810 + }, + { + "entropy": 1.851732324063778, + "epoch": 0.04594070332364869, + "grad_norm": 11.042110443115234, + "learning_rate": 7.177661532500243e-06, + "loss": 0.6467, + "mean_token_accuracy": 0.8058951094746589, + "num_tokens": 17788612.0, + "step": 14820 + }, + { + "entropy": 1.7839807882905006, + "epoch": 0.04597170244869839, + "grad_norm": 10.22685718536377, + "learning_rate": 7.182505085730893e-06, + "loss": 0.5832, + "mean_token_accuracy": 0.8168496385216713, + "num_tokens": 17801156.0, + "step": 14830 + }, + { + "entropy": 1.8429742008447647, + "epoch": 0.046002701573748084, + "grad_norm": 5.943838596343994, + "learning_rate": 7.1873486389615436e-06, + "loss": 0.5539, + "mean_token_accuracy": 0.8247615218162536, + "num_tokens": 17813245.0, + "step": 14840 + }, + { + "entropy": 1.8560289978981017, + "epoch": 0.04603370069879777, + "grad_norm": 11.962748527526855, + "learning_rate": 7.1921921921921935e-06, + "loss": 0.651, + "mean_token_accuracy": 0.8067975297570229, + "num_tokens": 17824749.0, + "step": 14850 + }, + { + "entropy": 1.7992481097579003, + "epoch": 0.04606469982384747, + "grad_norm": 13.64804744720459, + "learning_rate": 7.1970357454228425e-06, + "loss": 0.5454, + "mean_token_accuracy": 0.8238918125629425, + "num_tokens": 17838276.0, + "step": 14860 + }, + { + "entropy": 1.925189484655857, + "epoch": 0.04609569894889717, + "grad_norm": 10.546503067016602, + "learning_rate": 7.201879298653492e-06, + "loss": 0.6402, + "mean_token_accuracy": 0.8055410549044609, + "num_tokens": 17849457.0, + "step": 14870 + }, + { + "entropy": 1.939789319038391, + "epoch": 0.04612669807394686, + "grad_norm": 13.753273963928223, + "learning_rate": 7.206722851884142e-06, + "loss": 0.6455, + "mean_token_accuracy": 0.807138554751873, + "num_tokens": 17860102.0, + "step": 14880 + }, + { + "entropy": 1.8846666172146798, + "epoch": 0.04615769719899656, + "grad_norm": 11.280816078186035, + "learning_rate": 7.211566405114793e-06, + "loss": 0.6067, + "mean_token_accuracy": 0.8096478402614593, + "num_tokens": 17871606.0, + "step": 14890 + }, + { + "entropy": 1.918617771565914, + "epoch": 0.046188696324046256, + "grad_norm": 10.672183990478516, + "learning_rate": 7.216409958345443e-06, + "loss": 0.6779, + "mean_token_accuracy": 0.8008549973368645, + "num_tokens": 17882968.0, + "step": 14900 + }, + { + "entropy": 1.9250801861286164, + "epoch": 0.046219695449095946, + "grad_norm": 10.703913688659668, + "learning_rate": 7.221253511576093e-06, + "loss": 0.6541, + "mean_token_accuracy": 0.8061710268259048, + "num_tokens": 17894617.0, + "step": 14910 + }, + { + "entropy": 1.8236558228731155, + "epoch": 0.04625069457414564, + "grad_norm": 11.690652847290039, + "learning_rate": 7.226097064806743e-06, + "loss": 0.5484, + "mean_token_accuracy": 0.8214540228247642, + "num_tokens": 17907854.0, + "step": 14920 + }, + { + "entropy": 1.8554262310266494, + "epoch": 0.04628169369919534, + "grad_norm": 10.404338836669922, + "learning_rate": 7.230940618037393e-06, + "loss": 0.5821, + "mean_token_accuracy": 0.8083539769053459, + "num_tokens": 17920382.0, + "step": 14930 + }, + { + "entropy": 1.855211953818798, + "epoch": 0.046312692824245036, + "grad_norm": 10.875576972961426, + "learning_rate": 7.235784171268043e-06, + "loss": 0.5979, + "mean_token_accuracy": 0.8178864941000938, + "num_tokens": 17932205.0, + "step": 14940 + }, + { + "entropy": 1.8454056218266488, + "epoch": 0.04634369194929473, + "grad_norm": 11.040106773376465, + "learning_rate": 7.240627724498693e-06, + "loss": 0.58, + "mean_token_accuracy": 0.8090640112757683, + "num_tokens": 17943421.0, + "step": 14950 + }, + { + "entropy": 1.901095400750637, + "epoch": 0.04637469107434443, + "grad_norm": 10.230569839477539, + "learning_rate": 7.245471277729342e-06, + "loss": 0.5863, + "mean_token_accuracy": 0.8283701628446579, + "num_tokens": 17954715.0, + "step": 14960 + }, + { + "entropy": 1.8909978330135346, + "epoch": 0.04640569019939412, + "grad_norm": 12.003896713256836, + "learning_rate": 7.250314830959992e-06, + "loss": 0.6415, + "mean_token_accuracy": 0.809289188683033, + "num_tokens": 17966078.0, + "step": 14970 + }, + { + "entropy": 1.893742746114731, + "epoch": 0.046436689324443815, + "grad_norm": 12.008511543273926, + "learning_rate": 7.255158384190643e-06, + "loss": 0.5831, + "mean_token_accuracy": 0.8165367186069489, + "num_tokens": 17977838.0, + "step": 14980 + }, + { + "entropy": 1.9413186386227608, + "epoch": 0.04646768844949351, + "grad_norm": 11.98163890838623, + "learning_rate": 7.260001937421293e-06, + "loss": 0.632, + "mean_token_accuracy": 0.8125703752040863, + "num_tokens": 17989242.0, + "step": 14990 + }, + { + "entropy": 1.8333601012825966, + "epoch": 0.04649868757454321, + "grad_norm": 5.240977764129639, + "learning_rate": 7.2648454906519426e-06, + "loss": 0.537, + "mean_token_accuracy": 0.8350656241178512, + "num_tokens": 18001491.0, + "step": 15000 + }, + { + "entropy": 1.8348119348287582, + "epoch": 0.046529686699592905, + "grad_norm": 5.844879150390625, + "learning_rate": 7.269689043882593e-06, + "loss": 0.602, + "mean_token_accuracy": 0.8153130680322647, + "num_tokens": 18014233.0, + "step": 15010 + }, + { + "entropy": 1.8076252147555352, + "epoch": 0.0465606858246426, + "grad_norm": 5.277684688568115, + "learning_rate": 7.274532597113243e-06, + "loss": 0.5404, + "mean_token_accuracy": 0.824401643872261, + "num_tokens": 18027418.0, + "step": 15020 + }, + { + "entropy": 1.9218341365456582, + "epoch": 0.0465916849496923, + "grad_norm": 10.615427017211914, + "learning_rate": 7.279376150343893e-06, + "loss": 0.608, + "mean_token_accuracy": 0.8177351862192154, + "num_tokens": 18038854.0, + "step": 15030 + }, + { + "entropy": 1.9281207531690598, + "epoch": 0.04662268407474199, + "grad_norm": 9.128558158874512, + "learning_rate": 7.284219703574543e-06, + "loss": 0.598, + "mean_token_accuracy": 0.8243363618850708, + "num_tokens": 18049861.0, + "step": 15040 + }, + { + "entropy": 1.8921822875738143, + "epoch": 0.046653683199791685, + "grad_norm": 13.774589538574219, + "learning_rate": 7.289063256805194e-06, + "loss": 0.6362, + "mean_token_accuracy": 0.8057242497801781, + "num_tokens": 18061596.0, + "step": 15050 + }, + { + "entropy": 1.832139255106449, + "epoch": 0.04668468232484138, + "grad_norm": 10.49471378326416, + "learning_rate": 7.2939068100358436e-06, + "loss": 0.5717, + "mean_token_accuracy": 0.816807533800602, + "num_tokens": 18073371.0, + "step": 15060 + }, + { + "entropy": 1.8594244837760925, + "epoch": 0.04671568144989108, + "grad_norm": 11.264952659606934, + "learning_rate": 7.298750363266493e-06, + "loss": 0.6349, + "mean_token_accuracy": 0.8005358681082726, + "num_tokens": 18086010.0, + "step": 15070 + }, + { + "entropy": 1.8758624821901322, + "epoch": 0.046746680574940774, + "grad_norm": 13.645959854125977, + "learning_rate": 7.3035939164971425e-06, + "loss": 0.5787, + "mean_token_accuracy": 0.8165410995483399, + "num_tokens": 18097503.0, + "step": 15080 + }, + { + "entropy": 1.8632541045546531, + "epoch": 0.04677767969999047, + "grad_norm": 12.606849670410156, + "learning_rate": 7.308437469727792e-06, + "loss": 0.5916, + "mean_token_accuracy": 0.8230202242732048, + "num_tokens": 18109025.0, + "step": 15090 + }, + { + "entropy": 1.8468170419335366, + "epoch": 0.04680867882504016, + "grad_norm": 5.763012409210205, + "learning_rate": 7.313281022958443e-06, + "loss": 0.5543, + "mean_token_accuracy": 0.82147196829319, + "num_tokens": 18120920.0, + "step": 15100 + }, + { + "entropy": 1.8019221499562263, + "epoch": 0.04683967795008986, + "grad_norm": 12.194572448730469, + "learning_rate": 7.318124576189093e-06, + "loss": 0.534, + "mean_token_accuracy": 0.8267902106046676, + "num_tokens": 18134004.0, + "step": 15110 + }, + { + "entropy": 1.9301525950431824, + "epoch": 0.046870677075139554, + "grad_norm": 11.044305801391602, + "learning_rate": 7.322968129419743e-06, + "loss": 0.7477, + "mean_token_accuracy": 0.798159584403038, + "num_tokens": 18145789.0, + "step": 15120 + }, + { + "entropy": 1.9049590498209, + "epoch": 0.04690167620018925, + "grad_norm": 12.671975135803223, + "learning_rate": 7.327811682650393e-06, + "loss": 0.6079, + "mean_token_accuracy": 0.8189090758562088, + "num_tokens": 18157088.0, + "step": 15130 + }, + { + "entropy": 1.8682396605610847, + "epoch": 0.04693267532523895, + "grad_norm": 9.93550968170166, + "learning_rate": 7.3326552358810435e-06, + "loss": 0.5791, + "mean_token_accuracy": 0.8179777503013611, + "num_tokens": 18169072.0, + "step": 15140 + }, + { + "entropy": 1.8468847021460533, + "epoch": 0.046963674450288644, + "grad_norm": 9.431316375732422, + "learning_rate": 7.337498789111693e-06, + "loss": 0.5507, + "mean_token_accuracy": 0.8228641420602798, + "num_tokens": 18181019.0, + "step": 15150 + }, + { + "entropy": 1.775972270965576, + "epoch": 0.04699467357533833, + "grad_norm": 10.788688659667969, + "learning_rate": 7.342342342342343e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8265146508812904, + "num_tokens": 18194155.0, + "step": 15160 + }, + { + "entropy": 1.8457147806882859, + "epoch": 0.04702567270038803, + "grad_norm": 9.854268074035645, + "learning_rate": 7.347185895572992e-06, + "loss": 0.5475, + "mean_token_accuracy": 0.8127473801374435, + "num_tokens": 18206750.0, + "step": 15170 + }, + { + "entropy": 1.7577089250087738, + "epoch": 0.047056671825437726, + "grad_norm": 5.139494895935059, + "learning_rate": 7.352029448803642e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8430294960737228, + "num_tokens": 18219202.0, + "step": 15180 + }, + { + "entropy": 1.8935526996850967, + "epoch": 0.04708767095048742, + "grad_norm": 5.4716057777404785, + "learning_rate": 7.356873002034293e-06, + "loss": 0.5882, + "mean_token_accuracy": 0.8127918854355812, + "num_tokens": 18230861.0, + "step": 15190 + }, + { + "entropy": 1.8892264723777772, + "epoch": 0.04711867007553712, + "grad_norm": 12.728769302368164, + "learning_rate": 7.361716555264943e-06, + "loss": 0.5607, + "mean_token_accuracy": 0.8149558499455452, + "num_tokens": 18242915.0, + "step": 15200 + }, + { + "entropy": 1.8911462113261224, + "epoch": 0.047149669200586816, + "grad_norm": 9.735945701599121, + "learning_rate": 7.366560108495593e-06, + "loss": 0.6442, + "mean_token_accuracy": 0.8063351511955261, + "num_tokens": 18253977.0, + "step": 15210 + }, + { + "entropy": 1.9077469795942306, + "epoch": 0.047180668325636506, + "grad_norm": 13.403579711914062, + "learning_rate": 7.371403661726243e-06, + "loss": 0.6695, + "mean_token_accuracy": 0.8085703745484352, + "num_tokens": 18264673.0, + "step": 15220 + }, + { + "entropy": 1.86230625808239, + "epoch": 0.0472116674506862, + "grad_norm": 5.826930999755859, + "learning_rate": 7.376247214956893e-06, + "loss": 0.5957, + "mean_token_accuracy": 0.8189520880579948, + "num_tokens": 18276799.0, + "step": 15230 + }, + { + "entropy": 1.9372797414660454, + "epoch": 0.0472426665757359, + "grad_norm": 12.060833930969238, + "learning_rate": 7.381090768187543e-06, + "loss": 0.5974, + "mean_token_accuracy": 0.8150049105286599, + "num_tokens": 18288796.0, + "step": 15240 + }, + { + "entropy": 1.788297127187252, + "epoch": 0.047273665700785596, + "grad_norm": 5.706685543060303, + "learning_rate": 7.385934321418193e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.8224897965788841, + "num_tokens": 18301726.0, + "step": 15250 + }, + { + "entropy": 1.9241210684180259, + "epoch": 0.04730466482583529, + "grad_norm": 11.742621421813965, + "learning_rate": 7.390777874648844e-06, + "loss": 0.6246, + "mean_token_accuracy": 0.810030820965767, + "num_tokens": 18312680.0, + "step": 15260 + }, + { + "entropy": 1.9467894092202187, + "epoch": 0.04733566395088499, + "grad_norm": 9.387283325195312, + "learning_rate": 7.395621427879494e-06, + "loss": 0.6334, + "mean_token_accuracy": 0.8097245275974274, + "num_tokens": 18324139.0, + "step": 15270 + }, + { + "entropy": 1.9100206598639489, + "epoch": 0.04736666307593468, + "grad_norm": 10.441351890563965, + "learning_rate": 7.400464981110143e-06, + "loss": 0.6332, + "mean_token_accuracy": 0.7996808186173439, + "num_tokens": 18336228.0, + "step": 15280 + }, + { + "entropy": 1.96844242811203, + "epoch": 0.047397662200984375, + "grad_norm": 11.180095672607422, + "learning_rate": 7.405308534340793e-06, + "loss": 0.675, + "mean_token_accuracy": 0.8107746347784996, + "num_tokens": 18346950.0, + "step": 15290 + }, + { + "entropy": 1.9150108516216278, + "epoch": 0.04742866132603407, + "grad_norm": 9.653234481811523, + "learning_rate": 7.4101520875714425e-06, + "loss": 0.6054, + "mean_token_accuracy": 0.8183826908469201, + "num_tokens": 18358537.0, + "step": 15300 + }, + { + "entropy": 1.8632504418492317, + "epoch": 0.04745966045108377, + "grad_norm": 13.043754577636719, + "learning_rate": 7.414995640802093e-06, + "loss": 0.5891, + "mean_token_accuracy": 0.8238464057445526, + "num_tokens": 18371462.0, + "step": 15310 + }, + { + "entropy": 1.8496581763029099, + "epoch": 0.047490659576133465, + "grad_norm": 11.938183784484863, + "learning_rate": 7.419839194032743e-06, + "loss": 0.5633, + "mean_token_accuracy": 0.827662679553032, + "num_tokens": 18384026.0, + "step": 15320 + }, + { + "entropy": 1.8263291805982589, + "epoch": 0.04752165870118316, + "grad_norm": 10.78771686553955, + "learning_rate": 7.424682747263393e-06, + "loss": 0.5709, + "mean_token_accuracy": 0.815859617292881, + "num_tokens": 18396414.0, + "step": 15330 + }, + { + "entropy": 1.8266072757542133, + "epoch": 0.04755265782623285, + "grad_norm": 11.682705879211426, + "learning_rate": 7.429526300494043e-06, + "loss": 0.5583, + "mean_token_accuracy": 0.8147292017936707, + "num_tokens": 18409403.0, + "step": 15340 + }, + { + "entropy": 1.8509322211146355, + "epoch": 0.04758365695128255, + "grad_norm": 4.418774127960205, + "learning_rate": 7.434369853724694e-06, + "loss": 0.5744, + "mean_token_accuracy": 0.822237353026867, + "num_tokens": 18421350.0, + "step": 15350 + }, + { + "entropy": 1.8988280490040779, + "epoch": 0.047614656076332244, + "grad_norm": 11.46986198425293, + "learning_rate": 7.4392134069553435e-06, + "loss": 0.5755, + "mean_token_accuracy": 0.8086514964699745, + "num_tokens": 18433565.0, + "step": 15360 + }, + { + "entropy": 1.883128097653389, + "epoch": 0.04764565520138194, + "grad_norm": 10.333133697509766, + "learning_rate": 7.444056960185993e-06, + "loss": 0.6242, + "mean_token_accuracy": 0.8101354941725731, + "num_tokens": 18445233.0, + "step": 15370 + }, + { + "entropy": 1.7941233783960342, + "epoch": 0.04767665432643164, + "grad_norm": 4.953238010406494, + "learning_rate": 7.448900513416642e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8319120317697525, + "num_tokens": 18457975.0, + "step": 15380 + }, + { + "entropy": 1.9092483133077622, + "epoch": 0.047707653451481334, + "grad_norm": 11.126029014587402, + "learning_rate": 7.453744066647292e-06, + "loss": 0.6611, + "mean_token_accuracy": 0.8040424823760987, + "num_tokens": 18469319.0, + "step": 15390 + }, + { + "entropy": 1.8364340022206307, + "epoch": 0.04773865257653103, + "grad_norm": 5.549501419067383, + "learning_rate": 7.458587619877943e-06, + "loss": 0.6241, + "mean_token_accuracy": 0.8029653668403626, + "num_tokens": 18482789.0, + "step": 15400 + }, + { + "entropy": 1.8426788106560708, + "epoch": 0.04776965170158072, + "grad_norm": 11.887903213500977, + "learning_rate": 7.463431173108593e-06, + "loss": 0.5929, + "mean_token_accuracy": 0.8010544419288635, + "num_tokens": 18495234.0, + "step": 15410 + }, + { + "entropy": 1.898276437819004, + "epoch": 0.04780065082663042, + "grad_norm": 11.306375503540039, + "learning_rate": 7.468274726339243e-06, + "loss": 0.5772, + "mean_token_accuracy": 0.8193996638059616, + "num_tokens": 18507024.0, + "step": 15420 + }, + { + "entropy": 1.9499488294124603, + "epoch": 0.047831649951680114, + "grad_norm": 13.077889442443848, + "learning_rate": 7.4731182795698935e-06, + "loss": 0.7473, + "mean_token_accuracy": 0.7906885385513306, + "num_tokens": 18518699.0, + "step": 15430 + }, + { + "entropy": 1.8603081166744233, + "epoch": 0.04786264907672981, + "grad_norm": 11.552556037902832, + "learning_rate": 7.477961832800543e-06, + "loss": 0.5398, + "mean_token_accuracy": 0.8289436608552933, + "num_tokens": 18530154.0, + "step": 15440 + }, + { + "entropy": 1.8968264564871788, + "epoch": 0.04789364820177951, + "grad_norm": 12.209922790527344, + "learning_rate": 7.482805386031193e-06, + "loss": 0.6208, + "mean_token_accuracy": 0.8118745431303978, + "num_tokens": 18541201.0, + "step": 15450 + }, + { + "entropy": 1.7692169472575188, + "epoch": 0.0479246473268292, + "grad_norm": 11.662141799926758, + "learning_rate": 7.487648939261843e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.8198059305548668, + "num_tokens": 18554224.0, + "step": 15460 + }, + { + "entropy": 1.927010752260685, + "epoch": 0.04795564645187889, + "grad_norm": 10.837479591369629, + "learning_rate": 7.492492492492494e-06, + "loss": 0.5931, + "mean_token_accuracy": 0.8046490982174873, + "num_tokens": 18566184.0, + "step": 15470 + }, + { + "entropy": 1.854862241446972, + "epoch": 0.04798664557692859, + "grad_norm": 9.293188095092773, + "learning_rate": 7.497336045723144e-06, + "loss": 0.5992, + "mean_token_accuracy": 0.8218624517321587, + "num_tokens": 18578244.0, + "step": 15480 + }, + { + "entropy": 1.8932930827140808, + "epoch": 0.048017644701978286, + "grad_norm": 10.518685340881348, + "learning_rate": 7.502179598953793e-06, + "loss": 0.6514, + "mean_token_accuracy": 0.8079543009400367, + "num_tokens": 18589205.0, + "step": 15490 + }, + { + "entropy": 1.8627614587545396, + "epoch": 0.04804864382702798, + "grad_norm": 14.004125595092773, + "learning_rate": 7.507023152184443e-06, + "loss": 0.6486, + "mean_token_accuracy": 0.8162886321544647, + "num_tokens": 18600091.0, + "step": 15500 + }, + { + "entropy": 1.8322096571326256, + "epoch": 0.04807964295207768, + "grad_norm": 10.797554016113281, + "learning_rate": 7.5118667054150926e-06, + "loss": 0.5938, + "mean_token_accuracy": 0.8135915264487267, + "num_tokens": 18611780.0, + "step": 15510 + }, + { + "entropy": 1.7482316687703132, + "epoch": 0.048110642077127376, + "grad_norm": 11.08513069152832, + "learning_rate": 7.516710258645743e-06, + "loss": 0.581, + "mean_token_accuracy": 0.8262606129050255, + "num_tokens": 18624882.0, + "step": 15520 + }, + { + "entropy": 1.8912085920572281, + "epoch": 0.048141641202177066, + "grad_norm": 12.140768051147461, + "learning_rate": 7.521553811876393e-06, + "loss": 0.6816, + "mean_token_accuracy": 0.8002603054046631, + "num_tokens": 18636954.0, + "step": 15530 + }, + { + "entropy": 1.8268685653805732, + "epoch": 0.04817264032722676, + "grad_norm": 12.404664039611816, + "learning_rate": 7.526397365107043e-06, + "loss": 0.5911, + "mean_token_accuracy": 0.8149344757199287, + "num_tokens": 18649266.0, + "step": 15540 + }, + { + "entropy": 1.8767468333244324, + "epoch": 0.04820363945227646, + "grad_norm": 10.459026336669922, + "learning_rate": 7.531240918337693e-06, + "loss": 0.6271, + "mean_token_accuracy": 0.8109247386455536, + "num_tokens": 18660751.0, + "step": 15550 + }, + { + "entropy": 1.831296342611313, + "epoch": 0.048234638577326155, + "grad_norm": 10.499650955200195, + "learning_rate": 7.536084471568344e-06, + "loss": 0.5749, + "mean_token_accuracy": 0.8176342695951462, + "num_tokens": 18673033.0, + "step": 15560 + }, + { + "entropy": 1.8214371725916862, + "epoch": 0.04826563770237585, + "grad_norm": 14.110614776611328, + "learning_rate": 7.5409280247989936e-06, + "loss": 0.5775, + "mean_token_accuracy": 0.8230008244514465, + "num_tokens": 18685183.0, + "step": 15570 + }, + { + "entropy": 1.8123807251453399, + "epoch": 0.04829663682742555, + "grad_norm": 10.320479393005371, + "learning_rate": 7.5457715780296435e-06, + "loss": 0.5839, + "mean_token_accuracy": 0.8221219345927239, + "num_tokens": 18696924.0, + "step": 15580 + }, + { + "entropy": 1.7216524064540863, + "epoch": 0.04832763595247524, + "grad_norm": 9.836614608764648, + "learning_rate": 7.5506151312602925e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8300070941448212, + "num_tokens": 18710414.0, + "step": 15590 + }, + { + "entropy": 1.889795495569706, + "epoch": 0.048358635077524935, + "grad_norm": 11.627612113952637, + "learning_rate": 7.555458684490942e-06, + "loss": 0.6584, + "mean_token_accuracy": 0.8059937000274658, + "num_tokens": 18721801.0, + "step": 15600 + }, + { + "entropy": 1.8819514483213424, + "epoch": 0.04838963420257463, + "grad_norm": 6.095297813415527, + "learning_rate": 7.560302237721593e-06, + "loss": 0.6108, + "mean_token_accuracy": 0.807888326048851, + "num_tokens": 18733560.0, + "step": 15610 + }, + { + "entropy": 1.8655756518244744, + "epoch": 0.04842063332762433, + "grad_norm": 10.622197151184082, + "learning_rate": 7.565145790952243e-06, + "loss": 0.6113, + "mean_token_accuracy": 0.8073986008763313, + "num_tokens": 18746189.0, + "step": 15620 + }, + { + "entropy": 1.8603959396481513, + "epoch": 0.048451632452674025, + "grad_norm": 11.854666709899902, + "learning_rate": 7.569989344182893e-06, + "loss": 0.5933, + "mean_token_accuracy": 0.817948243021965, + "num_tokens": 18757924.0, + "step": 15630 + }, + { + "entropy": 1.9059595987200737, + "epoch": 0.04848263157772372, + "grad_norm": 9.098125457763672, + "learning_rate": 7.574832897413544e-06, + "loss": 0.6782, + "mean_token_accuracy": 0.8045271784067154, + "num_tokens": 18769446.0, + "step": 15640 + }, + { + "entropy": 1.7989111676812173, + "epoch": 0.04851363070277341, + "grad_norm": 11.551828384399414, + "learning_rate": 7.5796764506441935e-06, + "loss": 0.5537, + "mean_token_accuracy": 0.8248863905668259, + "num_tokens": 18782700.0, + "step": 15650 + }, + { + "entropy": 1.8210396483540534, + "epoch": 0.04854462982782311, + "grad_norm": 4.9011759757995605, + "learning_rate": 7.584520003874843e-06, + "loss": 0.5557, + "mean_token_accuracy": 0.8225024446845055, + "num_tokens": 18795390.0, + "step": 15660 + }, + { + "entropy": 1.811398984491825, + "epoch": 0.048575628952872804, + "grad_norm": 11.048978805541992, + "learning_rate": 7.589363557105493e-06, + "loss": 0.5542, + "mean_token_accuracy": 0.8189741969108582, + "num_tokens": 18807396.0, + "step": 15670 + }, + { + "entropy": 1.892940777540207, + "epoch": 0.0486066280779225, + "grad_norm": 12.325209617614746, + "learning_rate": 7.594207110336144e-06, + "loss": 0.6986, + "mean_token_accuracy": 0.7981882244348526, + "num_tokens": 18819378.0, + "step": 15680 + }, + { + "entropy": 1.886229231953621, + "epoch": 0.0486376272029722, + "grad_norm": 10.576592445373535, + "learning_rate": 7.599050663566794e-06, + "loss": 0.6074, + "mean_token_accuracy": 0.8123501881957054, + "num_tokens": 18830551.0, + "step": 15690 + }, + { + "entropy": 1.8603569209575652, + "epoch": 0.048668626328021894, + "grad_norm": 11.795734405517578, + "learning_rate": 7.603894216797443e-06, + "loss": 0.5157, + "mean_token_accuracy": 0.8348536252975464, + "num_tokens": 18842716.0, + "step": 15700 + }, + { + "entropy": 1.9385449796915055, + "epoch": 0.04869962545307159, + "grad_norm": 10.914395332336426, + "learning_rate": 7.608737770028093e-06, + "loss": 0.6356, + "mean_token_accuracy": 0.8061470225453377, + "num_tokens": 18853817.0, + "step": 15710 + }, + { + "entropy": 1.8827759057283402, + "epoch": 0.04873062457812128, + "grad_norm": 14.19170093536377, + "learning_rate": 7.613581323258743e-06, + "loss": 0.6335, + "mean_token_accuracy": 0.8078219577670097, + "num_tokens": 18865592.0, + "step": 15720 + }, + { + "entropy": 1.8566590577363968, + "epoch": 0.04876162370317098, + "grad_norm": 13.042546272277832, + "learning_rate": 7.618424876489393e-06, + "loss": 0.6099, + "mean_token_accuracy": 0.8074685573577881, + "num_tokens": 18878717.0, + "step": 15730 + }, + { + "entropy": 1.8778809905052185, + "epoch": 0.04879262282822067, + "grad_norm": 9.526302337646484, + "learning_rate": 7.623268429720043e-06, + "loss": 0.5393, + "mean_token_accuracy": 0.8302160531282425, + "num_tokens": 18890522.0, + "step": 15740 + }, + { + "entropy": 1.843679629266262, + "epoch": 0.04882362195327037, + "grad_norm": 11.040970802307129, + "learning_rate": 7.628111982950693e-06, + "loss": 0.6254, + "mean_token_accuracy": 0.812717217206955, + "num_tokens": 18902667.0, + "step": 15750 + }, + { + "entropy": 1.9100946336984634, + "epoch": 0.04885462107832007, + "grad_norm": 7.509261608123779, + "learning_rate": 7.632955536181344e-06, + "loss": 0.6673, + "mean_token_accuracy": 0.8177873358130455, + "num_tokens": 18913868.0, + "step": 15760 + }, + { + "entropy": 1.8826175585389138, + "epoch": 0.04888562020336976, + "grad_norm": 11.155854225158691, + "learning_rate": 7.637799089411994e-06, + "loss": 0.6481, + "mean_token_accuracy": 0.8087216004729271, + "num_tokens": 18925988.0, + "step": 15770 + }, + { + "entropy": 1.7796615183353424, + "epoch": 0.04891661932841945, + "grad_norm": 5.291562080383301, + "learning_rate": 7.642642642642644e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.8381139814853669, + "num_tokens": 18938466.0, + "step": 15780 + }, + { + "entropy": 1.9144254103302956, + "epoch": 0.04894761845346915, + "grad_norm": 10.03088092803955, + "learning_rate": 7.647486195873294e-06, + "loss": 0.6672, + "mean_token_accuracy": 0.8007161229848861, + "num_tokens": 18949499.0, + "step": 15790 + }, + { + "entropy": 1.8293558046221734, + "epoch": 0.048978617578518846, + "grad_norm": 10.299362182617188, + "learning_rate": 7.652329749103943e-06, + "loss": 0.6104, + "mean_token_accuracy": 0.8068207755684853, + "num_tokens": 18961354.0, + "step": 15800 + }, + { + "entropy": 1.843856942653656, + "epoch": 0.04900961670356854, + "grad_norm": 5.878364562988281, + "learning_rate": 7.657173302334593e-06, + "loss": 0.5521, + "mean_token_accuracy": 0.8132404252886772, + "num_tokens": 18973459.0, + "step": 15810 + }, + { + "entropy": 1.8029664367437364, + "epoch": 0.04904061582861824, + "grad_norm": 13.001435279846191, + "learning_rate": 7.662016855565243e-06, + "loss": 0.5321, + "mean_token_accuracy": 0.8184084549546242, + "num_tokens": 18986940.0, + "step": 15820 + }, + { + "entropy": 1.836187256872654, + "epoch": 0.049071614953667936, + "grad_norm": 14.026951789855957, + "learning_rate": 7.666860408795893e-06, + "loss": 0.5792, + "mean_token_accuracy": 0.8184718936681747, + "num_tokens": 19000053.0, + "step": 15830 + }, + { + "entropy": 2.0127352684736253, + "epoch": 0.049102614078717625, + "grad_norm": 10.804215431213379, + "learning_rate": 7.671703962026543e-06, + "loss": 0.6748, + "mean_token_accuracy": 0.8046320468187332, + "num_tokens": 19010864.0, + "step": 15840 + }, + { + "entropy": 1.8914284870028495, + "epoch": 0.04913361320376732, + "grad_norm": 12.271016120910645, + "learning_rate": 7.676547515257193e-06, + "loss": 0.6046, + "mean_token_accuracy": 0.8181375965476037, + "num_tokens": 19022131.0, + "step": 15850 + }, + { + "entropy": 1.8896478191018105, + "epoch": 0.04916461232881702, + "grad_norm": 10.453010559082031, + "learning_rate": 7.681391068487843e-06, + "loss": 0.6347, + "mean_token_accuracy": 0.8188618034124374, + "num_tokens": 19033759.0, + "step": 15860 + }, + { + "entropy": 1.9267440363764763, + "epoch": 0.049195611453866715, + "grad_norm": 12.614462852478027, + "learning_rate": 7.686234621718494e-06, + "loss": 0.6087, + "mean_token_accuracy": 0.819622540473938, + "num_tokens": 19044630.0, + "step": 15870 + }, + { + "entropy": 1.8585731774568557, + "epoch": 0.04922661057891641, + "grad_norm": 11.191518783569336, + "learning_rate": 7.691078174949144e-06, + "loss": 0.6359, + "mean_token_accuracy": 0.8025752380490303, + "num_tokens": 19056706.0, + "step": 15880 + }, + { + "entropy": 1.8346818998456, + "epoch": 0.04925760970396611, + "grad_norm": 10.337366104125977, + "learning_rate": 7.695921728179794e-06, + "loss": 0.5781, + "mean_token_accuracy": 0.8142318353056908, + "num_tokens": 19069370.0, + "step": 15890 + }, + { + "entropy": 1.8798568680882455, + "epoch": 0.0492886088290158, + "grad_norm": 11.572778701782227, + "learning_rate": 7.700765281410444e-06, + "loss": 0.6304, + "mean_token_accuracy": 0.8030693680047989, + "num_tokens": 19081586.0, + "step": 15900 + }, + { + "entropy": 1.8789875194430352, + "epoch": 0.049319607954065495, + "grad_norm": 6.301096439361572, + "learning_rate": 7.705608834641092e-06, + "loss": 0.618, + "mean_token_accuracy": 0.8181715860962868, + "num_tokens": 19093093.0, + "step": 15910 + }, + { + "entropy": 1.7927953436970712, + "epoch": 0.04935060707911519, + "grad_norm": 6.900021076202393, + "learning_rate": 7.710452387871744e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.8337122991681098, + "num_tokens": 19105589.0, + "step": 15920 + }, + { + "entropy": 1.8942086696624756, + "epoch": 0.04938160620416489, + "grad_norm": 9.592936515808105, + "learning_rate": 7.715295941102394e-06, + "loss": 0.6314, + "mean_token_accuracy": 0.8135825991630554, + "num_tokens": 19117515.0, + "step": 15930 + }, + { + "entropy": 1.8845902875065803, + "epoch": 0.049412605329214584, + "grad_norm": 5.827622890472412, + "learning_rate": 7.720139494333044e-06, + "loss": 0.5792, + "mean_token_accuracy": 0.8127764299511909, + "num_tokens": 19129539.0, + "step": 15940 + }, + { + "entropy": 1.8030100539326668, + "epoch": 0.04944360445426428, + "grad_norm": 10.93111515045166, + "learning_rate": 7.724983047563693e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.8306656986474991, + "num_tokens": 19142450.0, + "step": 15950 + }, + { + "entropy": 1.8265103816986084, + "epoch": 0.04947460357931397, + "grad_norm": 4.990983486175537, + "learning_rate": 7.729826600794343e-06, + "loss": 0.5402, + "mean_token_accuracy": 0.8204995647072792, + "num_tokens": 19154745.0, + "step": 15960 + }, + { + "entropy": 1.8918519958853721, + "epoch": 0.04950560270436367, + "grad_norm": 9.628335952758789, + "learning_rate": 7.734670154024993e-06, + "loss": 0.6261, + "mean_token_accuracy": 0.8113234505057335, + "num_tokens": 19166521.0, + "step": 15970 + }, + { + "entropy": 1.886893954873085, + "epoch": 0.049536601829413364, + "grad_norm": 11.60335922241211, + "learning_rate": 7.739513707255643e-06, + "loss": 0.612, + "mean_token_accuracy": 0.8159346550703048, + "num_tokens": 19177833.0, + "step": 15980 + }, + { + "entropy": 1.831916256248951, + "epoch": 0.04956760095446306, + "grad_norm": 9.314860343933105, + "learning_rate": 7.744357260486293e-06, + "loss": 0.5776, + "mean_token_accuracy": 0.816097392141819, + "num_tokens": 19189742.0, + "step": 15990 + }, + { + "entropy": 1.9015469133853913, + "epoch": 0.04959860007951276, + "grad_norm": 11.75980281829834, + "learning_rate": 7.749200813716945e-06, + "loss": 0.6715, + "mean_token_accuracy": 0.8024095699191094, + "num_tokens": 19200772.0, + "step": 16000 + }, + { + "entropy": 1.8420792520046234, + "epoch": 0.049629599204562454, + "grad_norm": 10.332155227661133, + "learning_rate": 7.754044366947593e-06, + "loss": 0.5571, + "mean_token_accuracy": 0.8230875134468079, + "num_tokens": 19212754.0, + "step": 16010 + }, + { + "entropy": 1.9134253069758416, + "epoch": 0.04966059832961214, + "grad_norm": 12.374646186828613, + "learning_rate": 7.758887920178243e-06, + "loss": 0.6448, + "mean_token_accuracy": 0.8036070108413697, + "num_tokens": 19224725.0, + "step": 16020 + }, + { + "entropy": 1.8665786176919936, + "epoch": 0.04969159745466184, + "grad_norm": 4.8785624504089355, + "learning_rate": 7.763731473408892e-06, + "loss": 0.6109, + "mean_token_accuracy": 0.8155334115028381, + "num_tokens": 19236500.0, + "step": 16030 + }, + { + "entropy": 1.827447460591793, + "epoch": 0.04972259657971154, + "grad_norm": 10.244511604309082, + "learning_rate": 7.768575026639544e-06, + "loss": 0.5549, + "mean_token_accuracy": 0.8148609265685082, + "num_tokens": 19249091.0, + "step": 16040 + }, + { + "entropy": 1.875395241379738, + "epoch": 0.04975359570476123, + "grad_norm": 11.924630165100098, + "learning_rate": 7.773418579870194e-06, + "loss": 0.5739, + "mean_token_accuracy": 0.8249801099300385, + "num_tokens": 19260725.0, + "step": 16050 + }, + { + "entropy": 1.8245926171541214, + "epoch": 0.04978459482981093, + "grad_norm": 5.5314226150512695, + "learning_rate": 7.778262133100844e-06, + "loss": 0.5986, + "mean_token_accuracy": 0.8085296213626861, + "num_tokens": 19272815.0, + "step": 16060 + }, + { + "entropy": 1.879467526078224, + "epoch": 0.049815593954860626, + "grad_norm": 10.704133033752441, + "learning_rate": 7.783105686331494e-06, + "loss": 0.6219, + "mean_token_accuracy": 0.8161226466298104, + "num_tokens": 19284571.0, + "step": 16070 + }, + { + "entropy": 1.8190572828054428, + "epoch": 0.04984659307991032, + "grad_norm": 12.365442276000977, + "learning_rate": 7.787949239562144e-06, + "loss": 0.5841, + "mean_token_accuracy": 0.8116471886634826, + "num_tokens": 19297247.0, + "step": 16080 + }, + { + "entropy": 1.8453327685594558, + "epoch": 0.04987759220496001, + "grad_norm": 11.743905067443848, + "learning_rate": 7.792792792792793e-06, + "loss": 0.5898, + "mean_token_accuracy": 0.8130872502923012, + "num_tokens": 19309118.0, + "step": 16090 + }, + { + "entropy": 1.8390842065215112, + "epoch": 0.04990859133000971, + "grad_norm": 11.728593826293945, + "learning_rate": 7.797636346023443e-06, + "loss": 0.5926, + "mean_token_accuracy": 0.8115570530295372, + "num_tokens": 19321049.0, + "step": 16100 + }, + { + "entropy": 1.8872535049915313, + "epoch": 0.049939590455059406, + "grad_norm": 9.616110801696777, + "learning_rate": 7.802479899254093e-06, + "loss": 0.6174, + "mean_token_accuracy": 0.8136581242084503, + "num_tokens": 19332620.0, + "step": 16110 + }, + { + "entropy": 1.9285971507430077, + "epoch": 0.0499705895801091, + "grad_norm": 10.771924018859863, + "learning_rate": 7.807323452484743e-06, + "loss": 0.6794, + "mean_token_accuracy": 0.7982939928770065, + "num_tokens": 19344422.0, + "step": 16120 + }, + { + "entropy": 1.8309599250555038, + "epoch": 0.0500015887051588, + "grad_norm": 11.928840637207031, + "learning_rate": 7.812167005715393e-06, + "loss": 0.5435, + "mean_token_accuracy": 0.8183940485119819, + "num_tokens": 19356711.0, + "step": 16130 + }, + { + "entropy": 1.7304350897669791, + "epoch": 0.050032587830208496, + "grad_norm": 11.800362586975098, + "learning_rate": 7.817010558946043e-06, + "loss": 0.5798, + "mean_token_accuracy": 0.8197820097208023, + "num_tokens": 19370369.0, + "step": 16140 + }, + { + "entropy": 1.8792952716350555, + "epoch": 0.050063586955258185, + "grad_norm": 12.090376853942871, + "learning_rate": 7.821854112176693e-06, + "loss": 0.6114, + "mean_token_accuracy": 0.8166804388165474, + "num_tokens": 19381499.0, + "step": 16150 + }, + { + "entropy": 1.8935234278440476, + "epoch": 0.05009458608030788, + "grad_norm": 11.883420944213867, + "learning_rate": 7.826697665407343e-06, + "loss": 0.6121, + "mean_token_accuracy": 0.8147160053253174, + "num_tokens": 19392659.0, + "step": 16160 + }, + { + "entropy": 1.846675968170166, + "epoch": 0.05012558520535758, + "grad_norm": 11.219228744506836, + "learning_rate": 7.831541218637994e-06, + "loss": 0.6005, + "mean_token_accuracy": 0.8102226793766022, + "num_tokens": 19404388.0, + "step": 16170 + }, + { + "entropy": 1.7764321342110634, + "epoch": 0.050156584330407275, + "grad_norm": 10.584576606750488, + "learning_rate": 7.836384771868644e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.823691800236702, + "num_tokens": 19417440.0, + "step": 16180 + }, + { + "entropy": 1.82427935898304, + "epoch": 0.05018758345545697, + "grad_norm": 5.237261772155762, + "learning_rate": 7.841228325099294e-06, + "loss": 0.5448, + "mean_token_accuracy": 0.8294490948319435, + "num_tokens": 19429964.0, + "step": 16190 + }, + { + "entropy": 1.830063909292221, + "epoch": 0.05021858258050667, + "grad_norm": 10.078315734863281, + "learning_rate": 7.846071878329944e-06, + "loss": 0.5906, + "mean_token_accuracy": 0.816213846206665, + "num_tokens": 19441944.0, + "step": 16200 + }, + { + "entropy": 1.8124560460448265, + "epoch": 0.05024958170555636, + "grad_norm": 7.47139835357666, + "learning_rate": 7.850915431560594e-06, + "loss": 0.6127, + "mean_token_accuracy": 0.8125569224357605, + "num_tokens": 19454488.0, + "step": 16210 + }, + { + "entropy": 1.8454759851098061, + "epoch": 0.050280580830606054, + "grad_norm": 11.923384666442871, + "learning_rate": 7.855758984791244e-06, + "loss": 0.5783, + "mean_token_accuracy": 0.8168366953730584, + "num_tokens": 19466582.0, + "step": 16220 + }, + { + "entropy": 1.8621361300349235, + "epoch": 0.05031157995565575, + "grad_norm": 11.48657512664795, + "learning_rate": 7.860602538021894e-06, + "loss": 0.6007, + "mean_token_accuracy": 0.8186299324035644, + "num_tokens": 19478249.0, + "step": 16230 + }, + { + "entropy": 1.8037197709083557, + "epoch": 0.05034257908070545, + "grad_norm": 5.228190898895264, + "learning_rate": 7.865446091252543e-06, + "loss": 0.5935, + "mean_token_accuracy": 0.8192895010113717, + "num_tokens": 19489977.0, + "step": 16240 + }, + { + "entropy": 1.9085147067904473, + "epoch": 0.050373578205755144, + "grad_norm": 12.668362617492676, + "learning_rate": 7.870289644483193e-06, + "loss": 0.6271, + "mean_token_accuracy": 0.8066832914948463, + "num_tokens": 19501682.0, + "step": 16250 + }, + { + "entropy": 1.8951284110546112, + "epoch": 0.05040457733080484, + "grad_norm": 5.59218168258667, + "learning_rate": 7.875133197713843e-06, + "loss": 0.5867, + "mean_token_accuracy": 0.8141255229711533, + "num_tokens": 19513281.0, + "step": 16260 + }, + { + "entropy": 1.8662694096565247, + "epoch": 0.05043557645585453, + "grad_norm": 11.334489822387695, + "learning_rate": 7.879976750944493e-06, + "loss": 0.5662, + "mean_token_accuracy": 0.8265201568603515, + "num_tokens": 19525550.0, + "step": 16270 + }, + { + "entropy": 1.838777382671833, + "epoch": 0.05046657558090423, + "grad_norm": 12.854705810546875, + "learning_rate": 7.884820304175143e-06, + "loss": 0.5784, + "mean_token_accuracy": 0.8137482151389122, + "num_tokens": 19537201.0, + "step": 16280 + }, + { + "entropy": 1.8109822481870652, + "epoch": 0.050497574705953924, + "grad_norm": 11.828731536865234, + "learning_rate": 7.889663857405795e-06, + "loss": 0.6238, + "mean_token_accuracy": 0.8079608172178269, + "num_tokens": 19550217.0, + "step": 16290 + }, + { + "entropy": 1.8977823466062547, + "epoch": 0.05052857383100362, + "grad_norm": 6.278931140899658, + "learning_rate": 7.894507410636444e-06, + "loss": 0.63, + "mean_token_accuracy": 0.8086236611008644, + "num_tokens": 19562455.0, + "step": 16300 + }, + { + "entropy": 1.885308313369751, + "epoch": 0.05055957295605332, + "grad_norm": 12.805869102478027, + "learning_rate": 7.899350963867094e-06, + "loss": 0.6222, + "mean_token_accuracy": 0.8057803988456727, + "num_tokens": 19573567.0, + "step": 16310 + }, + { + "entropy": 1.8054832592606544, + "epoch": 0.050590572081103014, + "grad_norm": 10.450759887695312, + "learning_rate": 7.904194517097744e-06, + "loss": 0.5596, + "mean_token_accuracy": 0.8225147247314453, + "num_tokens": 19585912.0, + "step": 16320 + }, + { + "entropy": 1.836893168091774, + "epoch": 0.0506215712061527, + "grad_norm": 8.395682334899902, + "learning_rate": 7.909038070328392e-06, + "loss": 0.604, + "mean_token_accuracy": 0.8111759915947914, + "num_tokens": 19597962.0, + "step": 16330 + }, + { + "entropy": 1.8264792799949645, + "epoch": 0.0506525703312024, + "grad_norm": 9.940167427062988, + "learning_rate": 7.913881623559044e-06, + "loss": 0.5945, + "mean_token_accuracy": 0.8128548488020897, + "num_tokens": 19610522.0, + "step": 16340 + }, + { + "entropy": 1.7342656940221786, + "epoch": 0.050683569456252096, + "grad_norm": 5.958490371704102, + "learning_rate": 7.918725176789694e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8350143611431122, + "num_tokens": 19623800.0, + "step": 16350 + }, + { + "entropy": 1.8424322217702866, + "epoch": 0.05071456858130179, + "grad_norm": 10.20142936706543, + "learning_rate": 7.923568730020344e-06, + "loss": 0.6181, + "mean_token_accuracy": 0.8129007190465927, + "num_tokens": 19635543.0, + "step": 16360 + }, + { + "entropy": 1.8671317547559738, + "epoch": 0.05074556770635149, + "grad_norm": 11.634441375732422, + "learning_rate": 7.928412283250994e-06, + "loss": 0.5988, + "mean_token_accuracy": 0.8180768474936485, + "num_tokens": 19646796.0, + "step": 16370 + }, + { + "entropy": 1.8461050242185593, + "epoch": 0.050776566831401186, + "grad_norm": 5.321835041046143, + "learning_rate": 7.933255836481643e-06, + "loss": 0.6055, + "mean_token_accuracy": 0.8090061485767365, + "num_tokens": 19658855.0, + "step": 16380 + }, + { + "entropy": 1.8895393520593644, + "epoch": 0.050807565956450876, + "grad_norm": 12.092207908630371, + "learning_rate": 7.938099389712293e-06, + "loss": 0.6138, + "mean_token_accuracy": 0.8007647573947907, + "num_tokens": 19670438.0, + "step": 16390 + }, + { + "entropy": 1.93761787712574, + "epoch": 0.05083856508150057, + "grad_norm": 13.244606971740723, + "learning_rate": 7.942942942942943e-06, + "loss": 0.6953, + "mean_token_accuracy": 0.7985577821731568, + "num_tokens": 19681443.0, + "step": 16400 + }, + { + "entropy": 1.8733505114912987, + "epoch": 0.05086956420655027, + "grad_norm": 10.528936386108398, + "learning_rate": 7.947786496173593e-06, + "loss": 0.5767, + "mean_token_accuracy": 0.8295532435178756, + "num_tokens": 19693237.0, + "step": 16410 + }, + { + "entropy": 1.8836349695920944, + "epoch": 0.050900563331599966, + "grad_norm": 9.953451156616211, + "learning_rate": 7.952630049404245e-06, + "loss": 0.5955, + "mean_token_accuracy": 0.8276330158114433, + "num_tokens": 19704482.0, + "step": 16420 + }, + { + "entropy": 1.8280857503414154, + "epoch": 0.05093156245664966, + "grad_norm": 11.806560516357422, + "learning_rate": 7.957473602634893e-06, + "loss": 0.5736, + "mean_token_accuracy": 0.8197454378008843, + "num_tokens": 19716072.0, + "step": 16430 + }, + { + "entropy": 1.868168619275093, + "epoch": 0.05096256158169936, + "grad_norm": 14.613922119140625, + "learning_rate": 7.962317155865543e-06, + "loss": 0.6294, + "mean_token_accuracy": 0.8104723706841469, + "num_tokens": 19727908.0, + "step": 16440 + }, + { + "entropy": 1.8337607711553574, + "epoch": 0.050993560706749055, + "grad_norm": 11.018813133239746, + "learning_rate": 7.967160709096193e-06, + "loss": 0.5722, + "mean_token_accuracy": 0.815605454146862, + "num_tokens": 19740124.0, + "step": 16450 + }, + { + "entropy": 1.8944316014647484, + "epoch": 0.051024559831798745, + "grad_norm": 12.068158149719238, + "learning_rate": 7.972004262326844e-06, + "loss": 0.611, + "mean_token_accuracy": 0.8088446974754333, + "num_tokens": 19752042.0, + "step": 16460 + }, + { + "entropy": 1.8141653031110763, + "epoch": 0.05105555895684844, + "grad_norm": 11.938932418823242, + "learning_rate": 7.976847815557494e-06, + "loss": 0.5529, + "mean_token_accuracy": 0.8189818143844605, + "num_tokens": 19764559.0, + "step": 16470 + }, + { + "entropy": 1.8091833949089051, + "epoch": 0.05108655808189814, + "grad_norm": 6.31152868270874, + "learning_rate": 7.981691368788144e-06, + "loss": 0.6014, + "mean_token_accuracy": 0.8165456935763359, + "num_tokens": 19776260.0, + "step": 16480 + }, + { + "entropy": 1.812686663866043, + "epoch": 0.051117557206947835, + "grad_norm": 5.221765995025635, + "learning_rate": 7.986534922018794e-06, + "loss": 0.5688, + "mean_token_accuracy": 0.8260109111666679, + "num_tokens": 19788934.0, + "step": 16490 + }, + { + "entropy": 1.8625101804733277, + "epoch": 0.05114855633199753, + "grad_norm": 11.6726713180542, + "learning_rate": 7.991378475249444e-06, + "loss": 0.5964, + "mean_token_accuracy": 0.8176575809717178, + "num_tokens": 19800717.0, + "step": 16500 + }, + { + "entropy": 1.8866008058190347, + "epoch": 0.05117955545704723, + "grad_norm": 13.77607250213623, + "learning_rate": 7.996222028480094e-06, + "loss": 0.6522, + "mean_token_accuracy": 0.7992734283208847, + "num_tokens": 19812273.0, + "step": 16510 + }, + { + "entropy": 1.9345821738243103, + "epoch": 0.05121055458209692, + "grad_norm": 10.338969230651855, + "learning_rate": 8.001065581710744e-06, + "loss": 0.684, + "mean_token_accuracy": 0.7988832354545593, + "num_tokens": 19822905.0, + "step": 16520 + }, + { + "entropy": 1.9000392645597457, + "epoch": 0.051241553707146614, + "grad_norm": 10.453362464904785, + "learning_rate": 8.005909134941393e-06, + "loss": 0.6257, + "mean_token_accuracy": 0.8212269991636276, + "num_tokens": 19833662.0, + "step": 16530 + }, + { + "entropy": 1.8661847695708276, + "epoch": 0.05127255283219631, + "grad_norm": 10.485684394836426, + "learning_rate": 8.010752688172043e-06, + "loss": 0.643, + "mean_token_accuracy": 0.8103910699486733, + "num_tokens": 19845416.0, + "step": 16540 + }, + { + "entropy": 1.8219714432954788, + "epoch": 0.05130355195724601, + "grad_norm": 12.01816463470459, + "learning_rate": 8.015596241402693e-06, + "loss": 0.6021, + "mean_token_accuracy": 0.8123174890875816, + "num_tokens": 19857098.0, + "step": 16550 + }, + { + "entropy": 1.8485499978065492, + "epoch": 0.051334551082295704, + "grad_norm": 9.451250076293945, + "learning_rate": 8.020439794633343e-06, + "loss": 0.6128, + "mean_token_accuracy": 0.8077960431575775, + "num_tokens": 19869230.0, + "step": 16560 + }, + { + "entropy": 1.8622397229075431, + "epoch": 0.0513655502073454, + "grad_norm": 10.47313404083252, + "learning_rate": 8.025283347863993e-06, + "loss": 0.6179, + "mean_token_accuracy": 0.8121888309717178, + "num_tokens": 19881415.0, + "step": 16570 + }, + { + "entropy": 1.763058114051819, + "epoch": 0.05139654933239509, + "grad_norm": 5.442761421203613, + "learning_rate": 8.030126901094643e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.8221294581890106, + "num_tokens": 19894415.0, + "step": 16580 + }, + { + "entropy": 1.7793406203389168, + "epoch": 0.05142754845744479, + "grad_norm": 12.98482894897461, + "learning_rate": 8.034970454325294e-06, + "loss": 0.5569, + "mean_token_accuracy": 0.8247652933001518, + "num_tokens": 19907687.0, + "step": 16590 + }, + { + "entropy": 1.874410592019558, + "epoch": 0.051458547582494484, + "grad_norm": 4.950082778930664, + "learning_rate": 8.039814007555944e-06, + "loss": 0.6167, + "mean_token_accuracy": 0.8031128868460655, + "num_tokens": 19919926.0, + "step": 16600 + }, + { + "entropy": 1.8641373217105865, + "epoch": 0.05148954670754418, + "grad_norm": 11.791984558105469, + "learning_rate": 8.044657560786594e-06, + "loss": 0.6205, + "mean_token_accuracy": 0.8170140698552132, + "num_tokens": 19932297.0, + "step": 16610 + }, + { + "entropy": 1.8777590736746788, + "epoch": 0.05152054583259388, + "grad_norm": 14.65937614440918, + "learning_rate": 8.049501114017244e-06, + "loss": 0.6264, + "mean_token_accuracy": 0.8093700706958771, + "num_tokens": 19944296.0, + "step": 16620 + }, + { + "entropy": 1.83215441852808, + "epoch": 0.05155154495764357, + "grad_norm": 18.492868423461914, + "learning_rate": 8.054344667247894e-06, + "loss": 0.5367, + "mean_token_accuracy": 0.83001299649477, + "num_tokens": 19956294.0, + "step": 16630 + }, + { + "entropy": 1.8398912638425826, + "epoch": 0.05158254408269326, + "grad_norm": 12.522979736328125, + "learning_rate": 8.059188220478544e-06, + "loss": 0.615, + "mean_token_accuracy": 0.8146472930908203, + "num_tokens": 19968456.0, + "step": 16640 + }, + { + "entropy": 1.8897781684994697, + "epoch": 0.05161354320774296, + "grad_norm": 11.673059463500977, + "learning_rate": 8.064031773709194e-06, + "loss": 0.6458, + "mean_token_accuracy": 0.8108509004116058, + "num_tokens": 19980146.0, + "step": 16650 + }, + { + "entropy": 1.9111702471971512, + "epoch": 0.051644542332792656, + "grad_norm": 11.109082221984863, + "learning_rate": 8.068875326939844e-06, + "loss": 0.633, + "mean_token_accuracy": 0.8093647316098214, + "num_tokens": 19992383.0, + "step": 16660 + }, + { + "entropy": 1.9211183845996858, + "epoch": 0.05167554145784235, + "grad_norm": 11.176039695739746, + "learning_rate": 8.073718880170494e-06, + "loss": 0.6448, + "mean_token_accuracy": 0.8067373007535934, + "num_tokens": 20002938.0, + "step": 16670 + }, + { + "entropy": 1.8170073732733727, + "epoch": 0.05170654058289205, + "grad_norm": 11.94314193725586, + "learning_rate": 8.078562433401143e-06, + "loss": 0.5507, + "mean_token_accuracy": 0.8301214516162873, + "num_tokens": 20014929.0, + "step": 16680 + }, + { + "entropy": 1.8377844780683517, + "epoch": 0.051737539707941746, + "grad_norm": 9.638590812683105, + "learning_rate": 8.083405986631793e-06, + "loss": 0.5725, + "mean_token_accuracy": 0.8208416193723679, + "num_tokens": 20026901.0, + "step": 16690 + }, + { + "entropy": 1.866816473007202, + "epoch": 0.051768538832991436, + "grad_norm": 10.777813911437988, + "learning_rate": 8.088249539862443e-06, + "loss": 0.639, + "mean_token_accuracy": 0.8086013346910477, + "num_tokens": 20038260.0, + "step": 16700 + }, + { + "entropy": 1.8705125212669373, + "epoch": 0.05179953795804113, + "grad_norm": 12.59802532196045, + "learning_rate": 8.093093093093095e-06, + "loss": 0.5945, + "mean_token_accuracy": 0.8204304948449135, + "num_tokens": 20049362.0, + "step": 16710 + }, + { + "entropy": 1.81406359821558, + "epoch": 0.05183053708309083, + "grad_norm": 14.44949722290039, + "learning_rate": 8.097936646323745e-06, + "loss": 0.6025, + "mean_token_accuracy": 0.8081631407141685, + "num_tokens": 20062608.0, + "step": 16720 + }, + { + "entropy": 1.9008635878562927, + "epoch": 0.051861536208140525, + "grad_norm": 9.96707534790039, + "learning_rate": 8.102780199554395e-06, + "loss": 0.6399, + "mean_token_accuracy": 0.8111796498298645, + "num_tokens": 20074235.0, + "step": 16730 + }, + { + "entropy": 1.8784829795360565, + "epoch": 0.05189253533319022, + "grad_norm": 11.282867431640625, + "learning_rate": 8.107623752785044e-06, + "loss": 0.5778, + "mean_token_accuracy": 0.8110943511128426, + "num_tokens": 20086392.0, + "step": 16740 + }, + { + "entropy": 1.915185084939003, + "epoch": 0.05192353445823992, + "grad_norm": 10.151687622070312, + "learning_rate": 8.112467306015693e-06, + "loss": 0.6498, + "mean_token_accuracy": 0.8103260189294815, + "num_tokens": 20097631.0, + "step": 16750 + }, + { + "entropy": 1.791477920114994, + "epoch": 0.05195453358328961, + "grad_norm": 5.157210350036621, + "learning_rate": 8.117310859246344e-06, + "loss": 0.5729, + "mean_token_accuracy": 0.8181603282690049, + "num_tokens": 20111249.0, + "step": 16760 + }, + { + "entropy": 1.8779030591249466, + "epoch": 0.051985532708339305, + "grad_norm": 10.711991310119629, + "learning_rate": 8.122154412476994e-06, + "loss": 0.6099, + "mean_token_accuracy": 0.8104298338294029, + "num_tokens": 20122999.0, + "step": 16770 + }, + { + "entropy": 1.8585640251636506, + "epoch": 0.052016531833389, + "grad_norm": 5.962745666503906, + "learning_rate": 8.126997965707644e-06, + "loss": 0.5765, + "mean_token_accuracy": 0.8122934922575951, + "num_tokens": 20135004.0, + "step": 16780 + }, + { + "entropy": 1.8683840811252594, + "epoch": 0.0520475309584387, + "grad_norm": 10.104706764221191, + "learning_rate": 8.131841518938294e-06, + "loss": 0.5892, + "mean_token_accuracy": 0.8203515037894249, + "num_tokens": 20146845.0, + "step": 16790 + }, + { + "entropy": 1.8777433142066002, + "epoch": 0.052078530083488395, + "grad_norm": 11.71458625793457, + "learning_rate": 8.136685072168944e-06, + "loss": 0.6173, + "mean_token_accuracy": 0.816298334300518, + "num_tokens": 20157635.0, + "step": 16800 + }, + { + "entropy": 1.8254910349845885, + "epoch": 0.05210952920853809, + "grad_norm": 12.500772476196289, + "learning_rate": 8.141528625399594e-06, + "loss": 0.5744, + "mean_token_accuracy": 0.8122817128896713, + "num_tokens": 20169780.0, + "step": 16810 + }, + { + "entropy": 1.7841567173600197, + "epoch": 0.05214052833358779, + "grad_norm": 5.604972839355469, + "learning_rate": 8.146372178630243e-06, + "loss": 0.5449, + "mean_token_accuracy": 0.8187017843127251, + "num_tokens": 20182608.0, + "step": 16820 + }, + { + "entropy": 1.8085433050990105, + "epoch": 0.05217152745863748, + "grad_norm": 12.112675666809082, + "learning_rate": 8.151215731860895e-06, + "loss": 0.5584, + "mean_token_accuracy": 0.819066570699215, + "num_tokens": 20194982.0, + "step": 16830 + }, + { + "entropy": 1.8413061544299125, + "epoch": 0.052202526583687174, + "grad_norm": 12.422449111938477, + "learning_rate": 8.156059285091545e-06, + "loss": 0.5611, + "mean_token_accuracy": 0.8272005066275596, + "num_tokens": 20206354.0, + "step": 16840 + }, + { + "entropy": 1.8800123199820518, + "epoch": 0.05223352570873687, + "grad_norm": 5.7091779708862305, + "learning_rate": 8.160902838322193e-06, + "loss": 0.6096, + "mean_token_accuracy": 0.8087597385048866, + "num_tokens": 20218011.0, + "step": 16850 + }, + { + "entropy": 1.7969383016228675, + "epoch": 0.05226452483378657, + "grad_norm": 10.848820686340332, + "learning_rate": 8.165746391552843e-06, + "loss": 0.5235, + "mean_token_accuracy": 0.8236014172434807, + "num_tokens": 20230550.0, + "step": 16860 + }, + { + "entropy": 1.8589994862675667, + "epoch": 0.052295523958836264, + "grad_norm": 10.558844566345215, + "learning_rate": 8.170589944783493e-06, + "loss": 0.56, + "mean_token_accuracy": 0.8191485345363617, + "num_tokens": 20242485.0, + "step": 16870 + }, + { + "entropy": 1.8285365521907806, + "epoch": 0.05232652308388596, + "grad_norm": 10.265873908996582, + "learning_rate": 8.175433498014144e-06, + "loss": 0.5369, + "mean_token_accuracy": 0.827988238632679, + "num_tokens": 20254416.0, + "step": 16880 + }, + { + "entropy": 1.7561014324426651, + "epoch": 0.05235752220893565, + "grad_norm": 10.504303932189941, + "learning_rate": 8.180277051244794e-06, + "loss": 0.5194, + "mean_token_accuracy": 0.8308162286877632, + "num_tokens": 20267582.0, + "step": 16890 + }, + { + "entropy": 1.8213954389095306, + "epoch": 0.05238852133398535, + "grad_norm": 11.907408714294434, + "learning_rate": 8.185120604475444e-06, + "loss": 0.5975, + "mean_token_accuracy": 0.8134548336267471, + "num_tokens": 20279358.0, + "step": 16900 + }, + { + "entropy": 1.855673785507679, + "epoch": 0.05241952045903504, + "grad_norm": 12.381454467773438, + "learning_rate": 8.189964157706094e-06, + "loss": 0.6341, + "mean_token_accuracy": 0.8102763295173645, + "num_tokens": 20291624.0, + "step": 16910 + }, + { + "entropy": 1.8303613483905792, + "epoch": 0.05245051958408474, + "grad_norm": 11.09730052947998, + "learning_rate": 8.194807710936744e-06, + "loss": 0.5403, + "mean_token_accuracy": 0.8215647727251053, + "num_tokens": 20304029.0, + "step": 16920 + }, + { + "entropy": 1.7606044977903366, + "epoch": 0.052481518709134437, + "grad_norm": 14.970794677734375, + "learning_rate": 8.199651264167394e-06, + "loss": 0.5461, + "mean_token_accuracy": 0.8292558521032334, + "num_tokens": 20317399.0, + "step": 16930 + }, + { + "entropy": 1.9114146530628204, + "epoch": 0.05251251783418413, + "grad_norm": 11.057762145996094, + "learning_rate": 8.204494817398044e-06, + "loss": 0.6605, + "mean_token_accuracy": 0.8093030959367752, + "num_tokens": 20329008.0, + "step": 16940 + }, + { + "entropy": 1.9093146950006485, + "epoch": 0.05254351695923382, + "grad_norm": 10.629437446594238, + "learning_rate": 8.209338370628694e-06, + "loss": 0.6731, + "mean_token_accuracy": 0.8056405037641525, + "num_tokens": 20340231.0, + "step": 16950 + }, + { + "entropy": 1.8436574935913086, + "epoch": 0.05257451608428352, + "grad_norm": 11.923999786376953, + "learning_rate": 8.214181923859344e-06, + "loss": 0.5748, + "mean_token_accuracy": 0.8199576199054718, + "num_tokens": 20352072.0, + "step": 16960 + }, + { + "entropy": 1.8853593990206718, + "epoch": 0.052605515209333216, + "grad_norm": 10.41568660736084, + "learning_rate": 8.219025477089993e-06, + "loss": 0.6458, + "mean_token_accuracy": 0.8017956003546715, + "num_tokens": 20363499.0, + "step": 16970 + }, + { + "entropy": 1.918022060394287, + "epoch": 0.05263651433438291, + "grad_norm": 8.378069877624512, + "learning_rate": 8.223869030320643e-06, + "loss": 0.6415, + "mean_token_accuracy": 0.8088062942028046, + "num_tokens": 20375464.0, + "step": 16980 + }, + { + "entropy": 1.8862889647483825, + "epoch": 0.05266751345943261, + "grad_norm": 10.294898986816406, + "learning_rate": 8.228712583551293e-06, + "loss": 0.636, + "mean_token_accuracy": 0.8037711903452873, + "num_tokens": 20387540.0, + "step": 16990 + }, + { + "entropy": 1.9208069905638694, + "epoch": 0.052698512584482306, + "grad_norm": 10.81685733795166, + "learning_rate": 8.233556136781943e-06, + "loss": 0.7022, + "mean_token_accuracy": 0.8014640405774116, + "num_tokens": 20399798.0, + "step": 17000 + }, + { + "entropy": 1.8465980917215348, + "epoch": 0.052729511709531995, + "grad_norm": 12.846884727478027, + "learning_rate": 8.238399690012595e-06, + "loss": 0.592, + "mean_token_accuracy": 0.820027408003807, + "num_tokens": 20411783.0, + "step": 17010 + }, + { + "entropy": 1.8712544098496438, + "epoch": 0.05276051083458169, + "grad_norm": 12.44199275970459, + "learning_rate": 8.243243243243245e-06, + "loss": 0.6153, + "mean_token_accuracy": 0.8144369840621948, + "num_tokens": 20423720.0, + "step": 17020 + }, + { + "entropy": 1.861243399977684, + "epoch": 0.05279150995963139, + "grad_norm": 10.822843551635742, + "learning_rate": 8.248086796473894e-06, + "loss": 0.6249, + "mean_token_accuracy": 0.8173502340912819, + "num_tokens": 20435655.0, + "step": 17030 + }, + { + "entropy": 1.831656913459301, + "epoch": 0.052822509084681085, + "grad_norm": 10.160820960998535, + "learning_rate": 8.252930349704544e-06, + "loss": 0.5545, + "mean_token_accuracy": 0.8063213124871254, + "num_tokens": 20449025.0, + "step": 17040 + }, + { + "entropy": 1.78798815459013, + "epoch": 0.05285350820973078, + "grad_norm": 5.9856414794921875, + "learning_rate": 8.257773902935194e-06, + "loss": 0.5513, + "mean_token_accuracy": 0.8148110911250115, + "num_tokens": 20461447.0, + "step": 17050 + }, + { + "entropy": 1.86494023501873, + "epoch": 0.05288450733478048, + "grad_norm": 9.526678085327148, + "learning_rate": 8.262617456165844e-06, + "loss": 0.6113, + "mean_token_accuracy": 0.815309202671051, + "num_tokens": 20472983.0, + "step": 17060 + }, + { + "entropy": 1.8194462105631828, + "epoch": 0.05291550645983017, + "grad_norm": 5.433764934539795, + "learning_rate": 8.267461009396494e-06, + "loss": 0.5941, + "mean_token_accuracy": 0.8140083074569702, + "num_tokens": 20485617.0, + "step": 17070 + }, + { + "entropy": 1.876285555958748, + "epoch": 0.052946505584879865, + "grad_norm": 5.180232524871826, + "learning_rate": 8.272304562627144e-06, + "loss": 0.5879, + "mean_token_accuracy": 0.8107918843626976, + "num_tokens": 20496764.0, + "step": 17080 + }, + { + "entropy": 1.7992719858884811, + "epoch": 0.05297750470992956, + "grad_norm": 10.29720687866211, + "learning_rate": 8.277148115857794e-06, + "loss": 0.5607, + "mean_token_accuracy": 0.8232490435242653, + "num_tokens": 20509654.0, + "step": 17090 + }, + { + "entropy": 1.8305759117007256, + "epoch": 0.05300850383497926, + "grad_norm": 11.629636764526367, + "learning_rate": 8.281991669088444e-06, + "loss": 0.5491, + "mean_token_accuracy": 0.8326339006423951, + "num_tokens": 20521330.0, + "step": 17100 + }, + { + "entropy": 1.8921547949314117, + "epoch": 0.053039502960028954, + "grad_norm": 10.69015884399414, + "learning_rate": 8.286835222319093e-06, + "loss": 0.5905, + "mean_token_accuracy": 0.8138958439230919, + "num_tokens": 20532830.0, + "step": 17110 + }, + { + "entropy": 1.8374129354953765, + "epoch": 0.05307050208507865, + "grad_norm": 14.065573692321777, + "learning_rate": 8.291678775549743e-06, + "loss": 0.5772, + "mean_token_accuracy": 0.8190833449363708, + "num_tokens": 20545189.0, + "step": 17120 + }, + { + "entropy": 1.8749100968241692, + "epoch": 0.05310150121012834, + "grad_norm": 11.14418888092041, + "learning_rate": 8.296522328780395e-06, + "loss": 0.5848, + "mean_token_accuracy": 0.8282869830727577, + "num_tokens": 20557028.0, + "step": 17130 + }, + { + "entropy": 1.9455086424946786, + "epoch": 0.05313250033517804, + "grad_norm": 6.915708065032959, + "learning_rate": 8.301365882011045e-06, + "loss": 0.6958, + "mean_token_accuracy": 0.7915912076830864, + "num_tokens": 20567980.0, + "step": 17140 + }, + { + "entropy": 1.962218326330185, + "epoch": 0.053163499460227734, + "grad_norm": 12.134001731872559, + "learning_rate": 8.306209435241695e-06, + "loss": 0.6929, + "mean_token_accuracy": 0.8000232353806496, + "num_tokens": 20578839.0, + "step": 17150 + }, + { + "entropy": 1.845068359375, + "epoch": 0.05319449858527743, + "grad_norm": 11.644062995910645, + "learning_rate": 8.311052988472345e-06, + "loss": 0.6593, + "mean_token_accuracy": 0.8096413478255272, + "num_tokens": 20590558.0, + "step": 17160 + }, + { + "entropy": 1.8619411289691925, + "epoch": 0.05322549771032713, + "grad_norm": 10.61169719696045, + "learning_rate": 8.315896541702993e-06, + "loss": 0.5858, + "mean_token_accuracy": 0.8225790098309517, + "num_tokens": 20601969.0, + "step": 17170 + }, + { + "entropy": 1.8861952975392342, + "epoch": 0.053256496835376824, + "grad_norm": 5.271222114562988, + "learning_rate": 8.320740094933644e-06, + "loss": 0.6097, + "mean_token_accuracy": 0.8090833589434624, + "num_tokens": 20614378.0, + "step": 17180 + }, + { + "entropy": 1.7966760843992233, + "epoch": 0.05328749596042652, + "grad_norm": 10.908940315246582, + "learning_rate": 8.325583648164294e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.8201152965426445, + "num_tokens": 20628222.0, + "step": 17190 + }, + { + "entropy": 1.8242161065340041, + "epoch": 0.05331849508547621, + "grad_norm": 5.578505992889404, + "learning_rate": 8.330427201394944e-06, + "loss": 0.5196, + "mean_token_accuracy": 0.8250388026237487, + "num_tokens": 20641356.0, + "step": 17200 + }, + { + "entropy": 1.9080461800098418, + "epoch": 0.053349494210525907, + "grad_norm": 10.300594329833984, + "learning_rate": 8.335270754625594e-06, + "loss": 0.6218, + "mean_token_accuracy": 0.8083810389041901, + "num_tokens": 20653007.0, + "step": 17210 + }, + { + "entropy": 1.9093635857105256, + "epoch": 0.0533804933355756, + "grad_norm": 11.184460639953613, + "learning_rate": 8.340114307856244e-06, + "loss": 0.6377, + "mean_token_accuracy": 0.8104760378599167, + "num_tokens": 20663792.0, + "step": 17220 + }, + { + "entropy": 1.9165063306689263, + "epoch": 0.0534114924606253, + "grad_norm": 10.442662239074707, + "learning_rate": 8.344957861086894e-06, + "loss": 0.5942, + "mean_token_accuracy": 0.8143234044313431, + "num_tokens": 20675785.0, + "step": 17230 + }, + { + "entropy": 1.9359146520495414, + "epoch": 0.053442491585674996, + "grad_norm": 8.756479263305664, + "learning_rate": 8.349801414317544e-06, + "loss": 0.6133, + "mean_token_accuracy": 0.8071959316730499, + "num_tokens": 20687331.0, + "step": 17240 + }, + { + "entropy": 1.945779299736023, + "epoch": 0.05347349071072469, + "grad_norm": 11.406476020812988, + "learning_rate": 8.354644967548195e-06, + "loss": 0.6008, + "mean_token_accuracy": 0.8256149128079414, + "num_tokens": 20697727.0, + "step": 17250 + }, + { + "entropy": 1.8896187216043472, + "epoch": 0.05350448983577438, + "grad_norm": 12.807340621948242, + "learning_rate": 8.359488520778845e-06, + "loss": 0.6007, + "mean_token_accuracy": 0.8113436698913574, + "num_tokens": 20709839.0, + "step": 17260 + }, + { + "entropy": 1.8658792898058891, + "epoch": 0.05353548896082408, + "grad_norm": 11.434633255004883, + "learning_rate": 8.364332074009493e-06, + "loss": 0.6164, + "mean_token_accuracy": 0.806533083319664, + "num_tokens": 20721529.0, + "step": 17270 + }, + { + "entropy": 1.8530214801430702, + "epoch": 0.053566488085873776, + "grad_norm": 10.531983375549316, + "learning_rate": 8.369175627240143e-06, + "loss": 0.5375, + "mean_token_accuracy": 0.819164864718914, + "num_tokens": 20734410.0, + "step": 17280 + }, + { + "entropy": 1.864107219874859, + "epoch": 0.05359748721092347, + "grad_norm": 11.01248550415039, + "learning_rate": 8.374019180470793e-06, + "loss": 0.6512, + "mean_token_accuracy": 0.8096802055835723, + "num_tokens": 20746182.0, + "step": 17290 + }, + { + "entropy": 1.8512002035975457, + "epoch": 0.05362848633597317, + "grad_norm": 10.909806251525879, + "learning_rate": 8.378862733701445e-06, + "loss": 0.6076, + "mean_token_accuracy": 0.8117505341768265, + "num_tokens": 20758266.0, + "step": 17300 + }, + { + "entropy": 1.9608330637216569, + "epoch": 0.053659485461022866, + "grad_norm": 9.702219009399414, + "learning_rate": 8.383706286932095e-06, + "loss": 0.6645, + "mean_token_accuracy": 0.8040051057934761, + "num_tokens": 20768751.0, + "step": 17310 + }, + { + "entropy": 1.9042405292391777, + "epoch": 0.053690484586072555, + "grad_norm": 10.729706764221191, + "learning_rate": 8.388549840162744e-06, + "loss": 0.6057, + "mean_token_accuracy": 0.817810270190239, + "num_tokens": 20780886.0, + "step": 17320 + }, + { + "entropy": 1.8193862006068229, + "epoch": 0.05372148371112225, + "grad_norm": 5.850473880767822, + "learning_rate": 8.393393393393394e-06, + "loss": 0.5504, + "mean_token_accuracy": 0.8224448531866073, + "num_tokens": 20793098.0, + "step": 17330 + }, + { + "entropy": 1.7956085950136185, + "epoch": 0.05375248283617195, + "grad_norm": 4.813007354736328, + "learning_rate": 8.398236946624044e-06, + "loss": 0.5571, + "mean_token_accuracy": 0.818726347386837, + "num_tokens": 20806408.0, + "step": 17340 + }, + { + "entropy": 1.8429106876254082, + "epoch": 0.053783481961221645, + "grad_norm": 4.898284435272217, + "learning_rate": 8.403080499854694e-06, + "loss": 0.5251, + "mean_token_accuracy": 0.828820888698101, + "num_tokens": 20819353.0, + "step": 17350 + }, + { + "entropy": 1.8711024463176726, + "epoch": 0.05381448108627134, + "grad_norm": 6.471118927001953, + "learning_rate": 8.407924053085344e-06, + "loss": 0.588, + "mean_token_accuracy": 0.8236263871192933, + "num_tokens": 20830636.0, + "step": 17360 + }, + { + "entropy": 1.8240604743361473, + "epoch": 0.05384548021132104, + "grad_norm": 10.193392753601074, + "learning_rate": 8.412767606315994e-06, + "loss": 0.5247, + "mean_token_accuracy": 0.8229044646024704, + "num_tokens": 20843265.0, + "step": 17370 + }, + { + "entropy": 1.9022013053297997, + "epoch": 0.05387647933637073, + "grad_norm": 11.367668151855469, + "learning_rate": 8.417611159546644e-06, + "loss": 0.6635, + "mean_token_accuracy": 0.8005659952759743, + "num_tokens": 20855224.0, + "step": 17380 + }, + { + "entropy": 1.874705444276333, + "epoch": 0.053907478461420424, + "grad_norm": 13.563992500305176, + "learning_rate": 8.422454712777294e-06, + "loss": 0.6151, + "mean_token_accuracy": 0.8215208485722542, + "num_tokens": 20866903.0, + "step": 17390 + }, + { + "entropy": 1.9510533064603806, + "epoch": 0.05393847758647012, + "grad_norm": 12.109170913696289, + "learning_rate": 8.427298266007944e-06, + "loss": 0.6787, + "mean_token_accuracy": 0.793503324687481, + "num_tokens": 20878367.0, + "step": 17400 + }, + { + "entropy": 1.877579514682293, + "epoch": 0.05396947671151982, + "grad_norm": 10.051459312438965, + "learning_rate": 8.432141819238593e-06, + "loss": 0.5547, + "mean_token_accuracy": 0.8256360620260239, + "num_tokens": 20890360.0, + "step": 17410 + }, + { + "entropy": 1.9146401450037955, + "epoch": 0.054000475836569514, + "grad_norm": 10.120962142944336, + "learning_rate": 8.436985372469243e-06, + "loss": 0.6321, + "mean_token_accuracy": 0.8045231074094772, + "num_tokens": 20902287.0, + "step": 17420 + }, + { + "entropy": 1.8735201194882394, + "epoch": 0.05403147496161921, + "grad_norm": 5.3620524406433105, + "learning_rate": 8.441828925699895e-06, + "loss": 0.6369, + "mean_token_accuracy": 0.811458395421505, + "num_tokens": 20913921.0, + "step": 17430 + }, + { + "entropy": 1.8790420606732368, + "epoch": 0.0540624740866689, + "grad_norm": 11.361139297485352, + "learning_rate": 8.446672478930545e-06, + "loss": 0.5784, + "mean_token_accuracy": 0.8240441083908081, + "num_tokens": 20925521.0, + "step": 17440 + }, + { + "entropy": 1.8804828137159348, + "epoch": 0.0540934732117186, + "grad_norm": 12.176139831542969, + "learning_rate": 8.451516032161195e-06, + "loss": 0.5856, + "mean_token_accuracy": 0.8197601407766342, + "num_tokens": 20937530.0, + "step": 17450 + }, + { + "entropy": 1.8536333724856378, + "epoch": 0.054124472336768294, + "grad_norm": 8.680792808532715, + "learning_rate": 8.456359585391845e-06, + "loss": 0.5507, + "mean_token_accuracy": 0.8246132522821427, + "num_tokens": 20949885.0, + "step": 17460 + }, + { + "entropy": 1.9130322173237801, + "epoch": 0.05415547146181799, + "grad_norm": 9.968182563781738, + "learning_rate": 8.461203138622494e-06, + "loss": 0.6107, + "mean_token_accuracy": 0.8138394117355346, + "num_tokens": 20961126.0, + "step": 17470 + }, + { + "entropy": 1.8640489429235458, + "epoch": 0.05418647058686769, + "grad_norm": 14.096504211425781, + "learning_rate": 8.466046691853144e-06, + "loss": 0.5921, + "mean_token_accuracy": 0.8181562379002572, + "num_tokens": 20973024.0, + "step": 17480 + }, + { + "entropy": 1.7738539189100266, + "epoch": 0.05421746971191738, + "grad_norm": 10.170340538024902, + "learning_rate": 8.470890245083794e-06, + "loss": 0.5353, + "mean_token_accuracy": 0.8179555460810661, + "num_tokens": 20985918.0, + "step": 17490 + }, + { + "entropy": 1.8802258223295212, + "epoch": 0.05424846883696708, + "grad_norm": 10.039809226989746, + "learning_rate": 8.475733798314444e-06, + "loss": 0.6347, + "mean_token_accuracy": 0.8079760164022446, + "num_tokens": 20998102.0, + "step": 17500 + }, + { + "entropy": 1.9156350553035737, + "epoch": 0.05427946796201677, + "grad_norm": 9.540949821472168, + "learning_rate": 8.480577351545094e-06, + "loss": 0.6298, + "mean_token_accuracy": 0.8177491262555122, + "num_tokens": 21009239.0, + "step": 17510 + }, + { + "entropy": 1.804891037940979, + "epoch": 0.054310467087066466, + "grad_norm": 9.833329200744629, + "learning_rate": 8.485420904775744e-06, + "loss": 0.6428, + "mean_token_accuracy": 0.808480116724968, + "num_tokens": 21021894.0, + "step": 17520 + }, + { + "entropy": 1.9138577803969383, + "epoch": 0.05434146621211616, + "grad_norm": 6.600327968597412, + "learning_rate": 8.490264458006394e-06, + "loss": 0.5808, + "mean_token_accuracy": 0.8139492854475975, + "num_tokens": 21033383.0, + "step": 17530 + }, + { + "entropy": 1.7570361971855164, + "epoch": 0.05437246533716586, + "grad_norm": 6.329018592834473, + "learning_rate": 8.495108011237044e-06, + "loss": 0.5523, + "mean_token_accuracy": 0.818885837495327, + "num_tokens": 21047301.0, + "step": 17540 + }, + { + "entropy": 1.9657986283302307, + "epoch": 0.054403464462215556, + "grad_norm": 10.000340461730957, + "learning_rate": 8.499951564467695e-06, + "loss": 0.7018, + "mean_token_accuracy": 0.7982934132218361, + "num_tokens": 21058156.0, + "step": 17550 + }, + { + "entropy": 1.881192384660244, + "epoch": 0.05443446358726525, + "grad_norm": 9.785136222839355, + "learning_rate": 8.504795117698345e-06, + "loss": 0.6109, + "mean_token_accuracy": 0.8185616865754127, + "num_tokens": 21070030.0, + "step": 17560 + }, + { + "entropy": 1.8533698439598083, + "epoch": 0.05446546271231494, + "grad_norm": 4.606391906738281, + "learning_rate": 8.509638670928995e-06, + "loss": 0.6352, + "mean_token_accuracy": 0.8094547167420387, + "num_tokens": 21082808.0, + "step": 17570 + }, + { + "entropy": 1.8658347025513649, + "epoch": 0.05449646183736464, + "grad_norm": 10.683303833007812, + "learning_rate": 8.514482224159645e-06, + "loss": 0.5723, + "mean_token_accuracy": 0.8231870085000992, + "num_tokens": 21095739.0, + "step": 17580 + }, + { + "entropy": 1.9149486675858498, + "epoch": 0.054527460962414336, + "grad_norm": 7.256167411804199, + "learning_rate": 8.519325777390293e-06, + "loss": 0.6271, + "mean_token_accuracy": 0.8080606922507286, + "num_tokens": 21107649.0, + "step": 17590 + }, + { + "entropy": 1.857093758881092, + "epoch": 0.05455846008746403, + "grad_norm": 10.606932640075684, + "learning_rate": 8.524169330620945e-06, + "loss": 0.575, + "mean_token_accuracy": 0.8211929991841316, + "num_tokens": 21120550.0, + "step": 17600 + }, + { + "entropy": 1.8073217749595643, + "epoch": 0.05458945921251373, + "grad_norm": 10.49376392364502, + "learning_rate": 8.529012883851594e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.8263013437390327, + "num_tokens": 21134529.0, + "step": 17610 + }, + { + "entropy": 1.8306412398815155, + "epoch": 0.054620458337563425, + "grad_norm": 9.479334831237793, + "learning_rate": 8.533856437082244e-06, + "loss": 0.6191, + "mean_token_accuracy": 0.8138529911637307, + "num_tokens": 21148789.0, + "step": 17620 + }, + { + "entropy": 1.9288539364933968, + "epoch": 0.054651457462613115, + "grad_norm": 9.992156028747559, + "learning_rate": 8.538699990312894e-06, + "loss": 0.6597, + "mean_token_accuracy": 0.8028325289487839, + "num_tokens": 21160363.0, + "step": 17630 + }, + { + "entropy": 1.865708639472723, + "epoch": 0.05468245658766281, + "grad_norm": 5.635770797729492, + "learning_rate": 8.543543543543544e-06, + "loss": 0.5549, + "mean_token_accuracy": 0.8194977372884751, + "num_tokens": 21173133.0, + "step": 17640 + }, + { + "entropy": 1.8686399355530738, + "epoch": 0.05471345571271251, + "grad_norm": 14.837812423706055, + "learning_rate": 8.548387096774194e-06, + "loss": 0.5678, + "mean_token_accuracy": 0.8228330850601197, + "num_tokens": 21185944.0, + "step": 17650 + }, + { + "entropy": 1.936301600933075, + "epoch": 0.054744454837762205, + "grad_norm": 9.631260871887207, + "learning_rate": 8.553230650004844e-06, + "loss": 0.5762, + "mean_token_accuracy": 0.8145893216133118, + "num_tokens": 21197684.0, + "step": 17660 + }, + { + "entropy": 1.9080315500497818, + "epoch": 0.0547754539628119, + "grad_norm": 9.75123119354248, + "learning_rate": 8.558074203235495e-06, + "loss": 0.5961, + "mean_token_accuracy": 0.8067243576049805, + "num_tokens": 21209448.0, + "step": 17670 + }, + { + "entropy": 1.941755273938179, + "epoch": 0.0548064530878616, + "grad_norm": 7.447671413421631, + "learning_rate": 8.562917756466145e-06, + "loss": 0.6306, + "mean_token_accuracy": 0.8096268206834794, + "num_tokens": 21222011.0, + "step": 17680 + }, + { + "entropy": 1.9120370209217072, + "epoch": 0.05483745221291129, + "grad_norm": 5.153918266296387, + "learning_rate": 8.567761309696794e-06, + "loss": 0.5873, + "mean_token_accuracy": 0.8094914838671684, + "num_tokens": 21234368.0, + "step": 17690 + }, + { + "entropy": 1.8817522883415223, + "epoch": 0.054868451337960984, + "grad_norm": 10.505568504333496, + "learning_rate": 8.572604862927443e-06, + "loss": 0.5737, + "mean_token_accuracy": 0.8192173585295677, + "num_tokens": 21246765.0, + "step": 17700 + }, + { + "entropy": 1.8698602363467216, + "epoch": 0.05489945046301068, + "grad_norm": 10.905664443969727, + "learning_rate": 8.577448416158093e-06, + "loss": 0.5834, + "mean_token_accuracy": 0.81576436907053, + "num_tokens": 21258879.0, + "step": 17710 + }, + { + "entropy": 1.8876920908689498, + "epoch": 0.05493044958806038, + "grad_norm": 10.402029991149902, + "learning_rate": 8.582291969388745e-06, + "loss": 0.5738, + "mean_token_accuracy": 0.8234436243772507, + "num_tokens": 21270505.0, + "step": 17720 + }, + { + "entropy": 1.8734422281384469, + "epoch": 0.054961448713110074, + "grad_norm": 10.90580940246582, + "learning_rate": 8.587135522619395e-06, + "loss": 0.5968, + "mean_token_accuracy": 0.8190149515867233, + "num_tokens": 21282053.0, + "step": 17730 + }, + { + "entropy": 1.8915531307458877, + "epoch": 0.05499244783815977, + "grad_norm": 8.619651794433594, + "learning_rate": 8.591979075850045e-06, + "loss": 0.6141, + "mean_token_accuracy": 0.8292980402708053, + "num_tokens": 21293252.0, + "step": 17740 + }, + { + "entropy": 1.819742462038994, + "epoch": 0.05502344696320946, + "grad_norm": 9.859935760498047, + "learning_rate": 8.596822629080695e-06, + "loss": 0.5696, + "mean_token_accuracy": 0.825517275929451, + "num_tokens": 21305648.0, + "step": 17750 + }, + { + "entropy": 1.7411428809165954, + "epoch": 0.05505444608825916, + "grad_norm": 11.464165687561035, + "learning_rate": 8.601666182311344e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.8340383976697922, + "num_tokens": 21320159.0, + "step": 17760 + }, + { + "entropy": 1.919995127618313, + "epoch": 0.05508544521330885, + "grad_norm": 4.660171985626221, + "learning_rate": 8.606509735541994e-06, + "loss": 0.5619, + "mean_token_accuracy": 0.8194687977433205, + "num_tokens": 21331854.0, + "step": 17770 + }, + { + "entropy": 1.826239649951458, + "epoch": 0.05511644433835855, + "grad_norm": 6.208508014678955, + "learning_rate": 8.611353288772644e-06, + "loss": 0.519, + "mean_token_accuracy": 0.8199943840503693, + "num_tokens": 21344553.0, + "step": 17780 + }, + { + "entropy": 1.964565047621727, + "epoch": 0.05514744346340825, + "grad_norm": 11.809846878051758, + "learning_rate": 8.616196842003294e-06, + "loss": 0.6973, + "mean_token_accuracy": 0.8042270794510842, + "num_tokens": 21355485.0, + "step": 17790 + }, + { + "entropy": 1.9378721952438354, + "epoch": 0.05517844258845794, + "grad_norm": 9.962228775024414, + "learning_rate": 8.621040395233944e-06, + "loss": 0.6338, + "mean_token_accuracy": 0.8136247247457504, + "num_tokens": 21367049.0, + "step": 17800 + }, + { + "entropy": 1.894200675189495, + "epoch": 0.05520944171350763, + "grad_norm": 4.6598663330078125, + "learning_rate": 8.625883948464594e-06, + "loss": 0.5756, + "mean_token_accuracy": 0.8180066585540772, + "num_tokens": 21378413.0, + "step": 17810 + }, + { + "entropy": 1.828821636736393, + "epoch": 0.05524044083855733, + "grad_norm": 5.244216442108154, + "learning_rate": 8.630727501695244e-06, + "loss": 0.5283, + "mean_token_accuracy": 0.8234935000538826, + "num_tokens": 21391978.0, + "step": 17820 + }, + { + "entropy": 1.8749590143561363, + "epoch": 0.055271439963607026, + "grad_norm": 11.188740730285645, + "learning_rate": 8.635571054925894e-06, + "loss": 0.5804, + "mean_token_accuracy": 0.8215572342276574, + "num_tokens": 21403442.0, + "step": 17830 + }, + { + "entropy": 1.792897927761078, + "epoch": 0.05530243908865672, + "grad_norm": 5.151137828826904, + "learning_rate": 8.640414608156543e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8246389016509056, + "num_tokens": 21416621.0, + "step": 17840 + }, + { + "entropy": 1.9171807587146759, + "epoch": 0.05533343821370642, + "grad_norm": 10.319511413574219, + "learning_rate": 8.645258161387195e-06, + "loss": 0.7074, + "mean_token_accuracy": 0.8059808582067489, + "num_tokens": 21428271.0, + "step": 17850 + }, + { + "entropy": 1.8502112567424773, + "epoch": 0.055364437338756116, + "grad_norm": 9.720250129699707, + "learning_rate": 8.650101714617845e-06, + "loss": 0.636, + "mean_token_accuracy": 0.8085378974676132, + "num_tokens": 21439709.0, + "step": 17860 + }, + { + "entropy": 1.8581691205501556, + "epoch": 0.05539543646380581, + "grad_norm": 10.996453285217285, + "learning_rate": 8.654945267848495e-06, + "loss": 0.6044, + "mean_token_accuracy": 0.8172203198075294, + "num_tokens": 21451408.0, + "step": 17870 + }, + { + "entropy": 1.8972811639308929, + "epoch": 0.0554264355888555, + "grad_norm": 10.46535587310791, + "learning_rate": 8.659788821079145e-06, + "loss": 0.6667, + "mean_token_accuracy": 0.8105733722448349, + "num_tokens": 21463410.0, + "step": 17880 + }, + { + "entropy": 1.801578015089035, + "epoch": 0.0554574347139052, + "grad_norm": 11.458210945129395, + "learning_rate": 8.664632374309795e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8232032969594002, + "num_tokens": 21476435.0, + "step": 17890 + }, + { + "entropy": 1.8537754505872726, + "epoch": 0.055488433838954895, + "grad_norm": 9.625106811523438, + "learning_rate": 8.669475927540444e-06, + "loss": 0.5613, + "mean_token_accuracy": 0.8271110355854034, + "num_tokens": 21488620.0, + "step": 17900 + }, + { + "entropy": 1.920037829875946, + "epoch": 0.05551943296400459, + "grad_norm": 11.167927742004395, + "learning_rate": 8.674319480771094e-06, + "loss": 0.633, + "mean_token_accuracy": 0.8028773024678231, + "num_tokens": 21500165.0, + "step": 17910 + }, + { + "entropy": 1.8313481405377388, + "epoch": 0.05555043208905429, + "grad_norm": 12.061219215393066, + "learning_rate": 8.679163034001744e-06, + "loss": 0.5336, + "mean_token_accuracy": 0.8166040554642677, + "num_tokens": 21513677.0, + "step": 17920 + }, + { + "entropy": 2.0305935621261595, + "epoch": 0.055581431214103985, + "grad_norm": 10.752704620361328, + "learning_rate": 8.684006587232394e-06, + "loss": 0.6779, + "mean_token_accuracy": 0.8040038585662842, + "num_tokens": 21524307.0, + "step": 17930 + }, + { + "entropy": 1.8893893539905549, + "epoch": 0.055612430339153675, + "grad_norm": 11.609795570373535, + "learning_rate": 8.688850140463044e-06, + "loss": 0.6344, + "mean_token_accuracy": 0.8086724013090134, + "num_tokens": 21536735.0, + "step": 17940 + }, + { + "entropy": 1.8628504529595376, + "epoch": 0.05564342946420337, + "grad_norm": 7.905481815338135, + "learning_rate": 8.693693693693694e-06, + "loss": 0.5808, + "mean_token_accuracy": 0.8129374206066131, + "num_tokens": 21549205.0, + "step": 17950 + }, + { + "entropy": 1.884210042655468, + "epoch": 0.05567442858925307, + "grad_norm": 12.496581077575684, + "learning_rate": 8.698537246924344e-06, + "loss": 0.5238, + "mean_token_accuracy": 0.8244456693530082, + "num_tokens": 21560770.0, + "step": 17960 + }, + { + "entropy": 1.8549162834882735, + "epoch": 0.055705427714302765, + "grad_norm": 10.498296737670898, + "learning_rate": 8.703380800154995e-06, + "loss": 0.5502, + "mean_token_accuracy": 0.8179799810051918, + "num_tokens": 21572851.0, + "step": 17970 + }, + { + "entropy": 1.9428973540663719, + "epoch": 0.05573642683935246, + "grad_norm": 9.824682235717773, + "learning_rate": 8.708224353385645e-06, + "loss": 0.6371, + "mean_token_accuracy": 0.8067533582448959, + "num_tokens": 21584287.0, + "step": 17980 + }, + { + "entropy": 1.89179328083992, + "epoch": 0.05576742596440216, + "grad_norm": 5.299261093139648, + "learning_rate": 8.713067906616295e-06, + "loss": 0.5831, + "mean_token_accuracy": 0.8168211251497268, + "num_tokens": 21596202.0, + "step": 17990 + }, + { + "entropy": 1.8720185294747353, + "epoch": 0.05579842508945185, + "grad_norm": 9.72613525390625, + "learning_rate": 8.717911459846945e-06, + "loss": 0.5672, + "mean_token_accuracy": 0.8193200185894967, + "num_tokens": 21608507.0, + "step": 18000 + }, + { + "entropy": 1.90470210313797, + "epoch": 0.055829424214501544, + "grad_norm": 11.743047714233398, + "learning_rate": 8.722755013077593e-06, + "loss": 0.6317, + "mean_token_accuracy": 0.8138886854052544, + "num_tokens": 21619927.0, + "step": 18010 + }, + { + "entropy": 1.9255423635244369, + "epoch": 0.05586042333955124, + "grad_norm": 10.3324613571167, + "learning_rate": 8.727598566308245e-06, + "loss": 0.6288, + "mean_token_accuracy": 0.8161659583449363, + "num_tokens": 21631633.0, + "step": 18020 + }, + { + "entropy": 1.7495290905237197, + "epoch": 0.05589142246460094, + "grad_norm": 11.315978050231934, + "learning_rate": 8.732442119538895e-06, + "loss": 0.5321, + "mean_token_accuracy": 0.8315317690372467, + "num_tokens": 21645071.0, + "step": 18030 + }, + { + "entropy": 1.8392060235142709, + "epoch": 0.055922421589650634, + "grad_norm": 10.65888786315918, + "learning_rate": 8.737285672769545e-06, + "loss": 0.5339, + "mean_token_accuracy": 0.8336151942610741, + "num_tokens": 21657281.0, + "step": 18040 + }, + { + "entropy": 1.9446683064103127, + "epoch": 0.05595342071470033, + "grad_norm": 13.20337963104248, + "learning_rate": 8.742129226000194e-06, + "loss": 0.682, + "mean_token_accuracy": 0.7979879945516586, + "num_tokens": 21668367.0, + "step": 18050 + }, + { + "entropy": 1.8128132298588753, + "epoch": 0.05598441983975002, + "grad_norm": 11.751765251159668, + "learning_rate": 8.746972779230844e-06, + "loss": 0.585, + "mean_token_accuracy": 0.8146183237433433, + "num_tokens": 21681270.0, + "step": 18060 + }, + { + "entropy": 1.8214144945144652, + "epoch": 0.05601541896479972, + "grad_norm": 10.238035202026367, + "learning_rate": 8.751816332461494e-06, + "loss": 0.585, + "mean_token_accuracy": 0.8190832883119583, + "num_tokens": 21694477.0, + "step": 18070 + }, + { + "entropy": 1.8992233827710152, + "epoch": 0.05604641808984941, + "grad_norm": 3.5823521614074707, + "learning_rate": 8.756659885692144e-06, + "loss": 0.6326, + "mean_token_accuracy": 0.8172223970293999, + "num_tokens": 21706135.0, + "step": 18080 + }, + { + "entropy": 1.8768006905913353, + "epoch": 0.05607741721489911, + "grad_norm": 12.048982620239258, + "learning_rate": 8.761503438922796e-06, + "loss": 0.6044, + "mean_token_accuracy": 0.8237545311450958, + "num_tokens": 21717528.0, + "step": 18090 + }, + { + "entropy": 1.8304587841033935, + "epoch": 0.056108416339948806, + "grad_norm": 10.88693904876709, + "learning_rate": 8.766346992153446e-06, + "loss": 0.5422, + "mean_token_accuracy": 0.8200173109769822, + "num_tokens": 21730335.0, + "step": 18100 + }, + { + "entropy": 1.7798964753746986, + "epoch": 0.0561394154649985, + "grad_norm": 13.059136390686035, + "learning_rate": 8.771190545384094e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8268529579043389, + "num_tokens": 21743306.0, + "step": 18110 + }, + { + "entropy": 1.9081214651465417, + "epoch": 0.05617041459004819, + "grad_norm": 11.699897766113281, + "learning_rate": 8.776034098614744e-06, + "loss": 0.6163, + "mean_token_accuracy": 0.8113586112856865, + "num_tokens": 21754558.0, + "step": 18120 + }, + { + "entropy": 1.8716801404953003, + "epoch": 0.05620141371509789, + "grad_norm": 11.486919403076172, + "learning_rate": 8.780877651845394e-06, + "loss": 0.6531, + "mean_token_accuracy": 0.7999576240777969, + "num_tokens": 21766487.0, + "step": 18130 + }, + { + "entropy": 1.8077737405896186, + "epoch": 0.056232412840147586, + "grad_norm": 10.865653991699219, + "learning_rate": 8.785721205076045e-06, + "loss": 0.5829, + "mean_token_accuracy": 0.8171229988336564, + "num_tokens": 21779437.0, + "step": 18140 + }, + { + "entropy": 1.93416608273983, + "epoch": 0.05626341196519728, + "grad_norm": 9.039263725280762, + "learning_rate": 8.790564758306695e-06, + "loss": 0.6632, + "mean_token_accuracy": 0.8090454161167144, + "num_tokens": 21790560.0, + "step": 18150 + }, + { + "entropy": 1.9330880522727967, + "epoch": 0.05629441109024698, + "grad_norm": 9.971395492553711, + "learning_rate": 8.795408311537345e-06, + "loss": 0.5888, + "mean_token_accuracy": 0.8192548274993896, + "num_tokens": 21802186.0, + "step": 18160 + }, + { + "entropy": 1.8966943442821502, + "epoch": 0.056325410215296676, + "grad_norm": 5.171525955200195, + "learning_rate": 8.800251864767995e-06, + "loss": 0.609, + "mean_token_accuracy": 0.8174291670322418, + "num_tokens": 21813141.0, + "step": 18170 + }, + { + "entropy": 1.9147083714604378, + "epoch": 0.056356409340346365, + "grad_norm": 10.910831451416016, + "learning_rate": 8.805095417998645e-06, + "loss": 0.6791, + "mean_token_accuracy": 0.802092096209526, + "num_tokens": 21824549.0, + "step": 18180 + }, + { + "entropy": 1.845213919878006, + "epoch": 0.05638740846539606, + "grad_norm": 11.23419189453125, + "learning_rate": 8.809938971229295e-06, + "loss": 0.541, + "mean_token_accuracy": 0.836587692797184, + "num_tokens": 21837064.0, + "step": 18190 + }, + { + "entropy": 1.9281988382339477, + "epoch": 0.05641840759044576, + "grad_norm": 12.443825721740723, + "learning_rate": 8.814782524459944e-06, + "loss": 0.6652, + "mean_token_accuracy": 0.8060685142874717, + "num_tokens": 21849025.0, + "step": 18200 + }, + { + "entropy": 1.985886165499687, + "epoch": 0.056449406715495455, + "grad_norm": 11.284612655639648, + "learning_rate": 8.819626077690594e-06, + "loss": 0.6496, + "mean_token_accuracy": 0.8076219454407692, + "num_tokens": 21859566.0, + "step": 18210 + }, + { + "entropy": 1.9329922825098038, + "epoch": 0.05648040584054515, + "grad_norm": 8.869474411010742, + "learning_rate": 8.824469630921244e-06, + "loss": 0.6058, + "mean_token_accuracy": 0.810475055873394, + "num_tokens": 21871056.0, + "step": 18220 + }, + { + "entropy": 1.920028594136238, + "epoch": 0.05651140496559485, + "grad_norm": 11.169597625732422, + "learning_rate": 8.829313184151894e-06, + "loss": 0.636, + "mean_token_accuracy": 0.8107326775789261, + "num_tokens": 21881630.0, + "step": 18230 + }, + { + "entropy": 1.8644825682044028, + "epoch": 0.056542404090644545, + "grad_norm": 10.988966941833496, + "learning_rate": 8.834156737382544e-06, + "loss": 0.5143, + "mean_token_accuracy": 0.8243206977844239, + "num_tokens": 21893921.0, + "step": 18240 + }, + { + "entropy": 1.8154290586709976, + "epoch": 0.056573403215694235, + "grad_norm": 6.226930141448975, + "learning_rate": 8.839000290613194e-06, + "loss": 0.5837, + "mean_token_accuracy": 0.8170310199260712, + "num_tokens": 21906253.0, + "step": 18250 + }, + { + "entropy": 1.859774151444435, + "epoch": 0.05660440234074393, + "grad_norm": 11.1119384765625, + "learning_rate": 8.843843843843844e-06, + "loss": 0.6233, + "mean_token_accuracy": 0.8082350715994835, + "num_tokens": 21918932.0, + "step": 18260 + }, + { + "entropy": 1.9023708343505858, + "epoch": 0.05663540146579363, + "grad_norm": 10.345152854919434, + "learning_rate": 8.848687397074495e-06, + "loss": 0.5928, + "mean_token_accuracy": 0.8202927365899086, + "num_tokens": 21930745.0, + "step": 18270 + }, + { + "entropy": 1.8009515717625617, + "epoch": 0.056666400590843324, + "grad_norm": 4.616584777832031, + "learning_rate": 8.853530950305145e-06, + "loss": 0.5181, + "mean_token_accuracy": 0.8291145578026772, + "num_tokens": 21943477.0, + "step": 18280 + }, + { + "entropy": 1.910208411514759, + "epoch": 0.05669739971589302, + "grad_norm": 10.986778259277344, + "learning_rate": 8.858374503535795e-06, + "loss": 0.6053, + "mean_token_accuracy": 0.8097553431987763, + "num_tokens": 21955368.0, + "step": 18290 + }, + { + "entropy": 1.8674479261040688, + "epoch": 0.05672839884094272, + "grad_norm": 12.478263854980469, + "learning_rate": 8.863218056766445e-06, + "loss": 0.592, + "mean_token_accuracy": 0.8068441540002823, + "num_tokens": 21968150.0, + "step": 18300 + }, + { + "entropy": 1.9316591426730156, + "epoch": 0.05675939796599241, + "grad_norm": 13.325005531311035, + "learning_rate": 8.868061609997095e-06, + "loss": 0.5848, + "mean_token_accuracy": 0.8228493258357048, + "num_tokens": 21979588.0, + "step": 18310 + }, + { + "entropy": 1.8889947533607483, + "epoch": 0.056790397091042104, + "grad_norm": 10.683235168457031, + "learning_rate": 8.872905163227745e-06, + "loss": 0.6101, + "mean_token_accuracy": 0.8133319228887558, + "num_tokens": 21991387.0, + "step": 18320 + }, + { + "entropy": 1.83803388774395, + "epoch": 0.0568213962160918, + "grad_norm": 9.95289421081543, + "learning_rate": 8.877748716458395e-06, + "loss": 0.5467, + "mean_token_accuracy": 0.8165292799472809, + "num_tokens": 22004539.0, + "step": 18330 + }, + { + "entropy": 1.8539226263761521, + "epoch": 0.0568523953411415, + "grad_norm": 11.232304573059082, + "learning_rate": 8.882592269689044e-06, + "loss": 0.5972, + "mean_token_accuracy": 0.8155701532959938, + "num_tokens": 22017151.0, + "step": 18340 + }, + { + "entropy": 1.8634928598999978, + "epoch": 0.056883394466191194, + "grad_norm": 10.557002067565918, + "learning_rate": 8.887435822919694e-06, + "loss": 0.5348, + "mean_token_accuracy": 0.8263462707400322, + "num_tokens": 22029875.0, + "step": 18350 + }, + { + "entropy": 1.8347053781151772, + "epoch": 0.05691439359124089, + "grad_norm": 5.2376389503479, + "learning_rate": 8.892279376150344e-06, + "loss": 0.553, + "mean_token_accuracy": 0.82252157330513, + "num_tokens": 22042520.0, + "step": 18360 + }, + { + "entropy": 1.9364128440618515, + "epoch": 0.05694539271629058, + "grad_norm": 5.532094955444336, + "learning_rate": 8.897122929380994e-06, + "loss": 0.673, + "mean_token_accuracy": 0.800466650724411, + "num_tokens": 22054370.0, + "step": 18370 + }, + { + "entropy": 1.9340432003140449, + "epoch": 0.056976391841340276, + "grad_norm": 11.368268966674805, + "learning_rate": 8.901966482611644e-06, + "loss": 0.6135, + "mean_token_accuracy": 0.8022024169564247, + "num_tokens": 22066153.0, + "step": 18380 + }, + { + "entropy": 1.8881090357899666, + "epoch": 0.05700739096638997, + "grad_norm": 5.1713972091674805, + "learning_rate": 8.906810035842296e-06, + "loss": 0.5272, + "mean_token_accuracy": 0.8244768619537354, + "num_tokens": 22078654.0, + "step": 18390 + }, + { + "entropy": 1.919769637286663, + "epoch": 0.05703839009143967, + "grad_norm": 8.57338809967041, + "learning_rate": 8.911653589072945e-06, + "loss": 0.6129, + "mean_token_accuracy": 0.8175665631890296, + "num_tokens": 22091226.0, + "step": 18400 + }, + { + "entropy": 1.7635185047984123, + "epoch": 0.057069389216489366, + "grad_norm": 3.2729835510253906, + "learning_rate": 8.916497142303595e-06, + "loss": 0.514, + "mean_token_accuracy": 0.8288957670331001, + "num_tokens": 22105037.0, + "step": 18410 + }, + { + "entropy": 1.84332295358181, + "epoch": 0.05710038834153906, + "grad_norm": 11.345158576965332, + "learning_rate": 8.921340695534245e-06, + "loss": 0.5933, + "mean_token_accuracy": 0.8165907859802246, + "num_tokens": 22117072.0, + "step": 18420 + }, + { + "entropy": 1.8966827645897866, + "epoch": 0.05713138746658875, + "grad_norm": 12.612526893615723, + "learning_rate": 8.926184248764893e-06, + "loss": 0.6189, + "mean_token_accuracy": 0.8158532097935677, + "num_tokens": 22127952.0, + "step": 18430 + }, + { + "entropy": 1.8704052582383155, + "epoch": 0.05716238659163845, + "grad_norm": 5.032566070556641, + "learning_rate": 8.931027801995545e-06, + "loss": 0.5832, + "mean_token_accuracy": 0.8137423068284988, + "num_tokens": 22139709.0, + "step": 18440 + }, + { + "entropy": 1.820935921370983, + "epoch": 0.057193385716688146, + "grad_norm": 10.067622184753418, + "learning_rate": 8.935871355226195e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8368907868862152, + "num_tokens": 22152377.0, + "step": 18450 + }, + { + "entropy": 1.896417185664177, + "epoch": 0.05722438484173784, + "grad_norm": 12.381418228149414, + "learning_rate": 8.940714908456845e-06, + "loss": 0.6965, + "mean_token_accuracy": 0.8038837254047394, + "num_tokens": 22163577.0, + "step": 18460 + }, + { + "entropy": 1.8900171846151352, + "epoch": 0.05725538396678754, + "grad_norm": 11.906807899475098, + "learning_rate": 8.945558461687495e-06, + "loss": 0.6449, + "mean_token_accuracy": 0.8201625868678093, + "num_tokens": 22174708.0, + "step": 18470 + }, + { + "entropy": 1.9111595183610917, + "epoch": 0.057286383091837235, + "grad_norm": 11.19156265258789, + "learning_rate": 8.950402014918145e-06, + "loss": 0.6555, + "mean_token_accuracy": 0.802647915482521, + "num_tokens": 22186789.0, + "step": 18480 + }, + { + "entropy": 1.9203668639063836, + "epoch": 0.057317382216886925, + "grad_norm": 11.311783790588379, + "learning_rate": 8.955245568148794e-06, + "loss": 0.6169, + "mean_token_accuracy": 0.8133427366614342, + "num_tokens": 22198408.0, + "step": 18490 + }, + { + "entropy": 1.9032350957393647, + "epoch": 0.05734838134193662, + "grad_norm": 13.073493957519531, + "learning_rate": 8.960089121379444e-06, + "loss": 0.6344, + "mean_token_accuracy": 0.80216343998909, + "num_tokens": 22210305.0, + "step": 18500 + }, + { + "entropy": 1.8219037756323815, + "epoch": 0.05737938046698632, + "grad_norm": 9.571296691894531, + "learning_rate": 8.964932674610096e-06, + "loss": 0.5245, + "mean_token_accuracy": 0.8286261260509491, + "num_tokens": 22222972.0, + "step": 18510 + }, + { + "entropy": 1.8522532120347024, + "epoch": 0.057410379592036015, + "grad_norm": 5.730122089385986, + "learning_rate": 8.969776227840746e-06, + "loss": 0.5858, + "mean_token_accuracy": 0.8103675618767738, + "num_tokens": 22235073.0, + "step": 18520 + }, + { + "entropy": 1.851724323630333, + "epoch": 0.05744137871708571, + "grad_norm": 9.87784194946289, + "learning_rate": 8.974619781071394e-06, + "loss": 0.5931, + "mean_token_accuracy": 0.8163824722170829, + "num_tokens": 22247790.0, + "step": 18530 + }, + { + "entropy": 1.8631928369402886, + "epoch": 0.05747237784213541, + "grad_norm": 5.00623893737793, + "learning_rate": 8.979463334302044e-06, + "loss": 0.559, + "mean_token_accuracy": 0.8131022065877914, + "num_tokens": 22260240.0, + "step": 18540 + }, + { + "entropy": 1.910364383459091, + "epoch": 0.0575033769671851, + "grad_norm": 11.106468200683594, + "learning_rate": 8.984306887532694e-06, + "loss": 0.6496, + "mean_token_accuracy": 0.8063996240496636, + "num_tokens": 22272059.0, + "step": 18550 + }, + { + "entropy": 1.9144617825746537, + "epoch": 0.057534376092234794, + "grad_norm": 10.869990348815918, + "learning_rate": 8.989150440763345e-06, + "loss": 0.6416, + "mean_token_accuracy": 0.8138428092002868, + "num_tokens": 22283072.0, + "step": 18560 + }, + { + "entropy": 1.9242763727903367, + "epoch": 0.05756537521728449, + "grad_norm": 10.430070877075195, + "learning_rate": 8.993993993993995e-06, + "loss": 0.6072, + "mean_token_accuracy": 0.8113959297537804, + "num_tokens": 22294697.0, + "step": 18570 + }, + { + "entropy": 1.9588421791791917, + "epoch": 0.05759637434233419, + "grad_norm": 12.86335277557373, + "learning_rate": 8.998837547224645e-06, + "loss": 0.6676, + "mean_token_accuracy": 0.80771906375885, + "num_tokens": 22305842.0, + "step": 18580 + }, + { + "entropy": 1.8290845677256584, + "epoch": 0.057627373467383884, + "grad_norm": 11.009787559509277, + "learning_rate": 9.003681100455295e-06, + "loss": 0.5679, + "mean_token_accuracy": 0.819172665476799, + "num_tokens": 22319673.0, + "step": 18590 + }, + { + "entropy": 1.8799395859241486, + "epoch": 0.05765837259243358, + "grad_norm": 5.414979457855225, + "learning_rate": 9.008524653685945e-06, + "loss": 0.5858, + "mean_token_accuracy": 0.8109808668494225, + "num_tokens": 22332734.0, + "step": 18600 + }, + { + "entropy": 1.9298516079783439, + "epoch": 0.05768937171748328, + "grad_norm": 4.051270961761475, + "learning_rate": 9.013368206916595e-06, + "loss": 0.6212, + "mean_token_accuracy": 0.8061932161450386, + "num_tokens": 22344081.0, + "step": 18610 + }, + { + "entropy": 1.9745507806539535, + "epoch": 0.05772037084253297, + "grad_norm": 10.910243034362793, + "learning_rate": 9.018211760147245e-06, + "loss": 0.7182, + "mean_token_accuracy": 0.7929084002971649, + "num_tokens": 22355535.0, + "step": 18620 + }, + { + "entropy": 1.880644341558218, + "epoch": 0.057751369967582664, + "grad_norm": 6.737995624542236, + "learning_rate": 9.023055313377894e-06, + "loss": 0.5881, + "mean_token_accuracy": 0.8097090050578117, + "num_tokens": 22368050.0, + "step": 18630 + }, + { + "entropy": 1.9336914867162704, + "epoch": 0.05778236909263236, + "grad_norm": 9.826704978942871, + "learning_rate": 9.027898866608544e-06, + "loss": 0.6545, + "mean_token_accuracy": 0.8081561967730522, + "num_tokens": 22379637.0, + "step": 18640 + }, + { + "entropy": 1.9320360347628593, + "epoch": 0.05781336821768206, + "grad_norm": 5.833460330963135, + "learning_rate": 9.032742419839194e-06, + "loss": 0.5905, + "mean_token_accuracy": 0.8115993320941925, + "num_tokens": 22391313.0, + "step": 18650 + }, + { + "entropy": 1.952963298559189, + "epoch": 0.05784436734273175, + "grad_norm": 5.079139709472656, + "learning_rate": 9.037585973069844e-06, + "loss": 0.6632, + "mean_token_accuracy": 0.8034544378519058, + "num_tokens": 22402538.0, + "step": 18660 + }, + { + "entropy": 1.7565592139959336, + "epoch": 0.05787536646778145, + "grad_norm": 4.124721050262451, + "learning_rate": 9.042429526300494e-06, + "loss": 0.5241, + "mean_token_accuracy": 0.8233913615345955, + "num_tokens": 22416764.0, + "step": 18670 + }, + { + "entropy": 1.900513444840908, + "epoch": 0.05790636559283114, + "grad_norm": 9.634764671325684, + "learning_rate": 9.047273079531144e-06, + "loss": 0.573, + "mean_token_accuracy": 0.8233014553785324, + "num_tokens": 22428670.0, + "step": 18680 + }, + { + "entropy": 1.9495866417884826, + "epoch": 0.057937364717880836, + "grad_norm": Infinity, + "learning_rate": 9.052116632761795e-06, + "loss": 0.6543, + "mean_token_accuracy": 0.8013565137982368, + "num_tokens": 22439439.0, + "step": 18690 + }, + { + "entropy": 1.8713518410921097, + "epoch": 0.05796836384293053, + "grad_norm": 9.244678497314453, + "learning_rate": 9.056960185992445e-06, + "loss": 0.5676, + "mean_token_accuracy": 0.8261402577161789, + "num_tokens": 22450954.0, + "step": 18700 + }, + { + "entropy": 1.867144940048456, + "epoch": 0.05799936296798023, + "grad_norm": 5.15731143951416, + "learning_rate": 9.061803739223095e-06, + "loss": 0.5672, + "mean_token_accuracy": 0.814095051586628, + "num_tokens": 22463821.0, + "step": 18710 + }, + { + "entropy": 1.8838742166757583, + "epoch": 0.058030362093029926, + "grad_norm": 10.645661354064941, + "learning_rate": 9.066647292453745e-06, + "loss": 0.5763, + "mean_token_accuracy": 0.82436051517725, + "num_tokens": 22475600.0, + "step": 18720 + }, + { + "entropy": 1.8638369679450988, + "epoch": 0.05806136121807962, + "grad_norm": 14.198400497436523, + "learning_rate": 9.071490845684395e-06, + "loss": 0.6381, + "mean_token_accuracy": 0.8040808498859405, + "num_tokens": 22487874.0, + "step": 18730 + }, + { + "entropy": 1.8371347174048425, + "epoch": 0.05809236034312931, + "grad_norm": 5.825725555419922, + "learning_rate": 9.076334398915045e-06, + "loss": 0.6015, + "mean_token_accuracy": 0.8231277003884315, + "num_tokens": 22500865.0, + "step": 18740 + }, + { + "entropy": 1.9247653126716613, + "epoch": 0.05812335946817901, + "grad_norm": 12.100114822387695, + "learning_rate": 9.081177952145695e-06, + "loss": 0.6471, + "mean_token_accuracy": 0.8075411379337311, + "num_tokens": 22511426.0, + "step": 18750 + }, + { + "entropy": 1.8342102706432342, + "epoch": 0.058154358593228705, + "grad_norm": 9.929917335510254, + "learning_rate": 9.086021505376345e-06, + "loss": 0.6312, + "mean_token_accuracy": 0.7977308183908463, + "num_tokens": 22524288.0, + "step": 18760 + }, + { + "entropy": 1.7855632066726685, + "epoch": 0.0581853577182784, + "grad_norm": 5.584056854248047, + "learning_rate": 9.090865058606995e-06, + "loss": 0.5442, + "mean_token_accuracy": 0.8210761070251464, + "num_tokens": 22537475.0, + "step": 18770 + }, + { + "entropy": 1.8851278007030488, + "epoch": 0.0582163568433281, + "grad_norm": 10.225055694580078, + "learning_rate": 9.095708611837644e-06, + "loss": 0.6466, + "mean_token_accuracy": 0.8099891990423203, + "num_tokens": 22548279.0, + "step": 18780 + }, + { + "entropy": 1.841400173306465, + "epoch": 0.058247355968377795, + "grad_norm": 9.732316017150879, + "learning_rate": 9.100552165068294e-06, + "loss": 0.5658, + "mean_token_accuracy": 0.820391620695591, + "num_tokens": 22560249.0, + "step": 18790 + }, + { + "entropy": 1.7526367500424385, + "epoch": 0.058278355093427485, + "grad_norm": 10.728232383728027, + "learning_rate": 9.105395718298944e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8198321342468262, + "num_tokens": 22573834.0, + "step": 18800 + }, + { + "entropy": 1.8276120245456695, + "epoch": 0.05830935421847718, + "grad_norm": 11.885202407836914, + "learning_rate": 9.110239271529596e-06, + "loss": 0.5502, + "mean_token_accuracy": 0.8176509469747544, + "num_tokens": 22586688.0, + "step": 18810 + }, + { + "entropy": 1.8741453379392623, + "epoch": 0.05834035334352688, + "grad_norm": 12.088459968566895, + "learning_rate": 9.115082824760246e-06, + "loss": 0.6949, + "mean_token_accuracy": 0.7987207651138306, + "num_tokens": 22598358.0, + "step": 18820 + }, + { + "entropy": 1.8715686663985251, + "epoch": 0.058371352468576575, + "grad_norm": 10.86611270904541, + "learning_rate": 9.119926377990896e-06, + "loss": 0.6264, + "mean_token_accuracy": 0.8120866730809212, + "num_tokens": 22609875.0, + "step": 18830 + }, + { + "entropy": 1.844074723124504, + "epoch": 0.05840235159362627, + "grad_norm": 10.710127830505371, + "learning_rate": 9.124769931221545e-06, + "loss": 0.6134, + "mean_token_accuracy": 0.8119207620620728, + "num_tokens": 22621314.0, + "step": 18840 + }, + { + "entropy": 1.8183260425925254, + "epoch": 0.05843335071867597, + "grad_norm": 9.820539474487305, + "learning_rate": 9.129613484452194e-06, + "loss": 0.6007, + "mean_token_accuracy": 0.809662489593029, + "num_tokens": 22633297.0, + "step": 18850 + }, + { + "entropy": 1.8575424775481224, + "epoch": 0.05846434984372566, + "grad_norm": 9.236701011657715, + "learning_rate": 9.134457037682845e-06, + "loss": 0.6365, + "mean_token_accuracy": 0.8071271240711212, + "num_tokens": 22645167.0, + "step": 18860 + }, + { + "entropy": 1.8752565145492555, + "epoch": 0.058495348968775354, + "grad_norm": 5.298765659332275, + "learning_rate": 9.139300590913495e-06, + "loss": 0.5903, + "mean_token_accuracy": 0.8160139411687851, + "num_tokens": 22656745.0, + "step": 18870 + }, + { + "entropy": 1.8490721896290778, + "epoch": 0.05852634809382505, + "grad_norm": 10.962682723999023, + "learning_rate": 9.144144144144145e-06, + "loss": 0.6161, + "mean_token_accuracy": 0.8067515045404434, + "num_tokens": 22669776.0, + "step": 18880 + }, + { + "entropy": 1.882719586789608, + "epoch": 0.05855734721887475, + "grad_norm": 10.184673309326172, + "learning_rate": 9.148987697374795e-06, + "loss": 0.6346, + "mean_token_accuracy": 0.8162581637501717, + "num_tokens": 22681064.0, + "step": 18890 + }, + { + "entropy": 1.7815841138362885, + "epoch": 0.058588346343924444, + "grad_norm": 9.868931770324707, + "learning_rate": 9.153831250605445e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8312078416347504, + "num_tokens": 22694642.0, + "step": 18900 + }, + { + "entropy": 1.8093548193573952, + "epoch": 0.05861934546897414, + "grad_norm": 10.930723190307617, + "learning_rate": 9.158674803836095e-06, + "loss": 0.5375, + "mean_token_accuracy": 0.8200191363692284, + "num_tokens": 22707548.0, + "step": 18910 + }, + { + "entropy": 1.8189376056194306, + "epoch": 0.05865034459402383, + "grad_norm": 9.895159721374512, + "learning_rate": 9.163518357066745e-06, + "loss": 0.6132, + "mean_token_accuracy": 0.8140119895339012, + "num_tokens": 22720201.0, + "step": 18920 + }, + { + "entropy": 1.8482043415307998, + "epoch": 0.05868134371907353, + "grad_norm": 10.34995174407959, + "learning_rate": 9.168361910297396e-06, + "loss": 0.5611, + "mean_token_accuracy": 0.8194187000393868, + "num_tokens": 22732586.0, + "step": 18930 + }, + { + "entropy": 1.8547095090150834, + "epoch": 0.05871234284412322, + "grad_norm": 9.405129432678223, + "learning_rate": 9.173205463528046e-06, + "loss": 0.6397, + "mean_token_accuracy": 0.8073139742016793, + "num_tokens": 22743872.0, + "step": 18940 + }, + { + "entropy": 1.7496973037719727, + "epoch": 0.05874334196917292, + "grad_norm": 11.039433479309082, + "learning_rate": 9.178049016758694e-06, + "loss": 0.537, + "mean_token_accuracy": 0.8219654500484467, + "num_tokens": 22756915.0, + "step": 18950 + }, + { + "entropy": 1.8445199981331826, + "epoch": 0.05877434109422262, + "grad_norm": 10.18836498260498, + "learning_rate": 9.182892569989344e-06, + "loss": 0.5484, + "mean_token_accuracy": 0.8294688493013382, + "num_tokens": 22768399.0, + "step": 18960 + }, + { + "entropy": 1.8422922030091287, + "epoch": 0.05880534021927231, + "grad_norm": 10.27499008178711, + "learning_rate": 9.187736123219994e-06, + "loss": 0.6632, + "mean_token_accuracy": 0.815483920276165, + "num_tokens": 22780846.0, + "step": 18970 + }, + { + "entropy": 1.9067127719521522, + "epoch": 0.05883633934432201, + "grad_norm": 9.372925758361816, + "learning_rate": 9.192579676450646e-06, + "loss": 0.6038, + "mean_token_accuracy": 0.8227426365017891, + "num_tokens": 22792760.0, + "step": 18980 + }, + { + "entropy": 1.9074788480997085, + "epoch": 0.0588673384693717, + "grad_norm": 5.241424560546875, + "learning_rate": 9.197423229681295e-06, + "loss": 0.5884, + "mean_token_accuracy": 0.8211500853300094, + "num_tokens": 22804075.0, + "step": 18990 + }, + { + "entropy": 1.911788022518158, + "epoch": 0.058898337594421396, + "grad_norm": 12.606864929199219, + "learning_rate": 9.202266782911945e-06, + "loss": 0.6369, + "mean_token_accuracy": 0.8175441965460777, + "num_tokens": 22814931.0, + "step": 19000 + }, + { + "entropy": 1.902238203585148, + "epoch": 0.05892933671947109, + "grad_norm": 5.6409831047058105, + "learning_rate": 9.207110336142595e-06, + "loss": 0.6392, + "mean_token_accuracy": 0.8043625161051751, + "num_tokens": 22826140.0, + "step": 19010 + }, + { + "entropy": 1.8769164964556695, + "epoch": 0.05896033584452079, + "grad_norm": 5.731686592102051, + "learning_rate": 9.211953889373245e-06, + "loss": 0.6231, + "mean_token_accuracy": 0.8021165639162063, + "num_tokens": 22837728.0, + "step": 19020 + }, + { + "entropy": 1.7606040745973588, + "epoch": 0.058991334969570486, + "grad_norm": 12.572894096374512, + "learning_rate": 9.216797442603895e-06, + "loss": 0.5168, + "mean_token_accuracy": 0.8280571162700653, + "num_tokens": 22851334.0, + "step": 19030 + }, + { + "entropy": 1.8437473088502885, + "epoch": 0.05902233409462018, + "grad_norm": 10.523193359375, + "learning_rate": 9.221640995834545e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.8213176369667053, + "num_tokens": 22864325.0, + "step": 19040 + }, + { + "entropy": 1.9376160144805907, + "epoch": 0.05905333321966987, + "grad_norm": 9.312682151794434, + "learning_rate": 9.226484549065195e-06, + "loss": 0.6229, + "mean_token_accuracy": 0.8095207557082176, + "num_tokens": 22876271.0, + "step": 19050 + }, + { + "entropy": 1.8676683530211449, + "epoch": 0.05908433234471957, + "grad_norm": 10.493414878845215, + "learning_rate": 9.231328102295845e-06, + "loss": 0.5505, + "mean_token_accuracy": 0.8218972474336624, + "num_tokens": 22888674.0, + "step": 19060 + }, + { + "entropy": 1.9024165108799935, + "epoch": 0.059115331469769265, + "grad_norm": 11.642742156982422, + "learning_rate": 9.236171655526494e-06, + "loss": 0.5729, + "mean_token_accuracy": 0.8063879519701004, + "num_tokens": 22901132.0, + "step": 19070 + }, + { + "entropy": 1.8537215188145637, + "epoch": 0.05914633059481896, + "grad_norm": 10.604762077331543, + "learning_rate": 9.241015208757144e-06, + "loss": 0.5936, + "mean_token_accuracy": 0.8061328649520874, + "num_tokens": 22913819.0, + "step": 19080 + }, + { + "entropy": 1.9175572365522384, + "epoch": 0.05917732971986866, + "grad_norm": 5.126205921173096, + "learning_rate": 9.245858761987794e-06, + "loss": 0.6629, + "mean_token_accuracy": 0.8014189884066582, + "num_tokens": 22924724.0, + "step": 19090 + }, + { + "entropy": 1.9114294454455376, + "epoch": 0.059208328844918355, + "grad_norm": 10.768912315368652, + "learning_rate": 9.250702315218446e-06, + "loss": 0.6447, + "mean_token_accuracy": 0.8088298216462135, + "num_tokens": 22936371.0, + "step": 19100 + }, + { + "entropy": 1.9175557851791383, + "epoch": 0.059239327969968045, + "grad_norm": 9.46970272064209, + "learning_rate": 9.255545868449096e-06, + "loss": 0.6537, + "mean_token_accuracy": 0.810999846458435, + "num_tokens": 22948269.0, + "step": 19110 + }, + { + "entropy": 1.80474793612957, + "epoch": 0.05927032709501774, + "grad_norm": 7.381761074066162, + "learning_rate": 9.260389421679746e-06, + "loss": 0.559, + "mean_token_accuracy": 0.8227289646863938, + "num_tokens": 22961310.0, + "step": 19120 + }, + { + "entropy": 1.868805430829525, + "epoch": 0.05930132622006744, + "grad_norm": 10.440200805664062, + "learning_rate": 9.265232974910395e-06, + "loss": 0.548, + "mean_token_accuracy": 0.8252261653542519, + "num_tokens": 22973534.0, + "step": 19130 + }, + { + "entropy": 1.9009343415498734, + "epoch": 0.059332325345117135, + "grad_norm": 10.150739669799805, + "learning_rate": 9.270076528141045e-06, + "loss": 0.6062, + "mean_token_accuracy": 0.8130706340074539, + "num_tokens": 22984921.0, + "step": 19140 + }, + { + "entropy": 1.9186480671167374, + "epoch": 0.05936332447016683, + "grad_norm": 9.625990867614746, + "learning_rate": 9.274920081371695e-06, + "loss": 0.6403, + "mean_token_accuracy": 0.8057694777846336, + "num_tokens": 22996985.0, + "step": 19150 + }, + { + "entropy": 1.8890679344534873, + "epoch": 0.05939432359521653, + "grad_norm": 10.16703987121582, + "learning_rate": 9.279763634602345e-06, + "loss": 0.6015, + "mean_token_accuracy": 0.8147070035338402, + "num_tokens": 23008337.0, + "step": 19160 + }, + { + "entropy": 1.9546454161405564, + "epoch": 0.05942532272026622, + "grad_norm": 13.017064094543457, + "learning_rate": 9.284607187832995e-06, + "loss": 0.7113, + "mean_token_accuracy": 0.7958240196108818, + "num_tokens": 23019347.0, + "step": 19170 + }, + { + "entropy": 1.8713775292038917, + "epoch": 0.059456321845315914, + "grad_norm": 12.783992767333984, + "learning_rate": 9.289450741063645e-06, + "loss": 0.5894, + "mean_token_accuracy": 0.8084977343678474, + "num_tokens": 23031934.0, + "step": 19180 + }, + { + "entropy": 1.9044471591711045, + "epoch": 0.05948732097036561, + "grad_norm": 11.211695671081543, + "learning_rate": 9.294294294294295e-06, + "loss": 0.5655, + "mean_token_accuracy": 0.8160697892308235, + "num_tokens": 23043989.0, + "step": 19190 + }, + { + "entropy": 1.8023822262883187, + "epoch": 0.05951832009541531, + "grad_norm": 10.612577438354492, + "learning_rate": 9.299137847524945e-06, + "loss": 0.6033, + "mean_token_accuracy": 0.8159924641251564, + "num_tokens": 23058046.0, + "step": 19200 + }, + { + "entropy": 1.9452301010489463, + "epoch": 0.059549319220465004, + "grad_norm": 11.767992973327637, + "learning_rate": 9.303981400755595e-06, + "loss": 0.68, + "mean_token_accuracy": 0.803168785572052, + "num_tokens": 23069178.0, + "step": 19210 + }, + { + "entropy": 1.918524721264839, + "epoch": 0.0595803183455147, + "grad_norm": 10.314653396606445, + "learning_rate": 9.308824953986244e-06, + "loss": 0.6834, + "mean_token_accuracy": 0.8066515281796456, + "num_tokens": 23080400.0, + "step": 19220 + }, + { + "entropy": 1.9129308730363845, + "epoch": 0.05961131747056439, + "grad_norm": 12.398180961608887, + "learning_rate": 9.313668507216896e-06, + "loss": 0.6329, + "mean_token_accuracy": 0.8124301135540009, + "num_tokens": 23091734.0, + "step": 19230 + }, + { + "entropy": 1.9237210959196092, + "epoch": 0.05964231659561409, + "grad_norm": 10.278820037841797, + "learning_rate": 9.318512060447546e-06, + "loss": 0.6262, + "mean_token_accuracy": 0.8095857813954354, + "num_tokens": 23103397.0, + "step": 19240 + }, + { + "entropy": 1.8663773894309998, + "epoch": 0.05967331572066378, + "grad_norm": 10.0914888381958, + "learning_rate": 9.323355613678196e-06, + "loss": 0.5701, + "mean_token_accuracy": 0.821681647002697, + "num_tokens": 23115814.0, + "step": 19250 + }, + { + "entropy": 1.7783602967858314, + "epoch": 0.05970431484571348, + "grad_norm": 3.7728965282440186, + "learning_rate": 9.328199166908846e-06, + "loss": 0.5334, + "mean_token_accuracy": 0.8306898340582848, + "num_tokens": 23128899.0, + "step": 19260 + }, + { + "entropy": 1.894435779750347, + "epoch": 0.059735313970763176, + "grad_norm": 11.809426307678223, + "learning_rate": 9.333042720139494e-06, + "loss": 0.6599, + "mean_token_accuracy": 0.8009930282831192, + "num_tokens": 23140688.0, + "step": 19270 + }, + { + "entropy": 1.8965399265289307, + "epoch": 0.05976631309581287, + "grad_norm": 10.043713569641113, + "learning_rate": 9.337886273370145e-06, + "loss": 0.6145, + "mean_token_accuracy": 0.8076204761862755, + "num_tokens": 23152768.0, + "step": 19280 + }, + { + "entropy": 1.9386232048273087, + "epoch": 0.05979731222086257, + "grad_norm": 9.522479057312012, + "learning_rate": 9.342729826600795e-06, + "loss": 0.6379, + "mean_token_accuracy": 0.8103454813361168, + "num_tokens": 23164363.0, + "step": 19290 + }, + { + "entropy": 1.9121969774365426, + "epoch": 0.05982831134591226, + "grad_norm": 5.025160789489746, + "learning_rate": 9.347573379831445e-06, + "loss": 0.5836, + "mean_token_accuracy": 0.8114672183990479, + "num_tokens": 23176305.0, + "step": 19300 + }, + { + "entropy": 1.8917111858725548, + "epoch": 0.059859310470961956, + "grad_norm": 4.935334205627441, + "learning_rate": 9.352416933062095e-06, + "loss": 0.599, + "mean_token_accuracy": 0.8276031494140625, + "num_tokens": 23187205.0, + "step": 19310 + }, + { + "entropy": 1.96262284219265, + "epoch": 0.05989030959601165, + "grad_norm": 9.109956741333008, + "learning_rate": 9.357260486292745e-06, + "loss": 0.646, + "mean_token_accuracy": 0.8043781608343125, + "num_tokens": 23198670.0, + "step": 19320 + }, + { + "entropy": 1.9255134493112565, + "epoch": 0.05992130872106135, + "grad_norm": 9.983922958374023, + "learning_rate": 9.362104039523395e-06, + "loss": 0.6009, + "mean_token_accuracy": 0.8162341311573982, + "num_tokens": 23210792.0, + "step": 19330 + }, + { + "entropy": 1.9093994736671447, + "epoch": 0.059952307846111046, + "grad_norm": 9.770051002502441, + "learning_rate": 9.366947592754045e-06, + "loss": 0.663, + "mean_token_accuracy": 0.8095744162797928, + "num_tokens": 23222187.0, + "step": 19340 + }, + { + "entropy": 1.8983681246638298, + "epoch": 0.05998330697116074, + "grad_norm": 13.514256477355957, + "learning_rate": 9.371791145984696e-06, + "loss": 0.6243, + "mean_token_accuracy": 0.7971692577004432, + "num_tokens": 23233365.0, + "step": 19350 + }, + { + "entropy": 1.9183629781007767, + "epoch": 0.06001430609621043, + "grad_norm": 10.821701049804688, + "learning_rate": 9.376634699215346e-06, + "loss": 0.6502, + "mean_token_accuracy": 0.8030115276575088, + "num_tokens": 23243672.0, + "step": 19360 + }, + { + "entropy": 1.807152123749256, + "epoch": 0.06004530522126013, + "grad_norm": 4.91693639755249, + "learning_rate": 9.381478252445994e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8239471137523651, + "num_tokens": 23256613.0, + "step": 19370 + }, + { + "entropy": 1.869692163169384, + "epoch": 0.060076304346309825, + "grad_norm": 8.64077377319336, + "learning_rate": 9.386321805676644e-06, + "loss": 0.6068, + "mean_token_accuracy": 0.8187321960926056, + "num_tokens": 23268668.0, + "step": 19380 + }, + { + "entropy": 1.857068532705307, + "epoch": 0.06010730347135952, + "grad_norm": 11.18220329284668, + "learning_rate": 9.391165358907294e-06, + "loss": 0.6335, + "mean_token_accuracy": 0.8069566667079926, + "num_tokens": 23281042.0, + "step": 19390 + }, + { + "entropy": 1.7603770941495895, + "epoch": 0.06013830259640922, + "grad_norm": 4.09375524520874, + "learning_rate": 9.396008912137946e-06, + "loss": 0.5366, + "mean_token_accuracy": 0.8220126062631607, + "num_tokens": 23295383.0, + "step": 19400 + }, + { + "entropy": 1.8189956784248351, + "epoch": 0.060169301721458915, + "grad_norm": 13.177933692932129, + "learning_rate": 9.400852465368596e-06, + "loss": 0.5506, + "mean_token_accuracy": 0.8236996352672576, + "num_tokens": 23307499.0, + "step": 19410 + }, + { + "entropy": 1.7352744668722153, + "epoch": 0.060200300846508605, + "grad_norm": 10.09691333770752, + "learning_rate": 9.405696018599245e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8295902729034423, + "num_tokens": 23320862.0, + "step": 19420 + }, + { + "entropy": 1.8696271255612373, + "epoch": 0.0602312999715583, + "grad_norm": 11.082074165344238, + "learning_rate": 9.410539571829895e-06, + "loss": 0.5574, + "mean_token_accuracy": 0.8225132539868355, + "num_tokens": 23332219.0, + "step": 19430 + }, + { + "entropy": 1.8461137875914573, + "epoch": 0.060262299096608, + "grad_norm": 10.549184799194336, + "learning_rate": 9.415383125060545e-06, + "loss": 0.6182, + "mean_token_accuracy": 0.8168609410524368, + "num_tokens": 23343965.0, + "step": 19440 + }, + { + "entropy": 1.920869068801403, + "epoch": 0.060293298221657694, + "grad_norm": 4.992405891418457, + "learning_rate": 9.420226678291195e-06, + "loss": 0.6075, + "mean_token_accuracy": 0.8106775835156441, + "num_tokens": 23355438.0, + "step": 19450 + }, + { + "entropy": 1.8544103041291238, + "epoch": 0.06032429734670739, + "grad_norm": 10.945703506469727, + "learning_rate": 9.425070231521845e-06, + "loss": 0.5264, + "mean_token_accuracy": 0.8255405530333519, + "num_tokens": 23368404.0, + "step": 19460 + }, + { + "entropy": 1.841069608926773, + "epoch": 0.06035529647175709, + "grad_norm": 11.322453498840332, + "learning_rate": 9.429913784752495e-06, + "loss": 0.5469, + "mean_token_accuracy": 0.8234514787793159, + "num_tokens": 23381143.0, + "step": 19470 + }, + { + "entropy": 1.8469559505581856, + "epoch": 0.06038629559680678, + "grad_norm": 3.433302164077759, + "learning_rate": 9.434757337983145e-06, + "loss": 0.5658, + "mean_token_accuracy": 0.8232303768396377, + "num_tokens": 23393701.0, + "step": 19480 + }, + { + "entropy": 1.965156337618828, + "epoch": 0.060417294721856474, + "grad_norm": 11.884709358215332, + "learning_rate": 9.439600891213795e-06, + "loss": 0.6732, + "mean_token_accuracy": 0.8007081165909767, + "num_tokens": 23404106.0, + "step": 19490 + }, + { + "entropy": 1.9104407191276551, + "epoch": 0.06044829384690617, + "grad_norm": 9.23556900024414, + "learning_rate": 9.444444444444445e-06, + "loss": 0.6494, + "mean_token_accuracy": 0.8021333515644073, + "num_tokens": 23415825.0, + "step": 19500 + }, + { + "entropy": 1.885290040075779, + "epoch": 0.06047929297195587, + "grad_norm": 10.439990043640137, + "learning_rate": 9.449287997675094e-06, + "loss": 0.5873, + "mean_token_accuracy": 0.8280000373721123, + "num_tokens": 23426924.0, + "step": 19510 + }, + { + "entropy": 1.8925482839345933, + "epoch": 0.060510292097005564, + "grad_norm": 10.890645027160645, + "learning_rate": 9.454131550905746e-06, + "loss": 0.5907, + "mean_token_accuracy": 0.8103524506092071, + "num_tokens": 23439002.0, + "step": 19520 + }, + { + "entropy": 1.9133965462446212, + "epoch": 0.06054129122205526, + "grad_norm": 11.458105087280273, + "learning_rate": 9.458975104136396e-06, + "loss": 0.6338, + "mean_token_accuracy": 0.8142278388142585, + "num_tokens": 23450330.0, + "step": 19530 + }, + { + "entropy": 1.9326462358236314, + "epoch": 0.06057229034710495, + "grad_norm": 9.607535362243652, + "learning_rate": 9.463818657367046e-06, + "loss": 0.686, + "mean_token_accuracy": 0.7996918767690658, + "num_tokens": 23461629.0, + "step": 19540 + }, + { + "entropy": 1.876935575157404, + "epoch": 0.060603289472154646, + "grad_norm": 3.0272324085235596, + "learning_rate": 9.468662210597696e-06, + "loss": 0.5273, + "mean_token_accuracy": 0.8154638946056366, + "num_tokens": 23474503.0, + "step": 19550 + }, + { + "entropy": 1.9294023901224135, + "epoch": 0.06063428859720434, + "grad_norm": 9.758378982543945, + "learning_rate": 9.473505763828346e-06, + "loss": 0.6173, + "mean_token_accuracy": 0.8036470055580139, + "num_tokens": 23486313.0, + "step": 19560 + }, + { + "entropy": 1.9118182629346847, + "epoch": 0.06066528772225404, + "grad_norm": 8.618282318115234, + "learning_rate": 9.478349317058995e-06, + "loss": 0.6425, + "mean_token_accuracy": 0.8147132098674774, + "num_tokens": 23497631.0, + "step": 19570 + }, + { + "entropy": 1.8255969345569611, + "epoch": 0.060696286847303736, + "grad_norm": 4.955978870391846, + "learning_rate": 9.483192870289645e-06, + "loss": 0.5395, + "mean_token_accuracy": 0.8196621656417846, + "num_tokens": 23511011.0, + "step": 19580 + }, + { + "entropy": 1.7804076254367829, + "epoch": 0.06072728597235343, + "grad_norm": 9.806567192077637, + "learning_rate": 9.488036423520295e-06, + "loss": 0.5345, + "mean_token_accuracy": 0.8275730326771736, + "num_tokens": 23523377.0, + "step": 19590 + }, + { + "entropy": 1.7767413407564163, + "epoch": 0.06075828509740312, + "grad_norm": 5.016587257385254, + "learning_rate": 9.492879976750945e-06, + "loss": 0.5298, + "mean_token_accuracy": 0.8214924201369286, + "num_tokens": 23537122.0, + "step": 19600 + }, + { + "entropy": 1.8715672463178634, + "epoch": 0.06078928422245282, + "grad_norm": 6.352642059326172, + "learning_rate": 9.497723529981595e-06, + "loss": 0.5998, + "mean_token_accuracy": 0.8125947907567024, + "num_tokens": 23548737.0, + "step": 19610 + }, + { + "entropy": 1.7911435902118682, + "epoch": 0.060820283347502516, + "grad_norm": 11.929505348205566, + "learning_rate": 9.502567083212245e-06, + "loss": 0.546, + "mean_token_accuracy": 0.8234824985265732, + "num_tokens": 23561524.0, + "step": 19620 + }, + { + "entropy": 1.8728584364056586, + "epoch": 0.06085128247255221, + "grad_norm": 9.866854667663574, + "learning_rate": 9.507410636442895e-06, + "loss": 0.5745, + "mean_token_accuracy": 0.8147120550274849, + "num_tokens": 23573298.0, + "step": 19630 + }, + { + "entropy": 1.7628555655479432, + "epoch": 0.06088228159760191, + "grad_norm": 4.673979759216309, + "learning_rate": 9.512254189673545e-06, + "loss": 0.551, + "mean_token_accuracy": 0.823185084760189, + "num_tokens": 23585490.0, + "step": 19640 + }, + { + "entropy": 1.9162281841039657, + "epoch": 0.060913280722651605, + "grad_norm": 9.883894920349121, + "learning_rate": 9.517097742904196e-06, + "loss": 0.6322, + "mean_token_accuracy": 0.8094197094440461, + "num_tokens": 23596459.0, + "step": 19650 + }, + { + "entropy": 1.7629586443305016, + "epoch": 0.0609442798477013, + "grad_norm": 9.179272651672363, + "learning_rate": 9.521941296134846e-06, + "loss": 0.5226, + "mean_token_accuracy": 0.8276288509368896, + "num_tokens": 23609459.0, + "step": 19660 + }, + { + "entropy": 1.8585124626755714, + "epoch": 0.06097527897275099, + "grad_norm": 10.178589820861816, + "learning_rate": 9.526784849365496e-06, + "loss": 0.6334, + "mean_token_accuracy": 0.8119170770049096, + "num_tokens": 23622078.0, + "step": 19670 + }, + { + "entropy": 1.8904533594846726, + "epoch": 0.06100627809780069, + "grad_norm": 9.675638198852539, + "learning_rate": 9.531628402596146e-06, + "loss": 0.6203, + "mean_token_accuracy": 0.8164917901158333, + "num_tokens": 23632839.0, + "step": 19680 + }, + { + "entropy": 1.8115496248006822, + "epoch": 0.061037277222850385, + "grad_norm": 12.059412956237793, + "learning_rate": 9.536471955826794e-06, + "loss": 0.5708, + "mean_token_accuracy": 0.8198250159621239, + "num_tokens": 23644698.0, + "step": 19690 + }, + { + "entropy": 1.8150453560054303, + "epoch": 0.06106827634790008, + "grad_norm": 10.134939193725586, + "learning_rate": 9.541315509057446e-06, + "loss": 0.6001, + "mean_token_accuracy": 0.815633225440979, + "num_tokens": 23657908.0, + "step": 19700 + }, + { + "entropy": 1.7574081301689148, + "epoch": 0.06109927547294978, + "grad_norm": 5.226987838745117, + "learning_rate": 9.546159062288096e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.8180340453982353, + "num_tokens": 23671715.0, + "step": 19710 + }, + { + "entropy": 1.8007687643170356, + "epoch": 0.061130274597999475, + "grad_norm": 9.646768569946289, + "learning_rate": 9.551002615518745e-06, + "loss": 0.5652, + "mean_token_accuracy": 0.8148997142910958, + "num_tokens": 23684332.0, + "step": 19720 + }, + { + "entropy": 1.836924096941948, + "epoch": 0.061161273723049164, + "grad_norm": 9.189767837524414, + "learning_rate": 9.555846168749395e-06, + "loss": 0.568, + "mean_token_accuracy": 0.817747424542904, + "num_tokens": 23696452.0, + "step": 19730 + }, + { + "entropy": 1.8786251202225686, + "epoch": 0.06119227284809886, + "grad_norm": 12.79636287689209, + "learning_rate": 9.560689721980045e-06, + "loss": 0.6144, + "mean_token_accuracy": 0.8185794189572334, + "num_tokens": 23707815.0, + "step": 19740 + }, + { + "entropy": 1.8025352910161019, + "epoch": 0.06122327197314856, + "grad_norm": 11.08669376373291, + "learning_rate": 9.565533275210695e-06, + "loss": 0.5575, + "mean_token_accuracy": 0.8157659009099006, + "num_tokens": 23720644.0, + "step": 19750 + }, + { + "entropy": 1.9162618890404701, + "epoch": 0.061254271098198254, + "grad_norm": 10.929869651794434, + "learning_rate": 9.570376828441345e-06, + "loss": 0.6926, + "mean_token_accuracy": 0.8083868056535721, + "num_tokens": 23731943.0, + "step": 19760 + }, + { + "entropy": 1.8641342878341676, + "epoch": 0.06128527022324795, + "grad_norm": 9.528030395507812, + "learning_rate": 9.575220381671997e-06, + "loss": 0.5878, + "mean_token_accuracy": 0.8118364483118057, + "num_tokens": 23744425.0, + "step": 19770 + }, + { + "entropy": 1.8606672033667564, + "epoch": 0.06131626934829765, + "grad_norm": 12.20190715789795, + "learning_rate": 9.580063934902646e-06, + "loss": 0.6066, + "mean_token_accuracy": 0.8095232203602791, + "num_tokens": 23755827.0, + "step": 19780 + }, + { + "entropy": 1.9077710419893266, + "epoch": 0.06134726847334734, + "grad_norm": 9.677120208740234, + "learning_rate": 9.584907488133295e-06, + "loss": 0.6099, + "mean_token_accuracy": 0.8066776558756829, + "num_tokens": 23767808.0, + "step": 19790 + }, + { + "entropy": 1.8880801230669022, + "epoch": 0.061378267598397034, + "grad_norm": 10.036674499511719, + "learning_rate": 9.589751041363944e-06, + "loss": 0.5946, + "mean_token_accuracy": 0.8205381706357002, + "num_tokens": 23779742.0, + "step": 19800 + }, + { + "entropy": 1.9239039212465285, + "epoch": 0.06140926672344673, + "grad_norm": 10.33579158782959, + "learning_rate": 9.594594594594594e-06, + "loss": 0.6357, + "mean_token_accuracy": 0.8081588789820671, + "num_tokens": 23790560.0, + "step": 19810 + }, + { + "entropy": 1.8777620539069175, + "epoch": 0.06144026584849643, + "grad_norm": 9.710867881774902, + "learning_rate": 9.599438147825246e-06, + "loss": 0.6344, + "mean_token_accuracy": 0.8112061858177185, + "num_tokens": 23802654.0, + "step": 19820 + }, + { + "entropy": 1.8921529993414878, + "epoch": 0.06147126497354612, + "grad_norm": 10.47066879272461, + "learning_rate": 9.604281701055896e-06, + "loss": 0.6444, + "mean_token_accuracy": 0.8064588844776154, + "num_tokens": 23813660.0, + "step": 19830 + }, + { + "entropy": 1.8871847927570342, + "epoch": 0.06150226409859582, + "grad_norm": 10.314284324645996, + "learning_rate": 9.609125254286546e-06, + "loss": 0.5919, + "mean_token_accuracy": 0.8220043256878853, + "num_tokens": 23825033.0, + "step": 19840 + }, + { + "entropy": 1.8442516967654228, + "epoch": 0.06153326322364551, + "grad_norm": 2.9218690395355225, + "learning_rate": 9.613968807517196e-06, + "loss": 0.5303, + "mean_token_accuracy": 0.8329377219080925, + "num_tokens": 23837886.0, + "step": 19850 + }, + { + "entropy": 1.9648214638233186, + "epoch": 0.061564262348695206, + "grad_norm": 10.593524932861328, + "learning_rate": 9.618812360747845e-06, + "loss": 0.6819, + "mean_token_accuracy": 0.8005811557173729, + "num_tokens": 23848949.0, + "step": 19860 + }, + { + "entropy": 1.9761517822742463, + "epoch": 0.0615952614737449, + "grad_norm": 11.34875774383545, + "learning_rate": 9.623655913978495e-06, + "loss": 0.6411, + "mean_token_accuracy": 0.8136752307415008, + "num_tokens": 23860198.0, + "step": 19870 + }, + { + "entropy": 1.935739828646183, + "epoch": 0.0616262605987946, + "grad_norm": 10.864781379699707, + "learning_rate": 9.628499467209145e-06, + "loss": 0.6602, + "mean_token_accuracy": 0.8089005783200264, + "num_tokens": 23872637.0, + "step": 19880 + }, + { + "entropy": 1.8924452632665634, + "epoch": 0.061657259723844296, + "grad_norm": 10.361483573913574, + "learning_rate": 9.633343020439797e-06, + "loss": 0.5566, + "mean_token_accuracy": 0.8306096389889717, + "num_tokens": 23885123.0, + "step": 19890 + }, + { + "entropy": 1.9373007863759995, + "epoch": 0.06168825884889399, + "grad_norm": 9.866893768310547, + "learning_rate": 9.638186573670445e-06, + "loss": 0.5791, + "mean_token_accuracy": 0.8127657011151314, + "num_tokens": 23897519.0, + "step": 19900 + }, + { + "entropy": 1.9933653771877289, + "epoch": 0.06171925797394368, + "grad_norm": 10.006440162658691, + "learning_rate": 9.643030126901095e-06, + "loss": 0.6684, + "mean_token_accuracy": 0.810961103439331, + "num_tokens": 23908829.0, + "step": 19910 + }, + { + "entropy": 1.9257026076316834, + "epoch": 0.06175025709899338, + "grad_norm": 10.654083251953125, + "learning_rate": 9.647873680131745e-06, + "loss": 0.6223, + "mean_token_accuracy": 0.8109946802258492, + "num_tokens": 23920767.0, + "step": 19920 + }, + { + "entropy": 1.956542044878006, + "epoch": 0.061781256224043075, + "grad_norm": 11.350798606872559, + "learning_rate": 9.652717233362395e-06, + "loss": 0.6668, + "mean_token_accuracy": 0.8026480153203011, + "num_tokens": 23932164.0, + "step": 19930 + }, + { + "entropy": 2.0048277229070663, + "epoch": 0.06181225534909277, + "grad_norm": 11.242584228515625, + "learning_rate": 9.657560786593046e-06, + "loss": 0.6457, + "mean_token_accuracy": 0.8089631497859955, + "num_tokens": 23942822.0, + "step": 19940 + }, + { + "entropy": 1.9254214867949486, + "epoch": 0.06184325447414247, + "grad_norm": 10.975452423095703, + "learning_rate": 9.662404339823696e-06, + "loss": 0.6117, + "mean_token_accuracy": 0.8136309564113617, + "num_tokens": 23954872.0, + "step": 19950 + }, + { + "entropy": 1.8969217911362648, + "epoch": 0.061874253599192165, + "grad_norm": 9.684366226196289, + "learning_rate": 9.667247893054346e-06, + "loss": 0.6079, + "mean_token_accuracy": 0.8263826936483383, + "num_tokens": 23967198.0, + "step": 19960 + }, + { + "entropy": 1.9214407287538051, + "epoch": 0.061905252724241855, + "grad_norm": 10.67751693725586, + "learning_rate": 9.672091446284996e-06, + "loss": 0.582, + "mean_token_accuracy": 0.8223884150385856, + "num_tokens": 23979194.0, + "step": 19970 + }, + { + "entropy": 1.8657239809632302, + "epoch": 0.06193625184929155, + "grad_norm": 9.526717185974121, + "learning_rate": 9.676934999515646e-06, + "loss": 0.538, + "mean_token_accuracy": 0.8274316728115082, + "num_tokens": 23991363.0, + "step": 19980 + }, + { + "entropy": 1.9447234928607942, + "epoch": 0.06196725097434125, + "grad_norm": 11.365060806274414, + "learning_rate": 9.681778552746296e-06, + "loss": 0.6874, + "mean_token_accuracy": 0.8004522323608398, + "num_tokens": 24002188.0, + "step": 19990 + }, + { + "entropy": 1.8590040877461433, + "epoch": 0.061998250099390945, + "grad_norm": 6.042444705963135, + "learning_rate": 9.686622105976946e-06, + "loss": 0.5923, + "mean_token_accuracy": 0.814041730761528, + "num_tokens": 24015280.0, + "step": 20000 + }, + { + "entropy": 1.9083186939358712, + "epoch": 0.06202924922444064, + "grad_norm": 10.619732856750488, + "learning_rate": 9.691465659207595e-06, + "loss": 0.5909, + "mean_token_accuracy": 0.8160168409347535, + "num_tokens": 24027453.0, + "step": 20010 + }, + { + "entropy": 1.922300359606743, + "epoch": 0.06206024834949034, + "grad_norm": 12.882160186767578, + "learning_rate": 9.696309212438245e-06, + "loss": 0.649, + "mean_token_accuracy": 0.7888608485460281, + "num_tokens": 24038637.0, + "step": 20020 + }, + { + "entropy": 1.9450786724686622, + "epoch": 0.062091247474540034, + "grad_norm": 9.845813751220703, + "learning_rate": 9.701152765668895e-06, + "loss": 0.645, + "mean_token_accuracy": 0.8010739460587502, + "num_tokens": 24050162.0, + "step": 20030 + }, + { + "entropy": 1.9187857955694199, + "epoch": 0.062122246599589724, + "grad_norm": 9.616793632507324, + "learning_rate": 9.705996318899545e-06, + "loss": 0.6361, + "mean_token_accuracy": 0.8148136526346207, + "num_tokens": 24061516.0, + "step": 20040 + }, + { + "entropy": 1.90914705991745, + "epoch": 0.06215324572463942, + "grad_norm": 12.597700119018555, + "learning_rate": 9.710839872130195e-06, + "loss": 0.6398, + "mean_token_accuracy": 0.8118062347173691, + "num_tokens": 24072040.0, + "step": 20050 + }, + { + "entropy": 1.7790238752961158, + "epoch": 0.06218424484968912, + "grad_norm": 14.254045486450195, + "learning_rate": 9.715683425360845e-06, + "loss": 0.5597, + "mean_token_accuracy": 0.824961057305336, + "num_tokens": 24084516.0, + "step": 20060 + }, + { + "entropy": 1.8951515421271323, + "epoch": 0.062215243974738814, + "grad_norm": 10.398776054382324, + "learning_rate": 9.720526978591496e-06, + "loss": 0.6339, + "mean_token_accuracy": 0.8136864483356476, + "num_tokens": 24096174.0, + "step": 20070 + }, + { + "entropy": 1.8427141726016998, + "epoch": 0.06224624309978851, + "grad_norm": 9.789461135864258, + "learning_rate": 9.725370531822146e-06, + "loss": 0.5943, + "mean_token_accuracy": 0.8145849913358688, + "num_tokens": 24107922.0, + "step": 20080 + }, + { + "entropy": 1.9320272147655486, + "epoch": 0.06227724222483821, + "grad_norm": 11.617203712463379, + "learning_rate": 9.730214085052796e-06, + "loss": 0.668, + "mean_token_accuracy": 0.802889634668827, + "num_tokens": 24119219.0, + "step": 20090 + }, + { + "entropy": 1.8822171539068222, + "epoch": 0.0623082413498879, + "grad_norm": 10.93297004699707, + "learning_rate": 9.735057638283446e-06, + "loss": 0.5582, + "mean_token_accuracy": 0.8173771098256111, + "num_tokens": 24131388.0, + "step": 20100 + }, + { + "entropy": 1.9237861156463623, + "epoch": 0.06233924047493759, + "grad_norm": 10.885433197021484, + "learning_rate": 9.739901191514094e-06, + "loss": 0.6236, + "mean_token_accuracy": 0.8068635702133179, + "num_tokens": 24143423.0, + "step": 20110 + }, + { + "entropy": 1.8605427652597428, + "epoch": 0.06237023959998729, + "grad_norm": 10.937981605529785, + "learning_rate": 9.744744744744746e-06, + "loss": 0.5793, + "mean_token_accuracy": 0.8124633759260178, + "num_tokens": 24155500.0, + "step": 20120 + }, + { + "entropy": 1.921093289554119, + "epoch": 0.06240123872503699, + "grad_norm": 4.816215515136719, + "learning_rate": 9.749588297975396e-06, + "loss": 0.6207, + "mean_token_accuracy": 0.8212477296590805, + "num_tokens": 24167119.0, + "step": 20130 + }, + { + "entropy": 1.822756864130497, + "epoch": 0.06243223785008668, + "grad_norm": 10.295876502990723, + "learning_rate": 9.754431851206046e-06, + "loss": 0.5307, + "mean_token_accuracy": 0.8297939330339432, + "num_tokens": 24179336.0, + "step": 20140 + }, + { + "entropy": 1.8902019761502742, + "epoch": 0.06246323697513638, + "grad_norm": 9.802702903747559, + "learning_rate": 9.759275404436695e-06, + "loss": 0.592, + "mean_token_accuracy": 0.8155349537730217, + "num_tokens": 24191945.0, + "step": 20150 + }, + { + "entropy": 1.9736180931329728, + "epoch": 0.06249423610018607, + "grad_norm": 10.913537979125977, + "learning_rate": 9.764118957667345e-06, + "loss": 0.646, + "mean_token_accuracy": 0.8018572524189949, + "num_tokens": 24203220.0, + "step": 20160 + }, + { + "entropy": 1.8472725585103036, + "epoch": 0.06252523522523577, + "grad_norm": 11.09897518157959, + "learning_rate": 9.768962510897995e-06, + "loss": 0.6617, + "mean_token_accuracy": 0.8151746213436126, + "num_tokens": 24216924.0, + "step": 20170 + }, + { + "entropy": 1.8363325014710425, + "epoch": 0.06255623435028547, + "grad_norm": 9.55675220489502, + "learning_rate": 9.773806064128645e-06, + "loss": 0.5394, + "mean_token_accuracy": 0.8332865908741951, + "num_tokens": 24229749.0, + "step": 20180 + }, + { + "entropy": 1.8290324866771699, + "epoch": 0.06258723347533515, + "grad_norm": 12.252378463745117, + "learning_rate": 9.778649617359297e-06, + "loss": 0.5426, + "mean_token_accuracy": 0.8365808725357056, + "num_tokens": 24242819.0, + "step": 20190 + }, + { + "entropy": 1.9081189304590225, + "epoch": 0.06261823260038485, + "grad_norm": 10.117230415344238, + "learning_rate": 9.783493170589947e-06, + "loss": 0.6745, + "mean_token_accuracy": 0.8055532008409501, + "num_tokens": 24253826.0, + "step": 20200 + }, + { + "entropy": 1.8748064145445824, + "epoch": 0.06264923172543455, + "grad_norm": 9.556116104125977, + "learning_rate": 9.788336723820595e-06, + "loss": 0.6262, + "mean_token_accuracy": 0.8184110090136528, + "num_tokens": 24264932.0, + "step": 20210 + }, + { + "entropy": 1.8884572684764862, + "epoch": 0.06268023085048424, + "grad_norm": 4.309450149536133, + "learning_rate": 9.793180277051245e-06, + "loss": 0.6162, + "mean_token_accuracy": 0.8134237229824066, + "num_tokens": 24277309.0, + "step": 20220 + }, + { + "entropy": 1.946840487420559, + "epoch": 0.06271122997553394, + "grad_norm": 10.026110649108887, + "learning_rate": 9.798023830281895e-06, + "loss": 0.6271, + "mean_token_accuracy": 0.8044777557253837, + "num_tokens": 24288780.0, + "step": 20230 + }, + { + "entropy": 2.013564696907997, + "epoch": 0.06274222910058364, + "grad_norm": 9.077942848205566, + "learning_rate": 9.802867383512546e-06, + "loss": 0.6781, + "mean_token_accuracy": 0.800484599173069, + "num_tokens": 24299775.0, + "step": 20240 + }, + { + "entropy": 1.8675778850913047, + "epoch": 0.06277322822563333, + "grad_norm": 9.851303100585938, + "learning_rate": 9.807710936743196e-06, + "loss": 0.5568, + "mean_token_accuracy": 0.8174749821424484, + "num_tokens": 24312128.0, + "step": 20250 + }, + { + "entropy": 1.9423525124788283, + "epoch": 0.06280422735068303, + "grad_norm": 7.952591419219971, + "learning_rate": 9.812554489973846e-06, + "loss": 0.6073, + "mean_token_accuracy": 0.8145397856831551, + "num_tokens": 24323096.0, + "step": 20260 + }, + { + "entropy": 1.7513898000121118, + "epoch": 0.06283522647573273, + "grad_norm": 5.437192916870117, + "learning_rate": 9.817398043204496e-06, + "loss": 0.5386, + "mean_token_accuracy": 0.8165362849831581, + "num_tokens": 24337460.0, + "step": 20270 + }, + { + "entropy": 1.907211183011532, + "epoch": 0.06286622560078242, + "grad_norm": 5.400768756866455, + "learning_rate": 9.822241596435146e-06, + "loss": 0.6175, + "mean_token_accuracy": 0.8163802400231361, + "num_tokens": 24349174.0, + "step": 20280 + }, + { + "entropy": 1.93508680164814, + "epoch": 0.06289722472583212, + "grad_norm": 10.331210136413574, + "learning_rate": 9.827085149665796e-06, + "loss": 0.6279, + "mean_token_accuracy": 0.8010757058858872, + "num_tokens": 24361061.0, + "step": 20290 + }, + { + "entropy": 1.9249700546264648, + "epoch": 0.06292822385088181, + "grad_norm": 9.38033390045166, + "learning_rate": 9.831928702896445e-06, + "loss": 0.608, + "mean_token_accuracy": 0.8222104609012604, + "num_tokens": 24372277.0, + "step": 20300 + }, + { + "entropy": 1.8913897737860679, + "epoch": 0.0629592229759315, + "grad_norm": 4.759194374084473, + "learning_rate": 9.836772256127097e-06, + "loss": 0.6044, + "mean_token_accuracy": 0.8175428286194801, + "num_tokens": 24384034.0, + "step": 20310 + }, + { + "entropy": 1.8855891197919845, + "epoch": 0.0629902221009812, + "grad_norm": 4.686741828918457, + "learning_rate": 9.841615809357745e-06, + "loss": 0.6091, + "mean_token_accuracy": 0.8151167094707489, + "num_tokens": 24395926.0, + "step": 20320 + }, + { + "entropy": 1.9041914016008377, + "epoch": 0.06302122122603089, + "grad_norm": 4.964253902435303, + "learning_rate": 9.846459362588395e-06, + "loss": 0.6008, + "mean_token_accuracy": 0.8038349270820617, + "num_tokens": 24407924.0, + "step": 20330 + }, + { + "entropy": 1.8939720645546914, + "epoch": 0.06305222035108059, + "grad_norm": 10.10213851928711, + "learning_rate": 9.851302915819045e-06, + "loss": 0.6206, + "mean_token_accuracy": 0.8080206617712975, + "num_tokens": 24419254.0, + "step": 20340 + }, + { + "entropy": 1.8911166504025458, + "epoch": 0.06308321947613028, + "grad_norm": 4.227776050567627, + "learning_rate": 9.856146469049695e-06, + "loss": 0.6281, + "mean_token_accuracy": 0.8054097011685372, + "num_tokens": 24431388.0, + "step": 20350 + }, + { + "entropy": 1.8960236981511116, + "epoch": 0.06311421860117998, + "grad_norm": 12.084880828857422, + "learning_rate": 9.860990022280346e-06, + "loss": 0.601, + "mean_token_accuracy": 0.8084520295262336, + "num_tokens": 24443385.0, + "step": 20360 + }, + { + "entropy": 1.871785145998001, + "epoch": 0.06314521772622968, + "grad_norm": 10.692331314086914, + "learning_rate": 9.865833575510996e-06, + "loss": 0.5794, + "mean_token_accuracy": 0.8134460240602494, + "num_tokens": 24454988.0, + "step": 20370 + }, + { + "entropy": 1.9693672031164169, + "epoch": 0.06317621685127937, + "grad_norm": 10.991315841674805, + "learning_rate": 9.870677128741646e-06, + "loss": 0.7165, + "mean_token_accuracy": 0.7969113975763321, + "num_tokens": 24465991.0, + "step": 20380 + }, + { + "entropy": 1.9215772941708564, + "epoch": 0.06320721597632907, + "grad_norm": 11.294339179992676, + "learning_rate": 9.875520681972296e-06, + "loss": 0.6661, + "mean_token_accuracy": 0.7995204910635948, + "num_tokens": 24478784.0, + "step": 20390 + }, + { + "entropy": 1.750515715777874, + "epoch": 0.06323821510137877, + "grad_norm": 9.189525604248047, + "learning_rate": 9.880364235202946e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8383323296904563, + "num_tokens": 24492223.0, + "step": 20400 + }, + { + "entropy": 1.8770269468426704, + "epoch": 0.06326921422642846, + "grad_norm": 9.287210464477539, + "learning_rate": 9.885207788433596e-06, + "loss": 0.6492, + "mean_token_accuracy": 0.8062692806124687, + "num_tokens": 24503912.0, + "step": 20410 + }, + { + "entropy": 1.8530636951327324, + "epoch": 0.06330021335147816, + "grad_norm": 7.70045280456543, + "learning_rate": 9.890051341664246e-06, + "loss": 0.5643, + "mean_token_accuracy": 0.8267319366335869, + "num_tokens": 24515605.0, + "step": 20420 + }, + { + "entropy": 1.9133968889713286, + "epoch": 0.06333121247652786, + "grad_norm": 10.080760955810547, + "learning_rate": 9.894894894894896e-06, + "loss": 0.6142, + "mean_token_accuracy": 0.8130926743149758, + "num_tokens": 24527065.0, + "step": 20430 + }, + { + "entropy": 1.876038283109665, + "epoch": 0.06336221160157754, + "grad_norm": 9.016529083251953, + "learning_rate": 9.899738448125546e-06, + "loss": 0.6295, + "mean_token_accuracy": 0.8140005797147751, + "num_tokens": 24538739.0, + "step": 20440 + }, + { + "entropy": 1.8368092089891435, + "epoch": 0.06339321072662724, + "grad_norm": 9.778054237365723, + "learning_rate": 9.904582001356195e-06, + "loss": 0.6006, + "mean_token_accuracy": 0.81819748878479, + "num_tokens": 24551215.0, + "step": 20450 + }, + { + "entropy": 1.7856343001127244, + "epoch": 0.06342420985167693, + "grad_norm": 4.523138523101807, + "learning_rate": 9.909425554586845e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8145459353923797, + "num_tokens": 24564545.0, + "step": 20460 + }, + { + "entropy": 1.8864993780851365, + "epoch": 0.06345520897672663, + "grad_norm": 9.66295051574707, + "learning_rate": 9.914269107817495e-06, + "loss": 0.6304, + "mean_token_accuracy": 0.8107953786849975, + "num_tokens": 24576201.0, + "step": 20470 + }, + { + "entropy": 1.8306699201464653, + "epoch": 0.06348620810177633, + "grad_norm": 8.954712867736816, + "learning_rate": 9.919112661048145e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.8272392004728317, + "num_tokens": 24588462.0, + "step": 20480 + }, + { + "entropy": 1.8582230091094971, + "epoch": 0.06351720722682602, + "grad_norm": 5.900847911834717, + "learning_rate": 9.923956214278797e-06, + "loss": 0.6333, + "mean_token_accuracy": 0.8117007941007615, + "num_tokens": 24601048.0, + "step": 20490 + }, + { + "entropy": 1.9099309206008912, + "epoch": 0.06354820635187572, + "grad_norm": 10.398176193237305, + "learning_rate": 9.928799767509447e-06, + "loss": 0.6342, + "mean_token_accuracy": 0.8157493248581886, + "num_tokens": 24612082.0, + "step": 20500 + }, + { + "entropy": 1.8192737758159638, + "epoch": 0.06357920547692542, + "grad_norm": 4.901766300201416, + "learning_rate": 9.933643320740096e-06, + "loss": 0.5962, + "mean_token_accuracy": 0.8077544063329697, + "num_tokens": 24624244.0, + "step": 20510 + }, + { + "entropy": 1.9380394339561462, + "epoch": 0.06361020460197511, + "grad_norm": 9.72773551940918, + "learning_rate": 9.938486873970746e-06, + "loss": 0.6725, + "mean_token_accuracy": 0.8050031334161758, + "num_tokens": 24635227.0, + "step": 20520 + }, + { + "entropy": 1.8201100319623946, + "epoch": 0.06364120372702481, + "grad_norm": 5.236706256866455, + "learning_rate": 9.943330427201394e-06, + "loss": 0.5545, + "mean_token_accuracy": 0.8271597489714623, + "num_tokens": 24647349.0, + "step": 20530 + }, + { + "entropy": 1.955614548921585, + "epoch": 0.0636722028520745, + "grad_norm": 13.138460159301758, + "learning_rate": 9.948173980432046e-06, + "loss": 0.7057, + "mean_token_accuracy": 0.796806488931179, + "num_tokens": 24658536.0, + "step": 20540 + }, + { + "entropy": 1.8944236859679222, + "epoch": 0.0637032019771242, + "grad_norm": 7.109180927276611, + "learning_rate": 9.953017533662696e-06, + "loss": 0.6351, + "mean_token_accuracy": 0.8019287914037705, + "num_tokens": 24670364.0, + "step": 20550 + }, + { + "entropy": 1.8803847134113312, + "epoch": 0.06373420110217388, + "grad_norm": 9.590316772460938, + "learning_rate": 9.957861086893346e-06, + "loss": 0.6256, + "mean_token_accuracy": 0.8110292464494705, + "num_tokens": 24682351.0, + "step": 20560 + }, + { + "entropy": 1.9103557452559472, + "epoch": 0.06376520022722358, + "grad_norm": 10.402158737182617, + "learning_rate": 9.962704640123996e-06, + "loss": 0.6655, + "mean_token_accuracy": 0.7996529176831245, + "num_tokens": 24693410.0, + "step": 20570 + }, + { + "entropy": 1.8738579228520393, + "epoch": 0.06379619935227328, + "grad_norm": 8.206809997558594, + "learning_rate": 9.967548193354646e-06, + "loss": 0.5871, + "mean_token_accuracy": 0.8173448964953423, + "num_tokens": 24705775.0, + "step": 20580 + }, + { + "entropy": 1.8843411058187485, + "epoch": 0.06382719847732297, + "grad_norm": 11.01453971862793, + "learning_rate": 9.972391746585295e-06, + "loss": 0.6117, + "mean_token_accuracy": 0.808614219725132, + "num_tokens": 24717660.0, + "step": 20590 + }, + { + "entropy": 1.8635393604636192, + "epoch": 0.06385819760237267, + "grad_norm": 10.689123153686523, + "learning_rate": 9.977235299815945e-06, + "loss": 0.6426, + "mean_token_accuracy": 0.8036311730742455, + "num_tokens": 24729686.0, + "step": 20600 + }, + { + "entropy": 1.9377967566251755, + "epoch": 0.06388919672742237, + "grad_norm": 11.218433380126953, + "learning_rate": 9.982078853046597e-06, + "loss": 0.6821, + "mean_token_accuracy": 0.8068268105387688, + "num_tokens": 24739992.0, + "step": 20610 + }, + { + "entropy": 1.8930637896060944, + "epoch": 0.06392019585247206, + "grad_norm": 10.52428150177002, + "learning_rate": 9.986922406277247e-06, + "loss": 0.6418, + "mean_token_accuracy": 0.802625036239624, + "num_tokens": 24751815.0, + "step": 20620 + }, + { + "entropy": 1.9082047209143638, + "epoch": 0.06395119497752176, + "grad_norm": 9.473612785339355, + "learning_rate": 9.991765959507895e-06, + "loss": 0.612, + "mean_token_accuracy": 0.8163775265216827, + "num_tokens": 24763588.0, + "step": 20630 + }, + { + "entropy": 1.8783060640096665, + "epoch": 0.06398219410257146, + "grad_norm": 10.244823455810547, + "learning_rate": 9.996609512738545e-06, + "loss": 0.6235, + "mean_token_accuracy": 0.8123615190386773, + "num_tokens": 24776091.0, + "step": 20640 + }, + { + "entropy": 1.8689485356211661, + "epoch": 0.06401319322762115, + "grad_norm": 9.722251892089844, + "learning_rate": 9.999273546183343e-06, + "loss": 0.5782, + "mean_token_accuracy": 0.8126691862940788, + "num_tokens": 24787537.0, + "step": 20650 + }, + { + "entropy": 1.869950420409441, + "epoch": 0.06404419235267085, + "grad_norm": 9.074020385742188, + "learning_rate": 9.99685317639837e-06, + "loss": 0.5865, + "mean_token_accuracy": 0.8139035999774933, + "num_tokens": 24799896.0, + "step": 20660 + }, + { + "entropy": 1.8778914123773576, + "epoch": 0.06407519147772055, + "grad_norm": 11.21394157409668, + "learning_rate": 9.99443456334761e-06, + "loss": 0.543, + "mean_token_accuracy": 0.8123081102967262, + "num_tokens": 24811525.0, + "step": 20670 + }, + { + "entropy": 1.9493975609540939, + "epoch": 0.06410619060277023, + "grad_norm": 13.748980522155762, + "learning_rate": 9.992017704906994e-06, + "loss": 0.647, + "mean_token_accuracy": 0.8159363105893135, + "num_tokens": 24822680.0, + "step": 20680 + }, + { + "entropy": 1.8906102269887923, + "epoch": 0.06413718972781993, + "grad_norm": 8.22630786895752, + "learning_rate": 9.989602598956046e-06, + "loss": 0.6267, + "mean_token_accuracy": 0.8106621414422989, + "num_tokens": 24834924.0, + "step": 20690 + }, + { + "entropy": 1.7157309882342815, + "epoch": 0.06416818885286962, + "grad_norm": 7.910831928253174, + "learning_rate": 9.987189243377873e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8414781808853149, + "num_tokens": 24849152.0, + "step": 20700 + }, + { + "entropy": 1.9214319348335267, + "epoch": 0.06419918797791932, + "grad_norm": 10.313517570495605, + "learning_rate": 9.984777636059161e-06, + "loss": 0.7036, + "mean_token_accuracy": 0.7919689938426018, + "num_tokens": 24860357.0, + "step": 20710 + }, + { + "entropy": 1.8385179117321968, + "epoch": 0.06423018710296902, + "grad_norm": 12.135076522827148, + "learning_rate": 9.98236777489017e-06, + "loss": 0.6188, + "mean_token_accuracy": 0.8177133709192276, + "num_tokens": 24872095.0, + "step": 20720 + }, + { + "entropy": 1.777934755384922, + "epoch": 0.06426118622801871, + "grad_norm": 4.797874450683594, + "learning_rate": 9.979959657764716e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8323575839400291, + "num_tokens": 24884741.0, + "step": 20730 + }, + { + "entropy": 1.8482744053006173, + "epoch": 0.06429218535306841, + "grad_norm": 9.592241287231445, + "learning_rate": 9.977553282580177e-06, + "loss": 0.5549, + "mean_token_accuracy": 0.8222592979669571, + "num_tokens": 24897187.0, + "step": 20740 + }, + { + "entropy": 1.8500724270939828, + "epoch": 0.0643231844781181, + "grad_norm": 12.197331428527832, + "learning_rate": 9.975148647237474e-06, + "loss": 0.6756, + "mean_token_accuracy": 0.7995992138981819, + "num_tokens": 24908381.0, + "step": 20750 + }, + { + "entropy": 1.75242570489645, + "epoch": 0.0643541836031678, + "grad_norm": 10.444792747497559, + "learning_rate": 9.972745749641067e-06, + "loss": 0.5345, + "mean_token_accuracy": 0.8243147745728493, + "num_tokens": 24920923.0, + "step": 20760 + }, + { + "entropy": 1.8526248589158059, + "epoch": 0.0643851827282175, + "grad_norm": 9.078262329101562, + "learning_rate": 9.97034458769895e-06, + "loss": 0.6349, + "mean_token_accuracy": 0.8068570986390113, + "num_tokens": 24932578.0, + "step": 20770 + }, + { + "entropy": 1.9244740456342697, + "epoch": 0.0644161818532672, + "grad_norm": 10.673161506652832, + "learning_rate": 9.967945159322642e-06, + "loss": 0.687, + "mean_token_accuracy": 0.7971266448497772, + "num_tokens": 24942881.0, + "step": 20780 + }, + { + "entropy": 1.8756011351943016, + "epoch": 0.06444718097831689, + "grad_norm": 9.983181953430176, + "learning_rate": 9.965547462427177e-06, + "loss": 0.6225, + "mean_token_accuracy": 0.8142784401774407, + "num_tokens": 24954564.0, + "step": 20790 + }, + { + "entropy": 1.8296712294220925, + "epoch": 0.06447818010336659, + "grad_norm": 10.166830062866211, + "learning_rate": 9.963151494931094e-06, + "loss": 0.6427, + "mean_token_accuracy": 0.8073472276329994, + "num_tokens": 24967386.0, + "step": 20800 + }, + { + "entropy": 1.8981720179319381, + "epoch": 0.06450917922841627, + "grad_norm": 11.244156837463379, + "learning_rate": 9.960757254756438e-06, + "loss": 0.669, + "mean_token_accuracy": 0.8046011224389076, + "num_tokens": 24978303.0, + "step": 20810 + }, + { + "entropy": 1.7593111276626587, + "epoch": 0.06454017835346597, + "grad_norm": 11.301054954528809, + "learning_rate": 9.958364739828752e-06, + "loss": 0.5735, + "mean_token_accuracy": 0.8244746640324593, + "num_tokens": 24991174.0, + "step": 20820 + }, + { + "entropy": 1.7012663453817367, + "epoch": 0.06457117747851567, + "grad_norm": 11.489786148071289, + "learning_rate": 9.955973948077055e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8441385194659233, + "num_tokens": 25003945.0, + "step": 20830 + }, + { + "entropy": 1.8124509736895562, + "epoch": 0.06460217660356536, + "grad_norm": 10.668350219726562, + "learning_rate": 9.953584877433851e-06, + "loss": 0.6364, + "mean_token_accuracy": 0.8089659824967385, + "num_tokens": 25016118.0, + "step": 20840 + }, + { + "entropy": 1.745204885303974, + "epoch": 0.06463317572861506, + "grad_norm": 10.901769638061523, + "learning_rate": 9.951197525835119e-06, + "loss": 0.5265, + "mean_token_accuracy": 0.8224120557308197, + "num_tokens": 25029471.0, + "step": 20850 + }, + { + "entropy": 1.835092043876648, + "epoch": 0.06466417485366475, + "grad_norm": 10.224356651306152, + "learning_rate": 9.94881189122029e-06, + "loss": 0.6312, + "mean_token_accuracy": 0.8145354777574539, + "num_tokens": 25041531.0, + "step": 20860 + }, + { + "entropy": 1.7869165703654288, + "epoch": 0.06469517397871445, + "grad_norm": 12.071456909179688, + "learning_rate": 9.946427971532263e-06, + "loss": 0.6134, + "mean_token_accuracy": 0.8095423832535744, + "num_tokens": 25054431.0, + "step": 20870 + }, + { + "entropy": 1.8208570063114167, + "epoch": 0.06472617310376415, + "grad_norm": 10.90512466430664, + "learning_rate": 9.944045764717379e-06, + "loss": 0.6059, + "mean_token_accuracy": 0.813726843893528, + "num_tokens": 25066679.0, + "step": 20880 + }, + { + "entropy": 1.7698472633957862, + "epoch": 0.06475717222881384, + "grad_norm": 4.871018409729004, + "learning_rate": 9.941665268725422e-06, + "loss": 0.5526, + "mean_token_accuracy": 0.8185109004378319, + "num_tokens": 25078924.0, + "step": 20890 + }, + { + "entropy": 1.7922344714403153, + "epoch": 0.06478817135386354, + "grad_norm": Infinity, + "learning_rate": 9.939286481509611e-06, + "loss": 0.5971, + "mean_token_accuracy": 0.8192875310778618, + "num_tokens": 25091264.0, + "step": 20900 + }, + { + "entropy": 1.873502266407013, + "epoch": 0.06481917047891324, + "grad_norm": 10.781414985656738, + "learning_rate": 9.93690940102659e-06, + "loss": 0.6287, + "mean_token_accuracy": 0.8078038066625595, + "num_tokens": 25102391.0, + "step": 20910 + }, + { + "entropy": 1.8319077044725418, + "epoch": 0.06485016960396293, + "grad_norm": 5.344398021697998, + "learning_rate": 9.934534025236426e-06, + "loss": 0.586, + "mean_token_accuracy": 0.8086917147040367, + "num_tokens": 25114388.0, + "step": 20920 + }, + { + "entropy": 1.8269810006022453, + "epoch": 0.06488116872901262, + "grad_norm": 10.4851655960083, + "learning_rate": 9.93216035210259e-06, + "loss": 0.6322, + "mean_token_accuracy": 0.8202467620372772, + "num_tokens": 25126354.0, + "step": 20930 + }, + { + "entropy": 1.842079259455204, + "epoch": 0.06491216785406231, + "grad_norm": 9.595065116882324, + "learning_rate": 9.929788379591967e-06, + "loss": 0.6143, + "mean_token_accuracy": 0.81163260191679, + "num_tokens": 25138345.0, + "step": 20940 + }, + { + "entropy": 1.8531109601259232, + "epoch": 0.06494316697911201, + "grad_norm": 10.80106258392334, + "learning_rate": 9.92741810567483e-06, + "loss": 0.5955, + "mean_token_accuracy": 0.8235780015587807, + "num_tokens": 25149452.0, + "step": 20950 + }, + { + "entropy": 1.7707980051636696, + "epoch": 0.0649741661041617, + "grad_norm": 9.250375747680664, + "learning_rate": 9.925049528324852e-06, + "loss": 0.5531, + "mean_token_accuracy": 0.825781124830246, + "num_tokens": 25162412.0, + "step": 20960 + }, + { + "entropy": 1.7839957982301713, + "epoch": 0.0650051652292114, + "grad_norm": 11.089311599731445, + "learning_rate": 9.922682645519076e-06, + "loss": 0.535, + "mean_token_accuracy": 0.8361950904130936, + "num_tokens": 25174698.0, + "step": 20970 + }, + { + "entropy": 1.8392813488841058, + "epoch": 0.0650361643542611, + "grad_norm": 9.709254264831543, + "learning_rate": 9.920317455237932e-06, + "loss": 0.6204, + "mean_token_accuracy": 0.8103036060929298, + "num_tokens": 25187089.0, + "step": 20980 + }, + { + "entropy": 1.89919556081295, + "epoch": 0.0650671634793108, + "grad_norm": 10.977981567382812, + "learning_rate": 9.917953955465215e-06, + "loss": 0.69, + "mean_token_accuracy": 0.7884634464979172, + "num_tokens": 25198974.0, + "step": 20990 + }, + { + "entropy": 1.8912572488188744, + "epoch": 0.0650981626043605, + "grad_norm": 11.527606964111328, + "learning_rate": 9.915592144188078e-06, + "loss": 0.6459, + "mean_token_accuracy": 0.8056445106863975, + "num_tokens": 25209971.0, + "step": 21000 + }, + { + "entropy": 1.8872391551733017, + "epoch": 0.06512916172941019, + "grad_norm": 10.813753128051758, + "learning_rate": 9.913232019397025e-06, + "loss": 0.7104, + "mean_token_accuracy": 0.7989979892969131, + "num_tokens": 25220791.0, + "step": 21010 + }, + { + "entropy": 1.8582836613059044, + "epoch": 0.06516016085445989, + "grad_norm": 9.749269485473633, + "learning_rate": 9.910873579085914e-06, + "loss": 0.6316, + "mean_token_accuracy": 0.8038420900702477, + "num_tokens": 25233097.0, + "step": 21020 + }, + { + "entropy": 1.9419476687908173, + "epoch": 0.06519115997950958, + "grad_norm": 10.637131690979004, + "learning_rate": 9.908516821251943e-06, + "loss": 0.6322, + "mean_token_accuracy": 0.811190040409565, + "num_tokens": 25244036.0, + "step": 21030 + }, + { + "entropy": 1.8555924728512765, + "epoch": 0.06522215910455928, + "grad_norm": 11.770044326782227, + "learning_rate": 9.906161743895632e-06, + "loss": 0.6444, + "mean_token_accuracy": 0.8096898928284645, + "num_tokens": 25255598.0, + "step": 21040 + }, + { + "entropy": 1.8110544815659524, + "epoch": 0.06525315822960896, + "grad_norm": 9.333264350891113, + "learning_rate": 9.903808345020833e-06, + "loss": 0.5622, + "mean_token_accuracy": 0.8238018527626991, + "num_tokens": 25268260.0, + "step": 21050 + }, + { + "entropy": 1.8922196328639984, + "epoch": 0.06528415735465866, + "grad_norm": 10.881926536560059, + "learning_rate": 9.901456622634717e-06, + "loss": 0.6427, + "mean_token_accuracy": 0.8115344852209091, + "num_tokens": 25279383.0, + "step": 21060 + }, + { + "entropy": 1.8002661630511283, + "epoch": 0.06531515647970836, + "grad_norm": 11.18062686920166, + "learning_rate": 9.899106574747767e-06, + "loss": 0.5344, + "mean_token_accuracy": 0.8290015637874604, + "num_tokens": 25292228.0, + "step": 21070 + }, + { + "entropy": 1.8496287435293197, + "epoch": 0.06534615560475805, + "grad_norm": 8.944875717163086, + "learning_rate": 9.896758199373761e-06, + "loss": 0.6023, + "mean_token_accuracy": 0.8110730588436127, + "num_tokens": 25304999.0, + "step": 21080 + }, + { + "entropy": 1.887920793890953, + "epoch": 0.06537715472980775, + "grad_norm": 5.810867786407471, + "learning_rate": 9.894411494529786e-06, + "loss": 0.5847, + "mean_token_accuracy": 0.8183253973722457, + "num_tokens": 25316919.0, + "step": 21090 + }, + { + "entropy": 1.7966985195875167, + "epoch": 0.06540815385485745, + "grad_norm": 10.770297050476074, + "learning_rate": 9.892066458236207e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.8241160228848458, + "num_tokens": 25329663.0, + "step": 21100 + }, + { + "entropy": 1.936796745657921, + "epoch": 0.06543915297990714, + "grad_norm": 10.986133575439453, + "learning_rate": 9.88972308851668e-06, + "loss": 0.6237, + "mean_token_accuracy": 0.7997463896870614, + "num_tokens": 25341721.0, + "step": 21110 + }, + { + "entropy": 1.917391985654831, + "epoch": 0.06547015210495684, + "grad_norm": 11.787429809570312, + "learning_rate": 9.887381383398138e-06, + "loss": 0.5681, + "mean_token_accuracy": 0.8246586143970489, + "num_tokens": 25353957.0, + "step": 21120 + }, + { + "entropy": 1.8626626536250115, + "epoch": 0.06550115123000654, + "grad_norm": 10.935876846313477, + "learning_rate": 9.885041340910771e-06, + "loss": 0.5545, + "mean_token_accuracy": 0.8301349982619286, + "num_tokens": 25366548.0, + "step": 21130 + }, + { + "entropy": 1.9012475624680518, + "epoch": 0.06553215035505623, + "grad_norm": 11.089349746704102, + "learning_rate": 9.882702959088042e-06, + "loss": 0.6256, + "mean_token_accuracy": 0.8115785628557205, + "num_tokens": 25378165.0, + "step": 21140 + }, + { + "entropy": 1.9432383313775063, + "epoch": 0.06556314948010593, + "grad_norm": 6.057243824005127, + "learning_rate": 9.880366235966667e-06, + "loss": 0.6253, + "mean_token_accuracy": 0.8162486657500267, + "num_tokens": 25389601.0, + "step": 21150 + }, + { + "entropy": 1.8184719279408454, + "epoch": 0.06559414860515562, + "grad_norm": 9.748690605163574, + "learning_rate": 9.878031169586607e-06, + "loss": 0.5692, + "mean_token_accuracy": 0.8212730750441551, + "num_tokens": 25402464.0, + "step": 21160 + }, + { + "entropy": 1.8200145453214644, + "epoch": 0.06562514773020532, + "grad_norm": 10.591629981994629, + "learning_rate": 9.875697757991068e-06, + "loss": 0.5807, + "mean_token_accuracy": 0.8190927669405937, + "num_tokens": 25415261.0, + "step": 21170 + }, + { + "entropy": 1.911136743426323, + "epoch": 0.065656146855255, + "grad_norm": 9.886494636535645, + "learning_rate": 9.87336599922648e-06, + "loss": 0.6362, + "mean_token_accuracy": 0.8133341312408447, + "num_tokens": 25426211.0, + "step": 21180 + }, + { + "entropy": 1.859015080332756, + "epoch": 0.0656871459803047, + "grad_norm": 11.79246711730957, + "learning_rate": 9.871035891342516e-06, + "loss": 0.5876, + "mean_token_accuracy": 0.8148143604397774, + "num_tokens": 25438384.0, + "step": 21190 + }, + { + "entropy": 1.845600600540638, + "epoch": 0.0657181451053544, + "grad_norm": 11.987141609191895, + "learning_rate": 9.868707432392061e-06, + "loss": 0.5756, + "mean_token_accuracy": 0.8216294556856155, + "num_tokens": 25451164.0, + "step": 21200 + }, + { + "entropy": 1.970012903213501, + "epoch": 0.0657491442304041, + "grad_norm": 10.808934211730957, + "learning_rate": 9.866380620431211e-06, + "loss": 0.674, + "mean_token_accuracy": 0.8027332067489624, + "num_tokens": 25461708.0, + "step": 21210 + }, + { + "entropy": 1.8186683028936386, + "epoch": 0.06578014335545379, + "grad_norm": 5.0801897048950195, + "learning_rate": 9.86405545351927e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8230162873864174, + "num_tokens": 25475658.0, + "step": 21220 + }, + { + "entropy": 1.9227222502231598, + "epoch": 0.06581114248050349, + "grad_norm": 10.78120231628418, + "learning_rate": 9.861731929718746e-06, + "loss": 0.6277, + "mean_token_accuracy": 0.816400445997715, + "num_tokens": 25487133.0, + "step": 21230 + }, + { + "entropy": 1.921273510158062, + "epoch": 0.06584214160555318, + "grad_norm": 11.297560691833496, + "learning_rate": 9.859410047095337e-06, + "loss": 0.5938, + "mean_token_accuracy": 0.8111256033182144, + "num_tokens": 25498732.0, + "step": 21240 + }, + { + "entropy": 1.8860565900802613, + "epoch": 0.06587314073060288, + "grad_norm": 12.347557067871094, + "learning_rate": 9.857089803717928e-06, + "loss": 0.5763, + "mean_token_accuracy": 0.814181549847126, + "num_tokens": 25510149.0, + "step": 21250 + }, + { + "entropy": 1.8615178689360619, + "epoch": 0.06590413985565258, + "grad_norm": 13.131210327148438, + "learning_rate": 9.854771197658584e-06, + "loss": 0.6138, + "mean_token_accuracy": 0.8099151000380516, + "num_tokens": 25522335.0, + "step": 21260 + }, + { + "entropy": 1.9241080656647682, + "epoch": 0.06593513898070227, + "grad_norm": 4.803411960601807, + "learning_rate": 9.852454226992548e-06, + "loss": 0.5768, + "mean_token_accuracy": 0.8204170763492584, + "num_tokens": 25533747.0, + "step": 21270 + }, + { + "entropy": 1.889383627474308, + "epoch": 0.06596613810575197, + "grad_norm": 10.4091796875, + "learning_rate": 9.850138889798216e-06, + "loss": 0.6602, + "mean_token_accuracy": 0.8120569944381714, + "num_tokens": 25545456.0, + "step": 21280 + }, + { + "entropy": 1.9471757873892783, + "epoch": 0.06599713723080167, + "grad_norm": 9.847548484802246, + "learning_rate": 9.847825184157157e-06, + "loss": 0.6242, + "mean_token_accuracy": 0.808134414255619, + "num_tokens": 25556403.0, + "step": 21290 + }, + { + "entropy": 1.8040209040045738, + "epoch": 0.06602813635585135, + "grad_norm": 9.907560348510742, + "learning_rate": 9.845513108154088e-06, + "loss": 0.52, + "mean_token_accuracy": 0.8381869062781334, + "num_tokens": 25569237.0, + "step": 21300 + }, + { + "entropy": 1.8439020454883575, + "epoch": 0.06605913548090105, + "grad_norm": 10.755191802978516, + "learning_rate": 9.843202659876867e-06, + "loss": 0.5805, + "mean_token_accuracy": 0.8121224045753479, + "num_tokens": 25581127.0, + "step": 21310 + }, + { + "entropy": 1.9237766891717911, + "epoch": 0.06609013460595074, + "grad_norm": 9.386770248413086, + "learning_rate": 9.8408938374165e-06, + "loss": 0.6702, + "mean_token_accuracy": 0.8007935658097267, + "num_tokens": 25592005.0, + "step": 21320 + }, + { + "entropy": 1.9236200451850891, + "epoch": 0.06612113373100044, + "grad_norm": 11.110610961914062, + "learning_rate": 9.838586638867122e-06, + "loss": 0.6142, + "mean_token_accuracy": 0.8141868889331818, + "num_tokens": 25603901.0, + "step": 21330 + }, + { + "entropy": 1.8876248642802238, + "epoch": 0.06615213285605014, + "grad_norm": 11.1986665725708, + "learning_rate": 9.836281062325994e-06, + "loss": 0.5891, + "mean_token_accuracy": 0.8205958276987075, + "num_tokens": 25615461.0, + "step": 21340 + }, + { + "entropy": 1.8958319693803787, + "epoch": 0.06618313198109983, + "grad_norm": 10.968427658081055, + "learning_rate": 9.833977105893499e-06, + "loss": 0.6026, + "mean_token_accuracy": 0.8049457132816314, + "num_tokens": 25627052.0, + "step": 21350 + }, + { + "entropy": 1.7655820041894912, + "epoch": 0.06621413110614953, + "grad_norm": 11.009693145751953, + "learning_rate": 9.831674767673128e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8243117600679397, + "num_tokens": 25640796.0, + "step": 21360 + }, + { + "entropy": 1.9570647537708283, + "epoch": 0.06624513023119923, + "grad_norm": 4.816678524017334, + "learning_rate": 9.829374045771485e-06, + "loss": 0.6455, + "mean_token_accuracy": 0.8167396351695061, + "num_tokens": 25652130.0, + "step": 21370 + }, + { + "entropy": 1.8768548294901848, + "epoch": 0.06627612935624892, + "grad_norm": 9.173911094665527, + "learning_rate": 9.827074938298272e-06, + "loss": 0.563, + "mean_token_accuracy": 0.8292611509561538, + "num_tokens": 25664530.0, + "step": 21380 + }, + { + "entropy": 1.836962741613388, + "epoch": 0.06630712848129862, + "grad_norm": 11.056595802307129, + "learning_rate": 9.824777443366282e-06, + "loss": 0.5817, + "mean_token_accuracy": 0.8175490334630012, + "num_tokens": 25676479.0, + "step": 21390 + }, + { + "entropy": 1.862608587741852, + "epoch": 0.06633812760634832, + "grad_norm": 10.60038948059082, + "learning_rate": 9.822481559091401e-06, + "loss": 0.6626, + "mean_token_accuracy": 0.8113431319594383, + "num_tokens": 25687935.0, + "step": 21400 + }, + { + "entropy": 1.8528019905090332, + "epoch": 0.06636912673139801, + "grad_norm": 13.311474800109863, + "learning_rate": 9.820187283592584e-06, + "loss": 0.5789, + "mean_token_accuracy": 0.8175876498222351, + "num_tokens": 25700014.0, + "step": 21410 + }, + { + "entropy": 1.903902080655098, + "epoch": 0.0664001258564477, + "grad_norm": 9.098358154296875, + "learning_rate": 9.817894614991875e-06, + "loss": 0.6484, + "mean_token_accuracy": 0.8050385147333146, + "num_tokens": 25711005.0, + "step": 21420 + }, + { + "entropy": 1.8798560991883277, + "epoch": 0.06643112498149739, + "grad_norm": 10.936999320983887, + "learning_rate": 9.815603551414374e-06, + "loss": 0.5749, + "mean_token_accuracy": 0.822943688929081, + "num_tokens": 25722335.0, + "step": 21430 + }, + { + "entropy": 1.7517326176166534, + "epoch": 0.06646212410654709, + "grad_norm": 10.7169189453125, + "learning_rate": 9.813314090988247e-06, + "loss": 0.5203, + "mean_token_accuracy": 0.8164523139595985, + "num_tokens": 25736104.0, + "step": 21440 + }, + { + "entropy": 1.9384674742817878, + "epoch": 0.06649312323159678, + "grad_norm": 10.496194839477539, + "learning_rate": 9.811026231844714e-06, + "loss": 0.6222, + "mean_token_accuracy": 0.8058070495724678, + "num_tokens": 25747103.0, + "step": 21450 + }, + { + "entropy": 1.9147820815443992, + "epoch": 0.06652412235664648, + "grad_norm": 11.332496643066406, + "learning_rate": 9.808739972118045e-06, + "loss": 0.6591, + "mean_token_accuracy": 0.7969366267323494, + "num_tokens": 25759932.0, + "step": 21460 + }, + { + "entropy": 1.9171302869915963, + "epoch": 0.06655512148169618, + "grad_norm": 10.07258129119873, + "learning_rate": 9.806455309945553e-06, + "loss": 0.596, + "mean_token_accuracy": 0.8198638454079628, + "num_tokens": 25771193.0, + "step": 21470 + }, + { + "entropy": 1.663824899494648, + "epoch": 0.06658612060674587, + "grad_norm": 4.567380428314209, + "learning_rate": 9.804172243467576e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8362728267908096, + "num_tokens": 25785955.0, + "step": 21480 + }, + { + "entropy": 1.819168321788311, + "epoch": 0.06661711973179557, + "grad_norm": 6.046937465667725, + "learning_rate": 9.8018907708275e-06, + "loss": 0.6468, + "mean_token_accuracy": 0.8072507992386818, + "num_tokens": 25798694.0, + "step": 21490 + }, + { + "entropy": 1.8685800537467003, + "epoch": 0.06664811885684527, + "grad_norm": 9.6455659866333, + "learning_rate": 9.799610890171714e-06, + "loss": 0.5722, + "mean_token_accuracy": 0.8169274970889091, + "num_tokens": 25810540.0, + "step": 21500 + }, + { + "entropy": 1.894263032078743, + "epoch": 0.06667911798189496, + "grad_norm": 11.558295249938965, + "learning_rate": 9.797332599649637e-06, + "loss": 0.636, + "mean_token_accuracy": 0.8053765878081321, + "num_tokens": 25822467.0, + "step": 21510 + }, + { + "entropy": 1.9177772477269173, + "epoch": 0.06671011710694466, + "grad_norm": 10.580260276794434, + "learning_rate": 9.795055897413697e-06, + "loss": 0.684, + "mean_token_accuracy": 0.800214584171772, + "num_tokens": 25834010.0, + "step": 21520 + }, + { + "entropy": 1.831970725953579, + "epoch": 0.06674111623199436, + "grad_norm": 10.084224700927734, + "learning_rate": 9.792780781619318e-06, + "loss": 0.5683, + "mean_token_accuracy": 0.821027934551239, + "num_tokens": 25846769.0, + "step": 21530 + }, + { + "entropy": 1.9271451473236083, + "epoch": 0.06677211535704405, + "grad_norm": 11.508028030395508, + "learning_rate": 9.790507250424926e-06, + "loss": 0.643, + "mean_token_accuracy": 0.7898375526070595, + "num_tokens": 25857730.0, + "step": 21540 + }, + { + "entropy": 1.9292464286088944, + "epoch": 0.06680311448209374, + "grad_norm": 9.729171752929688, + "learning_rate": 9.788235301991947e-06, + "loss": 0.6296, + "mean_token_accuracy": 0.8045027181506157, + "num_tokens": 25869739.0, + "step": 21550 + }, + { + "entropy": 1.9347792953252791, + "epoch": 0.06683411360714343, + "grad_norm": 9.912041664123535, + "learning_rate": 9.785964934484776e-06, + "loss": 0.7101, + "mean_token_accuracy": 0.7909634590148926, + "num_tokens": 25880786.0, + "step": 21560 + }, + { + "entropy": 1.8601661220192909, + "epoch": 0.06686511273219313, + "grad_norm": 10.346519470214844, + "learning_rate": 9.783696146070801e-06, + "loss": 0.5925, + "mean_token_accuracy": 0.8147589430212975, + "num_tokens": 25892319.0, + "step": 21570 + }, + { + "entropy": 1.7954956993460656, + "epoch": 0.06689611185724283, + "grad_norm": 4.684342384338379, + "learning_rate": 9.781428934920377e-06, + "loss": 0.5335, + "mean_token_accuracy": 0.8224958389997482, + "num_tokens": 25905020.0, + "step": 21580 + }, + { + "entropy": 1.8782492965459823, + "epoch": 0.06692711098229252, + "grad_norm": 12.385150909423828, + "learning_rate": 9.77916329920682e-06, + "loss": 0.6179, + "mean_token_accuracy": 0.8114234715700149, + "num_tokens": 25916570.0, + "step": 21590 + }, + { + "entropy": 1.8336504146456718, + "epoch": 0.06695811010734222, + "grad_norm": 3.51712965965271, + "learning_rate": 9.776899237106418e-06, + "loss": 0.5324, + "mean_token_accuracy": 0.8270602032542229, + "num_tokens": 25929984.0, + "step": 21600 + }, + { + "entropy": 1.8505462616682054, + "epoch": 0.06698910923239192, + "grad_norm": 9.539434432983398, + "learning_rate": 9.774636746798405e-06, + "loss": 0.635, + "mean_token_accuracy": 0.8051048040390014, + "num_tokens": 25941877.0, + "step": 21610 + }, + { + "entropy": 1.7775013580918313, + "epoch": 0.06702010835744161, + "grad_norm": 11.277313232421875, + "learning_rate": 9.77237582646497e-06, + "loss": 0.5501, + "mean_token_accuracy": 0.823443454504013, + "num_tokens": 25955271.0, + "step": 21620 + }, + { + "entropy": 1.8188173368573188, + "epoch": 0.06705110748249131, + "grad_norm": 5.36696720123291, + "learning_rate": 9.770116474291232e-06, + "loss": 0.5572, + "mean_token_accuracy": 0.8193252950906753, + "num_tokens": 25967242.0, + "step": 21630 + }, + { + "entropy": 1.819683888554573, + "epoch": 0.067082106607541, + "grad_norm": 10.795523643493652, + "learning_rate": 9.767858688465254e-06, + "loss": 0.6162, + "mean_token_accuracy": 0.8111403912305832, + "num_tokens": 25979320.0, + "step": 21640 + }, + { + "entropy": 1.7906116724014283, + "epoch": 0.0671131057325907, + "grad_norm": 9.500042915344238, + "learning_rate": 9.765602467178033e-06, + "loss": 0.5362, + "mean_token_accuracy": 0.822031632065773, + "num_tokens": 25992253.0, + "step": 21650 + }, + { + "entropy": 1.8245372220873832, + "epoch": 0.0671441048576404, + "grad_norm": 8.064600944519043, + "learning_rate": 9.763347808623481e-06, + "loss": 0.5656, + "mean_token_accuracy": 0.817147271335125, + "num_tokens": 26003957.0, + "step": 21660 + }, + { + "entropy": 1.8516656443476678, + "epoch": 0.06717510398269008, + "grad_norm": 10.404606819152832, + "learning_rate": 9.761094710998432e-06, + "loss": 0.5836, + "mean_token_accuracy": 0.8104123935103417, + "num_tokens": 26015881.0, + "step": 21670 + }, + { + "entropy": 1.827203567326069, + "epoch": 0.06720610310773978, + "grad_norm": 13.826399803161621, + "learning_rate": 9.75884317250263e-06, + "loss": 0.5807, + "mean_token_accuracy": 0.8180100679397583, + "num_tokens": 26028572.0, + "step": 21680 + }, + { + "entropy": 1.7154892578721046, + "epoch": 0.06723710223278948, + "grad_norm": 4.944169044494629, + "learning_rate": 9.756593191338725e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.848066033422947, + "num_tokens": 26043181.0, + "step": 21690 + }, + { + "entropy": 1.8305002465844153, + "epoch": 0.06726810135783917, + "grad_norm": 11.294840812683105, + "learning_rate": 9.754344765712266e-06, + "loss": 0.5811, + "mean_token_accuracy": 0.8094679519534111, + "num_tokens": 26055431.0, + "step": 21700 + }, + { + "entropy": 1.8822925835847855, + "epoch": 0.06729910048288887, + "grad_norm": 11.073433876037598, + "learning_rate": 9.752097893831698e-06, + "loss": 0.6668, + "mean_token_accuracy": 0.8047523692250251, + "num_tokens": 26067040.0, + "step": 21710 + }, + { + "entropy": 1.8371146380901338, + "epoch": 0.06733009960793856, + "grad_norm": 9.113365173339844, + "learning_rate": 9.749852573908346e-06, + "loss": 0.5507, + "mean_token_accuracy": 0.8240691289305687, + "num_tokens": 26080151.0, + "step": 21720 + }, + { + "entropy": 1.8499283462762832, + "epoch": 0.06736109873298826, + "grad_norm": 11.711502075195312, + "learning_rate": 9.747608804156427e-06, + "loss": 0.5783, + "mean_token_accuracy": 0.8179016202688217, + "num_tokens": 26092394.0, + "step": 21730 + }, + { + "entropy": 1.7742941290140153, + "epoch": 0.06739209785803796, + "grad_norm": 10.995613098144531, + "learning_rate": 9.745366582793027e-06, + "loss": 0.5917, + "mean_token_accuracy": 0.8143564939498902, + "num_tokens": 26105790.0, + "step": 21740 + }, + { + "entropy": 1.727923959493637, + "epoch": 0.06742309698308765, + "grad_norm": 3.2797532081604004, + "learning_rate": 9.7431259080381e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8418777331709861, + "num_tokens": 26119446.0, + "step": 21750 + }, + { + "entropy": 1.8147297531366349, + "epoch": 0.06745409610813735, + "grad_norm": 8.432580947875977, + "learning_rate": 9.740886778114467e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.8339284613728524, + "num_tokens": 26131348.0, + "step": 21760 + }, + { + "entropy": 1.8820328325033189, + "epoch": 0.06748509523318705, + "grad_norm": 5.698887825012207, + "learning_rate": 9.738649191247806e-06, + "loss": 0.6172, + "mean_token_accuracy": 0.809808611869812, + "num_tokens": 26143041.0, + "step": 21770 + }, + { + "entropy": 1.8815367594361305, + "epoch": 0.06751609435823674, + "grad_norm": 8.804140090942383, + "learning_rate": 9.736413145666649e-06, + "loss": 0.594, + "mean_token_accuracy": 0.820245711505413, + "num_tokens": 26154978.0, + "step": 21780 + }, + { + "entropy": 1.9202604204416276, + "epoch": 0.06754709348328644, + "grad_norm": 9.845060348510742, + "learning_rate": 9.734178639602368e-06, + "loss": 0.6266, + "mean_token_accuracy": 0.8070004045963287, + "num_tokens": 26166500.0, + "step": 21790 + }, + { + "entropy": 1.8778617054224014, + "epoch": 0.06757809260833612, + "grad_norm": 9.954373359680176, + "learning_rate": 9.731945671289185e-06, + "loss": 0.6406, + "mean_token_accuracy": 0.8081085130572319, + "num_tokens": 26178330.0, + "step": 21800 + }, + { + "entropy": 1.9139564037322998, + "epoch": 0.06760909173338582, + "grad_norm": 10.78117847442627, + "learning_rate": 9.72971423896414e-06, + "loss": 0.6322, + "mean_token_accuracy": 0.8143807515501976, + "num_tokens": 26189448.0, + "step": 21810 + }, + { + "entropy": 1.8785874828696252, + "epoch": 0.06764009085843552, + "grad_norm": 5.0894389152526855, + "learning_rate": 9.727484340867116e-06, + "loss": 0.5966, + "mean_token_accuracy": 0.8152860775589943, + "num_tokens": 26201634.0, + "step": 21820 + }, + { + "entropy": 1.8347378268837928, + "epoch": 0.06767108998348521, + "grad_norm": 4.857226371765137, + "learning_rate": 9.725255975240813e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8315959751605988, + "num_tokens": 26213558.0, + "step": 21830 + }, + { + "entropy": 1.845048761367798, + "epoch": 0.06770208910853491, + "grad_norm": 11.329482078552246, + "learning_rate": 9.723029140330748e-06, + "loss": 0.5478, + "mean_token_accuracy": 0.8252388536930084, + "num_tokens": 26225961.0, + "step": 21840 + }, + { + "entropy": 1.8393208265304566, + "epoch": 0.0677330882335846, + "grad_norm": 10.937715530395508, + "learning_rate": 9.72080383438525e-06, + "loss": 0.6227, + "mean_token_accuracy": 0.8155218675732613, + "num_tokens": 26238570.0, + "step": 21850 + }, + { + "entropy": 1.835605874657631, + "epoch": 0.0677640873586343, + "grad_norm": 9.71265983581543, + "learning_rate": 9.718580055655447e-06, + "loss": 0.5487, + "mean_token_accuracy": 0.8284683138132095, + "num_tokens": 26249804.0, + "step": 21860 + }, + { + "entropy": 1.9305734172463418, + "epoch": 0.067795086483684, + "grad_norm": 9.470818519592285, + "learning_rate": 9.716357802395276e-06, + "loss": 0.6421, + "mean_token_accuracy": 0.8088799849152565, + "num_tokens": 26261058.0, + "step": 21870 + }, + { + "entropy": 1.8729921713471414, + "epoch": 0.0678260856087337, + "grad_norm": 11.346515655517578, + "learning_rate": 9.714137072861461e-06, + "loss": 0.6013, + "mean_token_accuracy": 0.8200590804219245, + "num_tokens": 26273172.0, + "step": 21880 + }, + { + "entropy": 1.8179200991988183, + "epoch": 0.0678570847337834, + "grad_norm": 9.96755313873291, + "learning_rate": 9.711917865313517e-06, + "loss": 0.5983, + "mean_token_accuracy": 0.813332186639309, + "num_tokens": 26286190.0, + "step": 21890 + }, + { + "entropy": 1.807481935620308, + "epoch": 0.06788808385883309, + "grad_norm": 9.412008285522461, + "learning_rate": 9.709700178013736e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.823253333568573, + "num_tokens": 26299950.0, + "step": 21900 + }, + { + "entropy": 1.8429307714104652, + "epoch": 0.06791908298388279, + "grad_norm": 10.125129699707031, + "learning_rate": 9.707484009227192e-06, + "loss": 0.5312, + "mean_token_accuracy": 0.8161771893501282, + "num_tokens": 26312676.0, + "step": 21910 + }, + { + "entropy": 1.932323306798935, + "epoch": 0.06795008210893247, + "grad_norm": 10.303938865661621, + "learning_rate": 9.705269357221728e-06, + "loss": 0.6293, + "mean_token_accuracy": 0.8122246876358986, + "num_tokens": 26324476.0, + "step": 21920 + }, + { + "entropy": 1.8644244894385338, + "epoch": 0.06798108123398217, + "grad_norm": 9.893197059631348, + "learning_rate": 9.703056220267948e-06, + "loss": 0.5923, + "mean_token_accuracy": 0.8110254645347595, + "num_tokens": 26336986.0, + "step": 21930 + }, + { + "entropy": 1.9435497313737868, + "epoch": 0.06801208035903186, + "grad_norm": 10.371121406555176, + "learning_rate": 9.700844596639224e-06, + "loss": 0.6676, + "mean_token_accuracy": 0.8060028344392777, + "num_tokens": 26348032.0, + "step": 21940 + }, + { + "entropy": 1.818027876317501, + "epoch": 0.06804307948408156, + "grad_norm": 10.99317741394043, + "learning_rate": 9.698634484611671e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8351524174213409, + "num_tokens": 26360903.0, + "step": 21950 + }, + { + "entropy": 2.044380483031273, + "epoch": 0.06807407860913126, + "grad_norm": 11.019384384155273, + "learning_rate": 9.696425882464162e-06, + "loss": 0.7252, + "mean_token_accuracy": 0.7958701580762864, + "num_tokens": 26371825.0, + "step": 21960 + }, + { + "entropy": 1.9928947612643242, + "epoch": 0.06810507773418095, + "grad_norm": 9.34325122833252, + "learning_rate": 9.694218788478302e-06, + "loss": 0.6251, + "mean_token_accuracy": 0.8070891216397286, + "num_tokens": 26382919.0, + "step": 21970 + }, + { + "entropy": 1.8768664389848708, + "epoch": 0.06813607685923065, + "grad_norm": 9.70384407043457, + "learning_rate": 9.692013200938443e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.8396727994084359, + "num_tokens": 26395459.0, + "step": 21980 + }, + { + "entropy": 1.8494936734437943, + "epoch": 0.06816707598428035, + "grad_norm": 11.369257926940918, + "learning_rate": 9.689809118131661e-06, + "loss": 0.5583, + "mean_token_accuracy": 0.8196944579482078, + "num_tokens": 26407374.0, + "step": 21990 + }, + { + "entropy": 1.8978412017226218, + "epoch": 0.06819807510933004, + "grad_norm": 11.591387748718262, + "learning_rate": 9.68760653834776e-06, + "loss": 0.6132, + "mean_token_accuracy": 0.8112616434693336, + "num_tokens": 26419627.0, + "step": 22000 + }, + { + "entropy": 1.8775578573346139, + "epoch": 0.06822907423437974, + "grad_norm": 5.471284866333008, + "learning_rate": 9.685405459879265e-06, + "loss": 0.5619, + "mean_token_accuracy": 0.8241677179932594, + "num_tokens": 26431836.0, + "step": 22010 + }, + { + "entropy": 1.9307914853096009, + "epoch": 0.06826007335942944, + "grad_norm": 9.451096534729004, + "learning_rate": 9.683205881021414e-06, + "loss": 0.6088, + "mean_token_accuracy": 0.8177047148346901, + "num_tokens": 26442888.0, + "step": 22020 + }, + { + "entropy": 1.915931698679924, + "epoch": 0.06829107248447913, + "grad_norm": 10.326228141784668, + "learning_rate": 9.681007800072153e-06, + "loss": 0.6463, + "mean_token_accuracy": 0.8002750858664512, + "num_tokens": 26454901.0, + "step": 22030 + }, + { + "entropy": 1.9367796674370765, + "epoch": 0.06832207160952881, + "grad_norm": 10.724263191223145, + "learning_rate": 9.678811215332136e-06, + "loss": 0.6072, + "mean_token_accuracy": 0.8125025644898415, + "num_tokens": 26466644.0, + "step": 22040 + }, + { + "entropy": 1.8923877745866775, + "epoch": 0.06835307073457851, + "grad_norm": 9.895198822021484, + "learning_rate": 9.676616125104707e-06, + "loss": 0.5792, + "mean_token_accuracy": 0.8232117027044297, + "num_tokens": 26478470.0, + "step": 22050 + }, + { + "entropy": 1.9004954114556312, + "epoch": 0.06838406985962821, + "grad_norm": 9.4879150390625, + "learning_rate": 9.674422527695905e-06, + "loss": 0.5998, + "mean_token_accuracy": 0.8110221117734909, + "num_tokens": 26490224.0, + "step": 22060 + }, + { + "entropy": 1.9001964166760446, + "epoch": 0.0684150689846779, + "grad_norm": 5.616433620452881, + "learning_rate": 9.672230421414466e-06, + "loss": 0.625, + "mean_token_accuracy": 0.8154096305370331, + "num_tokens": 26501366.0, + "step": 22070 + }, + { + "entropy": 1.9321230724453926, + "epoch": 0.0684460681097276, + "grad_norm": 9.737174987792969, + "learning_rate": 9.670039804571791e-06, + "loss": 0.6395, + "mean_token_accuracy": 0.8030791565775871, + "num_tokens": 26512519.0, + "step": 22080 + }, + { + "entropy": 1.9279805243015289, + "epoch": 0.0684770672347773, + "grad_norm": 9.636087417602539, + "learning_rate": 9.667850675481966e-06, + "loss": 0.5453, + "mean_token_accuracy": 0.8268870130181313, + "num_tokens": 26524253.0, + "step": 22090 + }, + { + "entropy": 1.8550891995429992, + "epoch": 0.068508066359827, + "grad_norm": 4.487429141998291, + "learning_rate": 9.665663032461747e-06, + "loss": 0.5258, + "mean_token_accuracy": 0.8270553275942802, + "num_tokens": 26536731.0, + "step": 22100 + }, + { + "entropy": 1.9424635365605354, + "epoch": 0.06853906548487669, + "grad_norm": 10.442728996276855, + "learning_rate": 9.663476873830555e-06, + "loss": 0.5829, + "mean_token_accuracy": 0.8206922665238381, + "num_tokens": 26547996.0, + "step": 22110 + }, + { + "entropy": 1.9067403554916382, + "epoch": 0.06857006460992639, + "grad_norm": 9.885056495666504, + "learning_rate": 9.661292197910468e-06, + "loss": 0.626, + "mean_token_accuracy": 0.8120665445923805, + "num_tokens": 26560444.0, + "step": 22120 + }, + { + "entropy": 1.9578855365514756, + "epoch": 0.06860106373497608, + "grad_norm": 8.84913158416748, + "learning_rate": 9.659109003026222e-06, + "loss": 0.6182, + "mean_token_accuracy": 0.8090391054749488, + "num_tokens": 26572313.0, + "step": 22130 + }, + { + "entropy": 2.025720348954201, + "epoch": 0.06863206286002578, + "grad_norm": 9.901925086975098, + "learning_rate": 9.656927287505196e-06, + "loss": 0.7, + "mean_token_accuracy": 0.8067919075489044, + "num_tokens": 26583097.0, + "step": 22140 + }, + { + "entropy": 2.0064867049455644, + "epoch": 0.06866306198507548, + "grad_norm": 10.104426383972168, + "learning_rate": 9.654747049677416e-06, + "loss": 0.6419, + "mean_token_accuracy": 0.8101355582475662, + "num_tokens": 26594726.0, + "step": 22150 + }, + { + "entropy": 1.9431262537837029, + "epoch": 0.06869406111012517, + "grad_norm": 10.839469909667969, + "learning_rate": 9.652568287875552e-06, + "loss": 0.5512, + "mean_token_accuracy": 0.8269416391849518, + "num_tokens": 26606536.0, + "step": 22160 + }, + { + "entropy": 2.012579309940338, + "epoch": 0.06872506023517486, + "grad_norm": 12.621667861938477, + "learning_rate": 9.650391000434892e-06, + "loss": 0.6522, + "mean_token_accuracy": 0.8193601354956627, + "num_tokens": 26617239.0, + "step": 22170 + }, + { + "entropy": 1.8841126412153244, + "epoch": 0.06875605936022455, + "grad_norm": 5.069724082946777, + "learning_rate": 9.648215185693367e-06, + "loss": 0.5399, + "mean_token_accuracy": 0.8241742476820946, + "num_tokens": 26629571.0, + "step": 22180 + }, + { + "entropy": 1.938437856733799, + "epoch": 0.06878705848527425, + "grad_norm": 10.184552192687988, + "learning_rate": 9.646040841991519e-06, + "loss": 0.6364, + "mean_token_accuracy": 0.812780536711216, + "num_tokens": 26640715.0, + "step": 22190 + }, + { + "entropy": 1.8110738307237626, + "epoch": 0.06881805761032395, + "grad_norm": 10.346161842346191, + "learning_rate": 9.64386796767251e-06, + "loss": 0.534, + "mean_token_accuracy": 0.8277497068047523, + "num_tokens": 26655037.0, + "step": 22200 + }, + { + "entropy": 1.9208206847310065, + "epoch": 0.06884905673537364, + "grad_norm": 5.2261762619018555, + "learning_rate": 9.641696561082117e-06, + "loss": 0.5982, + "mean_token_accuracy": 0.8199071541428566, + "num_tokens": 26667082.0, + "step": 22210 + }, + { + "entropy": 1.8494412750005722, + "epoch": 0.06888005586042334, + "grad_norm": 10.328657150268555, + "learning_rate": 9.639526620568718e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8268266186118126, + "num_tokens": 26680702.0, + "step": 22220 + }, + { + "entropy": 2.001448130607605, + "epoch": 0.06891105498547304, + "grad_norm": 10.208023071289062, + "learning_rate": 9.637358144483292e-06, + "loss": 0.6457, + "mean_token_accuracy": 0.8080255687236786, + "num_tokens": 26692418.0, + "step": 22230 + }, + { + "entropy": 1.9539388507604598, + "epoch": 0.06894205411052273, + "grad_norm": 5.077010631561279, + "learning_rate": 9.635191131179423e-06, + "loss": 0.5923, + "mean_token_accuracy": 0.8029374584555626, + "num_tokens": 26705460.0, + "step": 22240 + }, + { + "entropy": 2.013536052405834, + "epoch": 0.06897305323557243, + "grad_norm": 8.475672721862793, + "learning_rate": 9.633025579013265e-06, + "loss": 0.6455, + "mean_token_accuracy": 0.8068231120705605, + "num_tokens": 26716216.0, + "step": 22250 + }, + { + "entropy": 1.9996852800250053, + "epoch": 0.06900405236062213, + "grad_norm": 11.207576751708984, + "learning_rate": 9.630861486343582e-06, + "loss": 0.6351, + "mean_token_accuracy": 0.8085829988121986, + "num_tokens": 26726768.0, + "step": 22260 + }, + { + "entropy": 1.8975500375032426, + "epoch": 0.06903505148567182, + "grad_norm": 9.219078063964844, + "learning_rate": 9.628698851531698e-06, + "loss": 0.5485, + "mean_token_accuracy": 0.8229989439249039, + "num_tokens": 26739895.0, + "step": 22270 + }, + { + "entropy": 1.9055659562349319, + "epoch": 0.06906605061072152, + "grad_norm": 9.515271186828613, + "learning_rate": 9.62653767294152e-06, + "loss": 0.5738, + "mean_token_accuracy": 0.8302743345499038, + "num_tokens": 26751846.0, + "step": 22280 + }, + { + "entropy": 1.928154693543911, + "epoch": 0.0690970497357712, + "grad_norm": 4.5626654624938965, + "learning_rate": 9.624377948939526e-06, + "loss": 0.578, + "mean_token_accuracy": 0.818497559428215, + "num_tokens": 26764244.0, + "step": 22290 + }, + { + "entropy": 1.9365676030516625, + "epoch": 0.0691280488608209, + "grad_norm": 10.669965744018555, + "learning_rate": 9.622219677894753e-06, + "loss": 0.5561, + "mean_token_accuracy": 0.8216531693935394, + "num_tokens": 26777119.0, + "step": 22300 + }, + { + "entropy": 1.9759273901581764, + "epoch": 0.0691590479858706, + "grad_norm": 9.490134239196777, + "learning_rate": 9.620062858178804e-06, + "loss": 0.5882, + "mean_token_accuracy": 0.8160349875688553, + "num_tokens": 26789166.0, + "step": 22310 + }, + { + "entropy": 1.930388794839382, + "epoch": 0.06919004711092029, + "grad_norm": 11.474687576293945, + "learning_rate": 9.617907488165825e-06, + "loss": 0.6161, + "mean_token_accuracy": 0.8070300281047821, + "num_tokens": 26800730.0, + "step": 22320 + }, + { + "entropy": 1.9698352724313737, + "epoch": 0.06922104623596999, + "grad_norm": 10.189593315124512, + "learning_rate": 9.615753566232525e-06, + "loss": 0.6158, + "mean_token_accuracy": 0.8105901405215263, + "num_tokens": 26812034.0, + "step": 22330 + }, + { + "entropy": 1.8571903064846993, + "epoch": 0.06925204536101968, + "grad_norm": 12.529641151428223, + "learning_rate": 9.613601090758144e-06, + "loss": 0.5555, + "mean_token_accuracy": 0.8161864310503006, + "num_tokens": 26825051.0, + "step": 22340 + }, + { + "entropy": 1.9772468566894532, + "epoch": 0.06928304448606938, + "grad_norm": 4.139663219451904, + "learning_rate": 9.611450060124465e-06, + "loss": 0.6634, + "mean_token_accuracy": 0.8000595793128014, + "num_tokens": 26836544.0, + "step": 22350 + }, + { + "entropy": 1.8732447177171707, + "epoch": 0.06931404361111908, + "grad_norm": 11.927010536193848, + "learning_rate": 9.609300472715811e-06, + "loss": 0.5409, + "mean_token_accuracy": 0.8216081500053406, + "num_tokens": 26849615.0, + "step": 22360 + }, + { + "entropy": 1.93919685035944, + "epoch": 0.06934504273616877, + "grad_norm": 10.041327476501465, + "learning_rate": 9.607152326919017e-06, + "loss": 0.6048, + "mean_token_accuracy": 0.8051743671298027, + "num_tokens": 26861482.0, + "step": 22370 + }, + { + "entropy": 1.970766744017601, + "epoch": 0.06937604186121847, + "grad_norm": 10.42013931274414, + "learning_rate": 9.605005621123464e-06, + "loss": 0.6534, + "mean_token_accuracy": 0.8068610474467277, + "num_tokens": 26873126.0, + "step": 22380 + }, + { + "entropy": 1.9374214485287666, + "epoch": 0.06940704098626817, + "grad_norm": 9.681081771850586, + "learning_rate": 9.602860353721028e-06, + "loss": 0.6223, + "mean_token_accuracy": 0.8023192837834359, + "num_tokens": 26885153.0, + "step": 22390 + }, + { + "entropy": 1.9398547366261483, + "epoch": 0.06943804011131786, + "grad_norm": 9.45241928100586, + "learning_rate": 9.600716523106113e-06, + "loss": 0.6011, + "mean_token_accuracy": 0.82165118008852, + "num_tokens": 26896187.0, + "step": 22400 + }, + { + "entropy": 1.9944514393806458, + "epoch": 0.06946903923636755, + "grad_norm": 9.26956844329834, + "learning_rate": 9.598574127675626e-06, + "loss": 0.6097, + "mean_token_accuracy": 0.8134081959724426, + "num_tokens": 26907111.0, + "step": 22410 + }, + { + "entropy": 1.9747971966862679, + "epoch": 0.06950003836141724, + "grad_norm": 10.627933502197266, + "learning_rate": 9.59643316582898e-06, + "loss": 0.6869, + "mean_token_accuracy": 0.7921556413173676, + "num_tokens": 26918762.0, + "step": 22420 + }, + { + "entropy": 1.975240734219551, + "epoch": 0.06953103748646694, + "grad_norm": 11.592079162597656, + "learning_rate": 9.594293635968081e-06, + "loss": 0.6539, + "mean_token_accuracy": 0.800179885327816, + "num_tokens": 26930609.0, + "step": 22430 + }, + { + "entropy": 1.9276101559400558, + "epoch": 0.06956203661151664, + "grad_norm": 10.43637466430664, + "learning_rate": 9.59215553649733e-06, + "loss": 0.6321, + "mean_token_accuracy": 0.8026261925697327, + "num_tokens": 26942087.0, + "step": 22440 + }, + { + "entropy": 1.9058784544467926, + "epoch": 0.06959303573656633, + "grad_norm": 10.7579984664917, + "learning_rate": 9.590018865823617e-06, + "loss": 0.5575, + "mean_token_accuracy": 0.8225520506501198, + "num_tokens": 26954256.0, + "step": 22450 + }, + { + "entropy": 1.928253923356533, + "epoch": 0.06962403486161603, + "grad_norm": 9.777754783630371, + "learning_rate": 9.587883622356315e-06, + "loss": 0.5975, + "mean_token_accuracy": 0.8160154640674591, + "num_tokens": 26966403.0, + "step": 22460 + }, + { + "entropy": 1.8594375014305116, + "epoch": 0.06965503398666573, + "grad_norm": 4.670237064361572, + "learning_rate": 9.585749804507275e-06, + "loss": 0.5684, + "mean_token_accuracy": 0.8113547876477242, + "num_tokens": 26979420.0, + "step": 22470 + }, + { + "entropy": 1.8346465826034546, + "epoch": 0.06968603311171542, + "grad_norm": 10.510963439941406, + "learning_rate": 9.58361741069082e-06, + "loss": 0.5274, + "mean_token_accuracy": 0.8267633840441704, + "num_tokens": 26991602.0, + "step": 22480 + }, + { + "entropy": 1.855713202059269, + "epoch": 0.06971703223676512, + "grad_norm": 10.206911087036133, + "learning_rate": 9.581486439323741e-06, + "loss": 0.6001, + "mean_token_accuracy": 0.8163058936595917, + "num_tokens": 27003931.0, + "step": 22490 + }, + { + "entropy": 1.9139684349298478, + "epoch": 0.06974803136181482, + "grad_norm": 10.377299308776855, + "learning_rate": 9.579356888825293e-06, + "loss": 0.622, + "mean_token_accuracy": 0.8105499014258385, + "num_tokens": 27016144.0, + "step": 22500 + }, + { + "entropy": 1.8956247389316558, + "epoch": 0.06977903048686451, + "grad_norm": 11.713562965393066, + "learning_rate": 9.57722875761719e-06, + "loss": 0.5887, + "mean_token_accuracy": 0.8171533569693565, + "num_tokens": 27028209.0, + "step": 22510 + }, + { + "entropy": 1.8996628910303115, + "epoch": 0.06981002961191421, + "grad_norm": 10.300981521606445, + "learning_rate": 9.5751020441236e-06, + "loss": 0.6273, + "mean_token_accuracy": 0.816479966044426, + "num_tokens": 27039827.0, + "step": 22520 + }, + { + "entropy": 1.8548725992441177, + "epoch": 0.0698410287369639, + "grad_norm": 9.6876220703125, + "learning_rate": 9.572976746771132e-06, + "loss": 0.529, + "mean_token_accuracy": 0.8360526219010354, + "num_tokens": 27051785.0, + "step": 22530 + }, + { + "entropy": 1.8720781534910202, + "epoch": 0.06987202786201359, + "grad_norm": 11.248559951782227, + "learning_rate": 9.570852863988847e-06, + "loss": 0.5231, + "mean_token_accuracy": 0.8292858317494393, + "num_tokens": 27063846.0, + "step": 22540 + }, + { + "entropy": 1.928021316230297, + "epoch": 0.06990302698706329, + "grad_norm": 12.513522148132324, + "learning_rate": 9.568730394208245e-06, + "loss": 0.6212, + "mean_token_accuracy": 0.8208240970969201, + "num_tokens": 27075131.0, + "step": 22550 + }, + { + "entropy": 1.9080385088920593, + "epoch": 0.06993402611211298, + "grad_norm": 13.877976417541504, + "learning_rate": 9.566609335863253e-06, + "loss": 0.5778, + "mean_token_accuracy": 0.8138008996844291, + "num_tokens": 27088488.0, + "step": 22560 + }, + { + "entropy": 1.8622148722410201, + "epoch": 0.06996502523716268, + "grad_norm": 9.615792274475098, + "learning_rate": 9.564489687390232e-06, + "loss": 0.5473, + "mean_token_accuracy": 0.8177649453282356, + "num_tokens": 27101651.0, + "step": 22570 + }, + { + "entropy": 1.9193992719054223, + "epoch": 0.06999602436221238, + "grad_norm": 10.330147743225098, + "learning_rate": 9.562371447227966e-06, + "loss": 0.6293, + "mean_token_accuracy": 0.8139034882187843, + "num_tokens": 27112881.0, + "step": 22580 + }, + { + "entropy": 1.8690322995185853, + "epoch": 0.07002702348726207, + "grad_norm": 9.160511016845703, + "learning_rate": 9.560254613817653e-06, + "loss": 0.5615, + "mean_token_accuracy": 0.8230489745736123, + "num_tokens": 27124763.0, + "step": 22590 + }, + { + "entropy": 1.8515427842736245, + "epoch": 0.07005802261231177, + "grad_norm": 9.302170753479004, + "learning_rate": 9.558139185602919e-06, + "loss": 0.6005, + "mean_token_accuracy": 0.8178195133805275, + "num_tokens": 27136756.0, + "step": 22600 + }, + { + "entropy": 1.9153072491288186, + "epoch": 0.07008902173736146, + "grad_norm": 4.208265781402588, + "learning_rate": 9.556025161029786e-06, + "loss": 0.5976, + "mean_token_accuracy": 0.816322073340416, + "num_tokens": 27148060.0, + "step": 22610 + }, + { + "entropy": 1.958570632338524, + "epoch": 0.07012002086241116, + "grad_norm": 11.623967170715332, + "learning_rate": 9.553912538546687e-06, + "loss": 0.663, + "mean_token_accuracy": 0.8184066906571388, + "num_tokens": 27158765.0, + "step": 22620 + }, + { + "entropy": 1.8749097064137459, + "epoch": 0.07015101998746086, + "grad_norm": 10.970122337341309, + "learning_rate": 9.55180131660445e-06, + "loss": 0.5946, + "mean_token_accuracy": 0.81072598695755, + "num_tokens": 27170806.0, + "step": 22630 + }, + { + "entropy": 1.831792925298214, + "epoch": 0.07018201911251055, + "grad_norm": 11.485851287841797, + "learning_rate": 9.54969149365631e-06, + "loss": 0.5266, + "mean_token_accuracy": 0.8216899335384369, + "num_tokens": 27184157.0, + "step": 22640 + }, + { + "entropy": 1.9119883999228477, + "epoch": 0.07021301823756025, + "grad_norm": 11.170741081237793, + "learning_rate": 9.547583068157877e-06, + "loss": 0.6163, + "mean_token_accuracy": 0.8011090368032455, + "num_tokens": 27196415.0, + "step": 22650 + }, + { + "entropy": 1.8825739532709123, + "epoch": 0.07024401736260993, + "grad_norm": 8.26751708984375, + "learning_rate": 9.54547603856716e-06, + "loss": 0.5673, + "mean_token_accuracy": 0.8156488195061684, + "num_tokens": 27209042.0, + "step": 22660 + }, + { + "entropy": 1.9585013464093208, + "epoch": 0.07027501648765963, + "grad_norm": 13.17714786529541, + "learning_rate": 9.54337040334454e-06, + "loss": 0.66, + "mean_token_accuracy": 0.8024971842765808, + "num_tokens": 27219980.0, + "step": 22670 + }, + { + "entropy": 1.9092792928218842, + "epoch": 0.07030601561270933, + "grad_norm": 8.935144424438477, + "learning_rate": 9.54126616095278e-06, + "loss": 0.6069, + "mean_token_accuracy": 0.8157609686255455, + "num_tokens": 27231294.0, + "step": 22680 + }, + { + "entropy": 1.8577600359916686, + "epoch": 0.07033701473775902, + "grad_norm": 11.60800838470459, + "learning_rate": 9.539163309857014e-06, + "loss": 0.559, + "mean_token_accuracy": 0.8194443017244339, + "num_tokens": 27243917.0, + "step": 22690 + }, + { + "entropy": 1.9067883223295212, + "epoch": 0.07036801386280872, + "grad_norm": 9.336089134216309, + "learning_rate": 9.537061848524734e-06, + "loss": 0.6081, + "mean_token_accuracy": 0.8156686320900917, + "num_tokens": 27255668.0, + "step": 22700 + }, + { + "entropy": 1.8172156438231468, + "epoch": 0.07039901298785842, + "grad_norm": 5.0443644523620605, + "learning_rate": 9.534961775425811e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8269337728619576, + "num_tokens": 27269533.0, + "step": 22710 + }, + { + "entropy": 1.8444980800151825, + "epoch": 0.07043001211290811, + "grad_norm": 9.690264701843262, + "learning_rate": 9.532863089032457e-06, + "loss": 0.554, + "mean_token_accuracy": 0.8164748609066009, + "num_tokens": 27282775.0, + "step": 22720 + }, + { + "entropy": 2.0001364931464196, + "epoch": 0.07046101123795781, + "grad_norm": 11.46723747253418, + "learning_rate": 9.530765787819247e-06, + "loss": 0.6126, + "mean_token_accuracy": 0.8077256262302399, + "num_tokens": 27294314.0, + "step": 22730 + }, + { + "entropy": 2.0072741538286207, + "epoch": 0.0704920103630075, + "grad_norm": 9.978995323181152, + "learning_rate": 9.528669870263103e-06, + "loss": 0.6748, + "mean_token_accuracy": 0.7956588104367256, + "num_tokens": 27305909.0, + "step": 22740 + }, + { + "entropy": 2.031547136604786, + "epoch": 0.0705230094880572, + "grad_norm": 9.277776718139648, + "learning_rate": 9.526575334843284e-06, + "loss": 0.6595, + "mean_token_accuracy": 0.8040603443980217, + "num_tokens": 27317574.0, + "step": 22750 + }, + { + "entropy": 1.887120671570301, + "epoch": 0.0705540086131069, + "grad_norm": 10.967029571533203, + "learning_rate": 9.524482180041396e-06, + "loss": 0.5722, + "mean_token_accuracy": 0.809663151204586, + "num_tokens": 27330161.0, + "step": 22760 + }, + { + "entropy": 1.8155884474515915, + "epoch": 0.0705850077381566, + "grad_norm": 10.948394775390625, + "learning_rate": 9.522390404341375e-06, + "loss": 0.5795, + "mean_token_accuracy": 0.8259642213582993, + "num_tokens": 27343602.0, + "step": 22770 + }, + { + "entropy": 1.9294879660010338, + "epoch": 0.07061600686320628, + "grad_norm": 9.418436050415039, + "learning_rate": 9.52030000622949e-06, + "loss": 0.5636, + "mean_token_accuracy": 0.8182932585477829, + "num_tokens": 27355422.0, + "step": 22780 + }, + { + "entropy": 1.9679240599274634, + "epoch": 0.07064700598825598, + "grad_norm": 5.147764682769775, + "learning_rate": 9.51821098419433e-06, + "loss": 0.5615, + "mean_token_accuracy": 0.8287713721394538, + "num_tokens": 27367573.0, + "step": 22790 + }, + { + "entropy": 1.9291693687438964, + "epoch": 0.07067800511330567, + "grad_norm": 4.149558067321777, + "learning_rate": 9.516123336726806e-06, + "loss": 0.6414, + "mean_token_accuracy": 0.8070420354604722, + "num_tokens": 27379593.0, + "step": 22800 + }, + { + "entropy": 2.0455012649297712, + "epoch": 0.07070900423835537, + "grad_norm": 11.943114280700684, + "learning_rate": 9.514037062320148e-06, + "loss": 0.7181, + "mean_token_accuracy": 0.7937376946210861, + "num_tokens": 27390417.0, + "step": 22810 + }, + { + "entropy": 1.928147019445896, + "epoch": 0.07074000336340507, + "grad_norm": 10.236185073852539, + "learning_rate": 9.511952159469895e-06, + "loss": 0.5803, + "mean_token_accuracy": 0.8198776602745056, + "num_tokens": 27402550.0, + "step": 22820 + }, + { + "entropy": 2.0119119971990584, + "epoch": 0.07077100248845476, + "grad_norm": 8.378026962280273, + "learning_rate": 9.509868626673897e-06, + "loss": 0.6267, + "mean_token_accuracy": 0.821306724846363, + "num_tokens": 27413355.0, + "step": 22830 + }, + { + "entropy": 1.9993405610322952, + "epoch": 0.07080200161350446, + "grad_norm": 9.925045013427734, + "learning_rate": 9.507786462432295e-06, + "loss": 0.6015, + "mean_token_accuracy": 0.8195290580391884, + "num_tokens": 27424504.0, + "step": 22840 + }, + { + "entropy": 1.9374178424477577, + "epoch": 0.07083300073855416, + "grad_norm": 4.789980888366699, + "learning_rate": 9.505705665247544e-06, + "loss": 0.6441, + "mean_token_accuracy": 0.8104717791080475, + "num_tokens": 27436791.0, + "step": 22850 + }, + { + "entropy": 1.9383593767881393, + "epoch": 0.07086399986360385, + "grad_norm": 10.263440132141113, + "learning_rate": 9.503626233624376e-06, + "loss": 0.627, + "mean_token_accuracy": 0.8090912804007531, + "num_tokens": 27448858.0, + "step": 22860 + }, + { + "entropy": 2.0473968207836153, + "epoch": 0.07089499898865355, + "grad_norm": 12.426820755004883, + "learning_rate": 9.501548166069823e-06, + "loss": 0.7145, + "mean_token_accuracy": 0.7896327748894691, + "num_tokens": 27459568.0, + "step": 22870 + }, + { + "entropy": 1.9629662334918976, + "epoch": 0.07092599811370325, + "grad_norm": 9.21473503112793, + "learning_rate": 9.499471461093198e-06, + "loss": 0.6637, + "mean_token_accuracy": 0.8080029338598251, + "num_tokens": 27472413.0, + "step": 22880 + }, + { + "entropy": 1.9620086148381233, + "epoch": 0.07095699723875294, + "grad_norm": 11.438567161560059, + "learning_rate": 9.497396117206091e-06, + "loss": 0.6037, + "mean_token_accuracy": 0.8183909133076668, + "num_tokens": 27484336.0, + "step": 22890 + }, + { + "entropy": 1.9684356123209, + "epoch": 0.07098799636380264, + "grad_norm": 10.009490966796875, + "learning_rate": 9.49532213292237e-06, + "loss": 0.6147, + "mean_token_accuracy": 0.8186342790722847, + "num_tokens": 27495646.0, + "step": 22900 + }, + { + "entropy": 1.8952196419239045, + "epoch": 0.07101899548885232, + "grad_norm": 10.987327575683594, + "learning_rate": 9.493249506758174e-06, + "loss": 0.5656, + "mean_token_accuracy": 0.8256588101387023, + "num_tokens": 27507145.0, + "step": 22910 + }, + { + "entropy": 1.9499731242656708, + "epoch": 0.07104999461390202, + "grad_norm": 12.571517944335938, + "learning_rate": 9.491178237231904e-06, + "loss": 0.6393, + "mean_token_accuracy": 0.8107420906424523, + "num_tokens": 27518536.0, + "step": 22920 + }, + { + "entropy": 1.9733731657266618, + "epoch": 0.07108099373895171, + "grad_norm": 11.42594051361084, + "learning_rate": 9.48910832286423e-06, + "loss": 0.6315, + "mean_token_accuracy": 0.8104237765073776, + "num_tokens": 27529692.0, + "step": 22930 + }, + { + "entropy": 1.9351124539971352, + "epoch": 0.07111199286400141, + "grad_norm": 12.896414756774902, + "learning_rate": 9.48703976217807e-06, + "loss": 0.6321, + "mean_token_accuracy": 0.8083524718880654, + "num_tokens": 27541473.0, + "step": 22940 + }, + { + "entropy": 2.000078111886978, + "epoch": 0.07114299198905111, + "grad_norm": 10.5455904006958, + "learning_rate": 9.484972553698609e-06, + "loss": 0.7026, + "mean_token_accuracy": 0.7948870256543159, + "num_tokens": 27552542.0, + "step": 22950 + }, + { + "entropy": 1.87677208930254, + "epoch": 0.0711739911141008, + "grad_norm": 11.3195161819458, + "learning_rate": 9.482906695953262e-06, + "loss": 0.5929, + "mean_token_accuracy": 0.819133634865284, + "num_tokens": 27565263.0, + "step": 22960 + }, + { + "entropy": 1.8816159687936307, + "epoch": 0.0712049902391505, + "grad_norm": 5.181549072265625, + "learning_rate": 9.480842187471707e-06, + "loss": 0.5527, + "mean_token_accuracy": 0.8206304356455802, + "num_tokens": 27577618.0, + "step": 22970 + }, + { + "entropy": 1.8843361094594002, + "epoch": 0.0712359893642002, + "grad_norm": 9.86581039428711, + "learning_rate": 9.478779026785849e-06, + "loss": 0.5401, + "mean_token_accuracy": 0.8279527604579926, + "num_tokens": 27589842.0, + "step": 22980 + }, + { + "entropy": 1.9283671468496322, + "epoch": 0.0712669884892499, + "grad_norm": 8.319514274597168, + "learning_rate": 9.476717212429832e-06, + "loss": 0.6132, + "mean_token_accuracy": 0.8075576022267341, + "num_tokens": 27601806.0, + "step": 22990 + }, + { + "entropy": 1.9603848546743392, + "epoch": 0.07129798761429959, + "grad_norm": 10.907440185546875, + "learning_rate": 9.474656742940032e-06, + "loss": 0.6049, + "mean_token_accuracy": 0.817129735648632, + "num_tokens": 27613900.0, + "step": 23000 + }, + { + "entropy": 1.923836286365986, + "epoch": 0.07132898673934929, + "grad_norm": 10.593782424926758, + "learning_rate": 9.472597616855047e-06, + "loss": 0.6263, + "mean_token_accuracy": 0.8129079177975654, + "num_tokens": 27626469.0, + "step": 23010 + }, + { + "entropy": 1.9737991467118263, + "epoch": 0.07135998586439898, + "grad_norm": 9.002423286437988, + "learning_rate": 9.470539832715709e-06, + "loss": 0.6858, + "mean_token_accuracy": 0.8034425944089889, + "num_tokens": 27638941.0, + "step": 23020 + }, + { + "entropy": 2.0140700817108153, + "epoch": 0.07139098498944867, + "grad_norm": 9.269657135009766, + "learning_rate": 9.468483389065051e-06, + "loss": 0.7441, + "mean_token_accuracy": 0.7920542553067207, + "num_tokens": 27650333.0, + "step": 23030 + }, + { + "entropy": 1.9881491467356682, + "epoch": 0.07142198411449836, + "grad_norm": 9.53425407409668, + "learning_rate": 9.466428284448339e-06, + "loss": 0.605, + "mean_token_accuracy": 0.815938925743103, + "num_tokens": 27661706.0, + "step": 23040 + }, + { + "entropy": 1.9040154725313188, + "epoch": 0.07145298323954806, + "grad_norm": 11.15352725982666, + "learning_rate": 9.464374517413028e-06, + "loss": 0.5775, + "mean_token_accuracy": 0.8235550940036773, + "num_tokens": 27674006.0, + "step": 23050 + }, + { + "entropy": 1.8507097408175468, + "epoch": 0.07148398236459776, + "grad_norm": 4.413679599761963, + "learning_rate": 9.462322086508796e-06, + "loss": 0.613, + "mean_token_accuracy": 0.822106608748436, + "num_tokens": 27686803.0, + "step": 23060 + }, + { + "entropy": 1.8648405969142914, + "epoch": 0.07151498148964745, + "grad_norm": 8.988687515258789, + "learning_rate": 9.460270990287506e-06, + "loss": 0.623, + "mean_token_accuracy": 0.8131396234035492, + "num_tokens": 27699064.0, + "step": 23070 + }, + { + "entropy": 1.9689454659819603, + "epoch": 0.07154598061469715, + "grad_norm": 12.583850860595703, + "learning_rate": 9.458221227303229e-06, + "loss": 0.6461, + "mean_token_accuracy": 0.8088790848851204, + "num_tokens": 27710056.0, + "step": 23080 + }, + { + "entropy": 1.920731896162033, + "epoch": 0.07157697973974685, + "grad_norm": 7.8442254066467285, + "learning_rate": 9.456172796112224e-06, + "loss": 0.588, + "mean_token_accuracy": 0.8172019869089127, + "num_tokens": 27722078.0, + "step": 23090 + }, + { + "entropy": 1.8544889241456985, + "epoch": 0.07160797886479654, + "grad_norm": 4.247855186462402, + "learning_rate": 9.454125695272939e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8311522156000137, + "num_tokens": 27734016.0, + "step": 23100 + }, + { + "entropy": 1.905607244372368, + "epoch": 0.07163897798984624, + "grad_norm": 11.669919967651367, + "learning_rate": 9.452079923346001e-06, + "loss": 0.5869, + "mean_token_accuracy": 0.8104119807481766, + "num_tokens": 27745967.0, + "step": 23110 + }, + { + "entropy": 1.8860177457332612, + "epoch": 0.07166997711489594, + "grad_norm": 8.93431568145752, + "learning_rate": 9.45003547889422e-06, + "loss": 0.5465, + "mean_token_accuracy": 0.8315723642706871, + "num_tokens": 27758582.0, + "step": 23120 + }, + { + "entropy": 1.9184095725417136, + "epoch": 0.07170097623994563, + "grad_norm": 9.845928192138672, + "learning_rate": 9.447992360482587e-06, + "loss": 0.5926, + "mean_token_accuracy": 0.8231678128242492, + "num_tokens": 27770200.0, + "step": 23130 + }, + { + "entropy": 1.9659554988145829, + "epoch": 0.07173197536499533, + "grad_norm": 6.174232006072998, + "learning_rate": 9.445950566678251e-06, + "loss": 0.6704, + "mean_token_accuracy": 0.8078634425997734, + "num_tokens": 27781380.0, + "step": 23140 + }, + { + "entropy": 1.8729309558868408, + "epoch": 0.07176297449004501, + "grad_norm": 11.597721099853516, + "learning_rate": 9.443910096050535e-06, + "loss": 0.584, + "mean_token_accuracy": 0.8244254574179649, + "num_tokens": 27793522.0, + "step": 23150 + }, + { + "entropy": 1.9304191544651985, + "epoch": 0.07179397361509471, + "grad_norm": 4.9579596519470215, + "learning_rate": 9.441870947170925e-06, + "loss": 0.6446, + "mean_token_accuracy": 0.812780550122261, + "num_tokens": 27805347.0, + "step": 23160 + }, + { + "entropy": 1.9905481100082398, + "epoch": 0.0718249727401444, + "grad_norm": 8.615478515625, + "learning_rate": 9.439833118613064e-06, + "loss": 0.6513, + "mean_token_accuracy": 0.8182663396000862, + "num_tokens": 27816623.0, + "step": 23170 + }, + { + "entropy": 1.9626749917864799, + "epoch": 0.0718559718651941, + "grad_norm": 10.739019393920898, + "learning_rate": 9.437796608952747e-06, + "loss": 0.5874, + "mean_token_accuracy": 0.8244615375995636, + "num_tokens": 27828120.0, + "step": 23180 + }, + { + "entropy": 1.9498226195573807, + "epoch": 0.0718869709902438, + "grad_norm": 10.651679039001465, + "learning_rate": 9.435761416767925e-06, + "loss": 0.6655, + "mean_token_accuracy": 0.8038348644971848, + "num_tokens": 27839640.0, + "step": 23190 + }, + { + "entropy": 1.8966133877635003, + "epoch": 0.0719179701152935, + "grad_norm": 8.705821990966797, + "learning_rate": 9.433727540638685e-06, + "loss": 0.6362, + "mean_token_accuracy": 0.8108318716287612, + "num_tokens": 27851113.0, + "step": 23200 + }, + { + "entropy": 1.8680199176073073, + "epoch": 0.07194896924034319, + "grad_norm": 5.520404815673828, + "learning_rate": 9.431694979147263e-06, + "loss": 0.5776, + "mean_token_accuracy": 0.8188697442412376, + "num_tokens": 27862805.0, + "step": 23210 + }, + { + "entropy": 1.9000079199671744, + "epoch": 0.07197996836539289, + "grad_norm": 9.184073448181152, + "learning_rate": 9.429663730878031e-06, + "loss": 0.5792, + "mean_token_accuracy": 0.8288420543074608, + "num_tokens": 27875156.0, + "step": 23220 + }, + { + "entropy": 1.8455702617764473, + "epoch": 0.07201096749044258, + "grad_norm": 8.574012756347656, + "learning_rate": 9.427633794417493e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8355481848120689, + "num_tokens": 27888621.0, + "step": 23230 + }, + { + "entropy": 1.9175620287656785, + "epoch": 0.07204196661549228, + "grad_norm": 9.760550498962402, + "learning_rate": 9.425605168354284e-06, + "loss": 0.6302, + "mean_token_accuracy": 0.8198355168104172, + "num_tokens": 27899563.0, + "step": 23240 + }, + { + "entropy": 1.8596885740756988, + "epoch": 0.07207296574054198, + "grad_norm": 5.949073791503906, + "learning_rate": 9.423577851279158e-06, + "loss": 0.5272, + "mean_token_accuracy": 0.8252129018306732, + "num_tokens": 27913195.0, + "step": 23250 + }, + { + "entropy": 1.9171173721551895, + "epoch": 0.07210396486559167, + "grad_norm": 9.470051765441895, + "learning_rate": 9.421551841784998e-06, + "loss": 0.6161, + "mean_token_accuracy": 0.8131398126482964, + "num_tokens": 27925430.0, + "step": 23260 + }, + { + "entropy": 1.9633556708693505, + "epoch": 0.07213496399064137, + "grad_norm": 8.595085144042969, + "learning_rate": 9.4195271384668e-06, + "loss": 0.5934, + "mean_token_accuracy": 0.8224080309271813, + "num_tokens": 27936940.0, + "step": 23270 + }, + { + "entropy": 1.937751042842865, + "epoch": 0.07216596311569105, + "grad_norm": 5.916355609893799, + "learning_rate": 9.417503739921671e-06, + "loss": 0.6028, + "mean_token_accuracy": 0.8086767017841339, + "num_tokens": 27948799.0, + "step": 23280 + }, + { + "entropy": 1.9640158087015152, + "epoch": 0.07219696224074075, + "grad_norm": 10.401774406433105, + "learning_rate": 9.415481644748828e-06, + "loss": 0.6664, + "mean_token_accuracy": 0.8089988082647324, + "num_tokens": 27959298.0, + "step": 23290 + }, + { + "entropy": 1.8880821034312247, + "epoch": 0.07222796136579045, + "grad_norm": 10.773529052734375, + "learning_rate": 9.413460851549596e-06, + "loss": 0.6015, + "mean_token_accuracy": 0.808536796271801, + "num_tokens": 27971177.0, + "step": 23300 + }, + { + "entropy": 1.9368318811058998, + "epoch": 0.07225896049084014, + "grad_norm": 8.663378715515137, + "learning_rate": 9.411441358927392e-06, + "loss": 0.5692, + "mean_token_accuracy": 0.8213057667016983, + "num_tokens": 27983758.0, + "step": 23310 + }, + { + "entropy": 1.9660808324813843, + "epoch": 0.07228995961588984, + "grad_norm": 9.40843677520752, + "learning_rate": 9.40942316548774e-06, + "loss": 0.6292, + "mean_token_accuracy": 0.8099093735218048, + "num_tokens": 27994875.0, + "step": 23320 + }, + { + "entropy": 1.9370207205414771, + "epoch": 0.07232095874093954, + "grad_norm": 8.460911750793457, + "learning_rate": 9.407406269838248e-06, + "loss": 0.616, + "mean_token_accuracy": 0.8174383148550988, + "num_tokens": 28005892.0, + "step": 23330 + }, + { + "entropy": 1.9526633992791176, + "epoch": 0.07235195786598923, + "grad_norm": 9.294373512268066, + "learning_rate": 9.405390670588613e-06, + "loss": 0.6369, + "mean_token_accuracy": 0.7995718091726303, + "num_tokens": 28017932.0, + "step": 23340 + }, + { + "entropy": 1.9452801957726478, + "epoch": 0.07238295699103893, + "grad_norm": 9.387127876281738, + "learning_rate": 9.403376366350623e-06, + "loss": 0.6482, + "mean_token_accuracy": 0.8082485362887383, + "num_tokens": 28029626.0, + "step": 23350 + }, + { + "entropy": 1.8676959410309792, + "epoch": 0.07241395611608863, + "grad_norm": 9.851313591003418, + "learning_rate": 9.401363355738139e-06, + "loss": 0.5982, + "mean_token_accuracy": 0.8089207410812378, + "num_tokens": 28041602.0, + "step": 23360 + }, + { + "entropy": 1.8555365659296512, + "epoch": 0.07244495524113832, + "grad_norm": 9.005014419555664, + "learning_rate": 9.399351637367101e-06, + "loss": 0.5462, + "mean_token_accuracy": 0.8123884066939354, + "num_tokens": 28054923.0, + "step": 23370 + }, + { + "entropy": 1.8651632323861123, + "epoch": 0.07247595436618802, + "grad_norm": 4.899734020233154, + "learning_rate": 9.397341209855522e-06, + "loss": 0.5624, + "mean_token_accuracy": 0.8213669985532761, + "num_tokens": 28067132.0, + "step": 23380 + }, + { + "entropy": 1.9289941102266313, + "epoch": 0.07250695349123772, + "grad_norm": 5.1334228515625, + "learning_rate": 9.395332071823485e-06, + "loss": 0.6144, + "mean_token_accuracy": 0.8181710079312324, + "num_tokens": 28078505.0, + "step": 23390 + }, + { + "entropy": 1.8720559775829315, + "epoch": 0.0725379526162874, + "grad_norm": 10.077095031738281, + "learning_rate": 9.393324221893131e-06, + "loss": 0.5644, + "mean_token_accuracy": 0.818347430229187, + "num_tokens": 28090333.0, + "step": 23400 + }, + { + "entropy": 1.853566548228264, + "epoch": 0.0725689517413371, + "grad_norm": 4.725801467895508, + "learning_rate": 9.391317658688664e-06, + "loss": 0.5701, + "mean_token_accuracy": 0.8122278556227684, + "num_tokens": 28103819.0, + "step": 23410 + }, + { + "entropy": 1.9520633488893508, + "epoch": 0.07259995086638679, + "grad_norm": 9.427369117736816, + "learning_rate": 9.389312380836351e-06, + "loss": 0.6409, + "mean_token_accuracy": 0.8200425282120705, + "num_tokens": 28114738.0, + "step": 23420 + }, + { + "entropy": 1.8956316590309144, + "epoch": 0.07263094999143649, + "grad_norm": 7.797948360443115, + "learning_rate": 9.3873083869645e-06, + "loss": 0.5618, + "mean_token_accuracy": 0.8178161531686783, + "num_tokens": 28127067.0, + "step": 23430 + }, + { + "entropy": 1.9396205574274064, + "epoch": 0.07266194911648619, + "grad_norm": 10.093338012695312, + "learning_rate": 9.38530567570348e-06, + "loss": 0.6226, + "mean_token_accuracy": 0.8116014674305916, + "num_tokens": 28138551.0, + "step": 23440 + }, + { + "entropy": 2.0014042764902116, + "epoch": 0.07269294824153588, + "grad_norm": 8.427999496459961, + "learning_rate": 9.383304245685689e-06, + "loss": 0.6412, + "mean_token_accuracy": 0.8133781552314758, + "num_tokens": 28149878.0, + "step": 23450 + }, + { + "entropy": 1.9623918294906617, + "epoch": 0.07272394736658558, + "grad_norm": 8.854625701904297, + "learning_rate": 9.381304095545583e-06, + "loss": 0.6584, + "mean_token_accuracy": 0.8096672505140304, + "num_tokens": 28160988.0, + "step": 23460 + }, + { + "entropy": 1.9225846633315087, + "epoch": 0.07275494649163528, + "grad_norm": 12.70853042602539, + "learning_rate": 9.379305223919642e-06, + "loss": 0.6568, + "mean_token_accuracy": 0.8003104001283645, + "num_tokens": 28172035.0, + "step": 23470 + }, + { + "entropy": 1.8282870531082154, + "epoch": 0.07278594561668497, + "grad_norm": 8.768390655517578, + "learning_rate": 9.377307629446383e-06, + "loss": 0.5394, + "mean_token_accuracy": 0.8221458449959755, + "num_tokens": 28184093.0, + "step": 23480 + }, + { + "entropy": 1.8909123882651329, + "epoch": 0.07281694474173467, + "grad_norm": 20.700326919555664, + "learning_rate": 9.375311310766353e-06, + "loss": 0.5838, + "mean_token_accuracy": 0.8204225182533265, + "num_tokens": 28195961.0, + "step": 23490 + }, + { + "entropy": 1.8430864825844764, + "epoch": 0.07284794386678436, + "grad_norm": 5.11720609664917, + "learning_rate": 9.373316266522123e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8319470316171647, + "num_tokens": 28208526.0, + "step": 23500 + }, + { + "entropy": 1.8916525810956955, + "epoch": 0.07287894299183406, + "grad_norm": 11.949532508850098, + "learning_rate": 9.371322495358281e-06, + "loss": 0.6055, + "mean_token_accuracy": 0.8044298276305198, + "num_tokens": 28220641.0, + "step": 23510 + }, + { + "entropy": 1.979685339331627, + "epoch": 0.07290994211688374, + "grad_norm": 10.18620777130127, + "learning_rate": 9.369329995921444e-06, + "loss": 0.6482, + "mean_token_accuracy": 0.8135093718767166, + "num_tokens": 28231519.0, + "step": 23520 + }, + { + "entropy": 1.9537073016166686, + "epoch": 0.07294094124193344, + "grad_norm": 9.836045265197754, + "learning_rate": 9.36733876686023e-06, + "loss": 0.6507, + "mean_token_accuracy": 0.8180084466934204, + "num_tokens": 28242672.0, + "step": 23530 + }, + { + "entropy": 1.9465522095561028, + "epoch": 0.07297194036698314, + "grad_norm": 4.765397071838379, + "learning_rate": 9.365348806825274e-06, + "loss": 0.6548, + "mean_token_accuracy": 0.8062368303537368, + "num_tokens": 28254108.0, + "step": 23540 + }, + { + "entropy": 1.9817388027906417, + "epoch": 0.07300293949203283, + "grad_norm": 9.245866775512695, + "learning_rate": 9.36336011446921e-06, + "loss": 0.6389, + "mean_token_accuracy": 0.8076906755566597, + "num_tokens": 28265018.0, + "step": 23550 + }, + { + "entropy": 1.8746733576059342, + "epoch": 0.07303393861708253, + "grad_norm": 10.122679710388184, + "learning_rate": 9.36137268844668e-06, + "loss": 0.5419, + "mean_token_accuracy": 0.8259296610951423, + "num_tokens": 28277414.0, + "step": 23560 + }, + { + "entropy": 1.9215095147490502, + "epoch": 0.07306493774213223, + "grad_norm": 9.02968692779541, + "learning_rate": 9.359386527414325e-06, + "loss": 0.5584, + "mean_token_accuracy": 0.8263942644000053, + "num_tokens": 28289205.0, + "step": 23570 + }, + { + "entropy": 1.8805431053042412, + "epoch": 0.07309593686718192, + "grad_norm": 8.153913497924805, + "learning_rate": 9.35740163003077e-06, + "loss": 0.5737, + "mean_token_accuracy": 0.8194609686732293, + "num_tokens": 28300813.0, + "step": 23580 + }, + { + "entropy": 1.8591163650155067, + "epoch": 0.07312693599223162, + "grad_norm": 11.29405403137207, + "learning_rate": 9.355417994956647e-06, + "loss": 0.6421, + "mean_token_accuracy": 0.7990970879793167, + "num_tokens": 28313650.0, + "step": 23590 + }, + { + "entropy": 1.887549029290676, + "epoch": 0.07315793511728132, + "grad_norm": 4.2539472579956055, + "learning_rate": 9.353435620854559e-06, + "loss": 0.539, + "mean_token_accuracy": 0.8247372597455979, + "num_tokens": 28325816.0, + "step": 23600 + }, + { + "entropy": 1.966456551849842, + "epoch": 0.07318893424233101, + "grad_norm": 9.739566802978516, + "learning_rate": 9.3514545063891e-06, + "loss": 0.6504, + "mean_token_accuracy": 0.8081991970539093, + "num_tokens": 28337088.0, + "step": 23610 + }, + { + "entropy": 1.9840751081705092, + "epoch": 0.07321993336738071, + "grad_norm": 11.179567337036133, + "learning_rate": 9.349474650226844e-06, + "loss": 0.6291, + "mean_token_accuracy": 0.8135482758283615, + "num_tokens": 28347375.0, + "step": 23620 + }, + { + "entropy": 1.9408112317323685, + "epoch": 0.0732509324924304, + "grad_norm": 9.25151538848877, + "learning_rate": 9.347496051036333e-06, + "loss": 0.6223, + "mean_token_accuracy": 0.8124508634209633, + "num_tokens": 28358563.0, + "step": 23630 + }, + { + "entropy": 1.947693009674549, + "epoch": 0.0732819316174801, + "grad_norm": 9.259462356567383, + "learning_rate": 9.345518707488087e-06, + "loss": 0.6084, + "mean_token_accuracy": 0.8231876268982887, + "num_tokens": 28370399.0, + "step": 23640 + }, + { + "entropy": 1.846913254261017, + "epoch": 0.07331293074252979, + "grad_norm": 8.472787857055664, + "learning_rate": 9.343542618254596e-06, + "loss": 0.5402, + "mean_token_accuracy": 0.822886273264885, + "num_tokens": 28382814.0, + "step": 23650 + }, + { + "entropy": 1.848525333404541, + "epoch": 0.07334392986757948, + "grad_norm": 11.601837158203125, + "learning_rate": 9.341567782010304e-06, + "loss": 0.5389, + "mean_token_accuracy": 0.8181129440665245, + "num_tokens": 28395944.0, + "step": 23660 + }, + { + "entropy": 1.778299406170845, + "epoch": 0.07337492899262918, + "grad_norm": 10.347676277160645, + "learning_rate": 9.339594197431625e-06, + "loss": 0.548, + "mean_token_accuracy": 0.8195615544915199, + "num_tokens": 28409215.0, + "step": 23670 + }, + { + "entropy": 1.8473999500274658, + "epoch": 0.07340592811767888, + "grad_norm": 11.07874584197998, + "learning_rate": 9.337621863196925e-06, + "loss": 0.5769, + "mean_token_accuracy": 0.8226366356015206, + "num_tokens": 28421381.0, + "step": 23680 + }, + { + "entropy": 1.8470186293125153, + "epoch": 0.07343692724272857, + "grad_norm": 11.499205589294434, + "learning_rate": 9.335650777986526e-06, + "loss": 0.5876, + "mean_token_accuracy": 0.824992710351944, + "num_tokens": 28433681.0, + "step": 23690 + }, + { + "entropy": 1.9232969626784324, + "epoch": 0.07346792636777827, + "grad_norm": 11.156820297241211, + "learning_rate": 9.333680940482696e-06, + "loss": 0.5953, + "mean_token_accuracy": 0.8178783282637596, + "num_tokens": 28445840.0, + "step": 23700 + }, + { + "entropy": 1.9458463191986084, + "epoch": 0.07349892549282797, + "grad_norm": 5.818225860595703, + "learning_rate": 9.331712349369646e-06, + "loss": 0.6437, + "mean_token_accuracy": 0.8080273911356926, + "num_tokens": 28457345.0, + "step": 23710 + }, + { + "entropy": 1.9871521532535552, + "epoch": 0.07352992461787766, + "grad_norm": 9.459403991699219, + "learning_rate": 9.329745003333538e-06, + "loss": 0.6474, + "mean_token_accuracy": 0.8077614203095436, + "num_tokens": 28468578.0, + "step": 23720 + }, + { + "entropy": 1.9817648902535439, + "epoch": 0.07356092374292736, + "grad_norm": 8.348322868347168, + "learning_rate": 9.327778901062466e-06, + "loss": 0.6367, + "mean_token_accuracy": 0.8168795883655549, + "num_tokens": 28479788.0, + "step": 23730 + }, + { + "entropy": 1.9540872916579246, + "epoch": 0.07359192286797706, + "grad_norm": 8.266214370727539, + "learning_rate": 9.325814041246455e-06, + "loss": 0.6205, + "mean_token_accuracy": 0.8153672978281975, + "num_tokens": 28491967.0, + "step": 23740 + }, + { + "entropy": 1.960209146142006, + "epoch": 0.07362292199302675, + "grad_norm": 11.05349349975586, + "learning_rate": 9.323850422577469e-06, + "loss": 0.6019, + "mean_token_accuracy": 0.811015397310257, + "num_tokens": 28503930.0, + "step": 23750 + }, + { + "entropy": 1.9480343893170358, + "epoch": 0.07365392111807645, + "grad_norm": 10.689523696899414, + "learning_rate": 9.321888043749389e-06, + "loss": 0.5604, + "mean_token_accuracy": 0.8198656231164932, + "num_tokens": 28515451.0, + "step": 23760 + }, + { + "entropy": 1.994300290942192, + "epoch": 0.07368492024312613, + "grad_norm": 8.471319198608398, + "learning_rate": 9.319926903458033e-06, + "loss": 0.631, + "mean_token_accuracy": 0.8167888432741165, + "num_tokens": 28526353.0, + "step": 23770 + }, + { + "entropy": 1.9629247948527335, + "epoch": 0.07371591936817583, + "grad_norm": 10.409026145935059, + "learning_rate": 9.317967000401127e-06, + "loss": 0.6331, + "mean_token_accuracy": 0.8050680428743362, + "num_tokens": 28537484.0, + "step": 23780 + }, + { + "entropy": 1.8861139222979546, + "epoch": 0.07374691849322552, + "grad_norm": 8.942246437072754, + "learning_rate": 9.316008333278319e-06, + "loss": 0.5364, + "mean_token_accuracy": 0.8210712686181069, + "num_tokens": 28550244.0, + "step": 23790 + }, + { + "entropy": 1.867449736595154, + "epoch": 0.07377791761827522, + "grad_norm": 12.16384506225586, + "learning_rate": 9.314050900791163e-06, + "loss": 0.5982, + "mean_token_accuracy": 0.8131821945309639, + "num_tokens": 28562527.0, + "step": 23800 + }, + { + "entropy": 1.8669994980096818, + "epoch": 0.07380891674332492, + "grad_norm": 9.032928466796875, + "learning_rate": 9.312094701643134e-06, + "loss": 0.5414, + "mean_token_accuracy": 0.8275477036833763, + "num_tokens": 28575138.0, + "step": 23810 + }, + { + "entropy": 2.010842078924179, + "epoch": 0.07383991586837461, + "grad_norm": 9.27695369720459, + "learning_rate": 9.3101397345396e-06, + "loss": 0.635, + "mean_token_accuracy": 0.8104411855340004, + "num_tokens": 28586985.0, + "step": 23820 + }, + { + "entropy": 1.9500906214118003, + "epoch": 0.07387091499342431, + "grad_norm": 3.9518938064575195, + "learning_rate": 9.30818599818784e-06, + "loss": 0.5668, + "mean_token_accuracy": 0.8293194532394409, + "num_tokens": 28598926.0, + "step": 23830 + }, + { + "entropy": 1.9337919741868972, + "epoch": 0.07390191411847401, + "grad_norm": 10.783734321594238, + "learning_rate": 9.306233491297024e-06, + "loss": 0.5621, + "mean_token_accuracy": 0.8271838411688804, + "num_tokens": 28610265.0, + "step": 23840 + }, + { + "entropy": 1.8781434014439582, + "epoch": 0.0739329132435237, + "grad_norm": 4.472099781036377, + "learning_rate": 9.30428221257822e-06, + "loss": 0.5585, + "mean_token_accuracy": 0.8306781485676765, + "num_tokens": 28622824.0, + "step": 23850 + }, + { + "entropy": 1.9807551354169846, + "epoch": 0.0739639123685734, + "grad_norm": 11.769026756286621, + "learning_rate": 9.302332160744387e-06, + "loss": 0.6261, + "mean_token_accuracy": 0.8115956127643585, + "num_tokens": 28633746.0, + "step": 23860 + }, + { + "entropy": 1.9240091010928153, + "epoch": 0.0739949114936231, + "grad_norm": 11.441720962524414, + "learning_rate": 9.300383334510372e-06, + "loss": 0.6157, + "mean_token_accuracy": 0.8130579099059105, + "num_tokens": 28646025.0, + "step": 23870 + }, + { + "entropy": 1.9416207402944565, + "epoch": 0.0740259106186728, + "grad_norm": 9.635127067565918, + "learning_rate": 9.298435732592904e-06, + "loss": 0.5847, + "mean_token_accuracy": 0.8167257070541382, + "num_tokens": 28658461.0, + "step": 23880 + }, + { + "entropy": 1.9926012963056565, + "epoch": 0.07405690974372248, + "grad_norm": 9.58889102935791, + "learning_rate": 9.296489353710593e-06, + "loss": 0.6111, + "mean_token_accuracy": 0.8202344119548798, + "num_tokens": 28669851.0, + "step": 23890 + }, + { + "entropy": 1.917009449005127, + "epoch": 0.07408790886877217, + "grad_norm": 12.779633522033691, + "learning_rate": 9.294544196583929e-06, + "loss": 0.5674, + "mean_token_accuracy": 0.8271616026759148, + "num_tokens": 28682219.0, + "step": 23900 + }, + { + "entropy": 1.9119545385241508, + "epoch": 0.07411890799382187, + "grad_norm": 4.876058578491211, + "learning_rate": 9.29260025993527e-06, + "loss": 0.5712, + "mean_token_accuracy": 0.8157782420516014, + "num_tokens": 28694730.0, + "step": 23910 + }, + { + "entropy": 1.982969084382057, + "epoch": 0.07414990711887157, + "grad_norm": 4.22367525100708, + "learning_rate": 9.290657542488846e-06, + "loss": 0.5934, + "mean_token_accuracy": 0.8123695835471153, + "num_tokens": 28706446.0, + "step": 23920 + }, + { + "entropy": 1.965246671438217, + "epoch": 0.07418090624392126, + "grad_norm": 11.446246147155762, + "learning_rate": 9.28871604297075e-06, + "loss": 0.6086, + "mean_token_accuracy": 0.8144690200686455, + "num_tokens": 28718411.0, + "step": 23930 + }, + { + "entropy": 1.9847482353448869, + "epoch": 0.07421190536897096, + "grad_norm": 11.718853950500488, + "learning_rate": 9.28677576010895e-06, + "loss": 0.6281, + "mean_token_accuracy": 0.8073780536651611, + "num_tokens": 28730646.0, + "step": 23940 + }, + { + "entropy": 1.9997831106185913, + "epoch": 0.07424290449402066, + "grad_norm": 9.053925514221191, + "learning_rate": 9.284836692633257e-06, + "loss": 0.6105, + "mean_token_accuracy": 0.8186374083161354, + "num_tokens": 28741168.0, + "step": 23950 + }, + { + "entropy": 1.8356414943933488, + "epoch": 0.07427390361907035, + "grad_norm": 9.540495872497559, + "learning_rate": 9.282898839275347e-06, + "loss": 0.5233, + "mean_token_accuracy": 0.8257902413606644, + "num_tokens": 28755165.0, + "step": 23960 + }, + { + "entropy": 1.9523360833525658, + "epoch": 0.07430490274412005, + "grad_norm": 11.081901550292969, + "learning_rate": 9.280962198768745e-06, + "loss": 0.6224, + "mean_token_accuracy": 0.8144389301538467, + "num_tokens": 28766577.0, + "step": 23970 + }, + { + "entropy": 1.962296548485756, + "epoch": 0.07433590186916975, + "grad_norm": 9.727673530578613, + "learning_rate": 9.279026769848825e-06, + "loss": 0.6022, + "mean_token_accuracy": 0.8120512589812279, + "num_tokens": 28777780.0, + "step": 23980 + }, + { + "entropy": 1.9339822575449943, + "epoch": 0.07436690099421944, + "grad_norm": 12.94129467010498, + "learning_rate": 9.277092551252813e-06, + "loss": 0.6079, + "mean_token_accuracy": 0.8044057339429855, + "num_tokens": 28789644.0, + "step": 23990 + }, + { + "entropy": 1.9828555971384048, + "epoch": 0.07439790011926914, + "grad_norm": 11.137704849243164, + "learning_rate": 9.275159541719766e-06, + "loss": 0.6334, + "mean_token_accuracy": 0.8016148954629898, + "num_tokens": 28800391.0, + "step": 24000 + }, + { + "entropy": 1.8854010567069053, + "epoch": 0.07442889924431884, + "grad_norm": 9.98965835571289, + "learning_rate": 9.273227739990585e-06, + "loss": 0.5532, + "mean_token_accuracy": 0.8219901576638222, + "num_tokens": 28812899.0, + "step": 24010 + }, + { + "entropy": 1.9441875860095024, + "epoch": 0.07445989836936852, + "grad_norm": 9.923999786376953, + "learning_rate": 9.271297144808003e-06, + "loss": 0.637, + "mean_token_accuracy": 0.8106958836317062, + "num_tokens": 28824358.0, + "step": 24020 + }, + { + "entropy": 1.889090073108673, + "epoch": 0.07449089749441822, + "grad_norm": 8.807568550109863, + "learning_rate": 9.269367754916594e-06, + "loss": 0.5404, + "mean_token_accuracy": 0.83165952116251, + "num_tokens": 28835906.0, + "step": 24030 + }, + { + "entropy": 1.8731929019093514, + "epoch": 0.07452189661946791, + "grad_norm": 11.27502727508545, + "learning_rate": 9.267439569062747e-06, + "loss": 0.5855, + "mean_token_accuracy": 0.8130838423967361, + "num_tokens": 28848120.0, + "step": 24040 + }, + { + "entropy": 1.9439046129584312, + "epoch": 0.07455289574451761, + "grad_norm": 10.14338207244873, + "learning_rate": 9.26551258599468e-06, + "loss": 0.5926, + "mean_token_accuracy": 0.817124992609024, + "num_tokens": 28860425.0, + "step": 24050 + }, + { + "entropy": 1.8763557612895965, + "epoch": 0.0745838948695673, + "grad_norm": 9.435752868652344, + "learning_rate": 9.26358680446244e-06, + "loss": 0.5757, + "mean_token_accuracy": 0.82074084430933, + "num_tokens": 28873156.0, + "step": 24060 + }, + { + "entropy": 1.8747549593448638, + "epoch": 0.074614893994617, + "grad_norm": 12.689223289489746, + "learning_rate": 9.26166222321788e-06, + "loss": 0.5661, + "mean_token_accuracy": 0.817321227490902, + "num_tokens": 28885693.0, + "step": 24070 + }, + { + "entropy": 1.7927760377526283, + "epoch": 0.0746458931196667, + "grad_norm": 8.860281944274902, + "learning_rate": 9.25973884101468e-06, + "loss": 0.5196, + "mean_token_accuracy": 0.8320321753621102, + "num_tokens": 28898315.0, + "step": 24080 + }, + { + "entropy": 1.8106766402721406, + "epoch": 0.0746768922447164, + "grad_norm": 11.25741958618164, + "learning_rate": 9.257816656608314e-06, + "loss": 0.554, + "mean_token_accuracy": 0.8215843066573143, + "num_tokens": 28912120.0, + "step": 24090 + }, + { + "entropy": 1.951988846063614, + "epoch": 0.07470789136976609, + "grad_norm": 9.571369171142578, + "learning_rate": 9.25589566875608e-06, + "loss": 0.6411, + "mean_token_accuracy": 0.8124495342373848, + "num_tokens": 28924042.0, + "step": 24100 + }, + { + "entropy": 1.850806337594986, + "epoch": 0.07473889049481579, + "grad_norm": 9.427949905395508, + "learning_rate": 9.253975876217073e-06, + "loss": 0.5342, + "mean_token_accuracy": 0.8303135067224503, + "num_tokens": 28936552.0, + "step": 24110 + }, + { + "entropy": 1.9311017155647279, + "epoch": 0.07476988961986548, + "grad_norm": 5.728914260864258, + "learning_rate": 9.25205727775219e-06, + "loss": 0.6528, + "mean_token_accuracy": 0.8051609516143798, + "num_tokens": 28948713.0, + "step": 24120 + }, + { + "entropy": 1.9310711935162543, + "epoch": 0.07480088874491518, + "grad_norm": 11.202091217041016, + "learning_rate": 9.250139872124125e-06, + "loss": 0.6041, + "mean_token_accuracy": 0.8168716624379158, + "num_tokens": 28959606.0, + "step": 24130 + }, + { + "entropy": 1.8889696165919303, + "epoch": 0.07483188786996486, + "grad_norm": 3.5714454650878906, + "learning_rate": 9.248223658097366e-06, + "loss": 0.547, + "mean_token_accuracy": 0.8295445218682289, + "num_tokens": 28971784.0, + "step": 24140 + }, + { + "entropy": 1.9066083252429962, + "epoch": 0.07486288699501456, + "grad_norm": 15.336698532104492, + "learning_rate": 9.246308634438193e-06, + "loss": 0.6225, + "mean_token_accuracy": 0.814862783253193, + "num_tokens": 28983340.0, + "step": 24150 + }, + { + "entropy": 1.9325315952301025, + "epoch": 0.07489388612006426, + "grad_norm": 10.859710693359375, + "learning_rate": 9.244394799914674e-06, + "loss": 0.6306, + "mean_token_accuracy": 0.8129459515213966, + "num_tokens": 28995466.0, + "step": 24160 + }, + { + "entropy": 1.9299397438764572, + "epoch": 0.07492488524511395, + "grad_norm": 11.286776542663574, + "learning_rate": 9.242482153296657e-06, + "loss": 0.6152, + "mean_token_accuracy": 0.8135070979595185, + "num_tokens": 29007025.0, + "step": 24170 + }, + { + "entropy": 1.9263689830899238, + "epoch": 0.07495588437016365, + "grad_norm": 8.885149002075195, + "learning_rate": 9.240570693355777e-06, + "loss": 0.6203, + "mean_token_accuracy": 0.8149414286017418, + "num_tokens": 29019269.0, + "step": 24180 + }, + { + "entropy": 1.9538878142833709, + "epoch": 0.07498688349521335, + "grad_norm": 12.976593971252441, + "learning_rate": 9.238660418865444e-06, + "loss": 0.6581, + "mean_token_accuracy": 0.8068120896816253, + "num_tokens": 29030437.0, + "step": 24190 + }, + { + "entropy": 1.9112654134631157, + "epoch": 0.07501788262026304, + "grad_norm": 9.866148948669434, + "learning_rate": 9.236751328600838e-06, + "loss": 0.5783, + "mean_token_accuracy": 0.8209733456373215, + "num_tokens": 29042237.0, + "step": 24200 + }, + { + "entropy": 2.0098152339458464, + "epoch": 0.07504888174531274, + "grad_norm": 9.860359191894531, + "learning_rate": 9.234843421338919e-06, + "loss": 0.691, + "mean_token_accuracy": 0.7986131757497787, + "num_tokens": 29053227.0, + "step": 24210 + }, + { + "entropy": 1.914099135249853, + "epoch": 0.07507988087036244, + "grad_norm": 3.684596300125122, + "learning_rate": 9.232936695858406e-06, + "loss": 0.5496, + "mean_token_accuracy": 0.8275950834155082, + "num_tokens": 29066785.0, + "step": 24220 + }, + { + "entropy": 1.8749552696943284, + "epoch": 0.07511087999541213, + "grad_norm": 9.360295295715332, + "learning_rate": 9.231031150939787e-06, + "loss": 0.5658, + "mean_token_accuracy": 0.8239286541938782, + "num_tokens": 29080125.0, + "step": 24230 + }, + { + "entropy": 1.9456191077828406, + "epoch": 0.07514187912046183, + "grad_norm": 9.981027603149414, + "learning_rate": 9.229126785365307e-06, + "loss": 0.6293, + "mean_token_accuracy": 0.8097566932439804, + "num_tokens": 29091466.0, + "step": 24240 + }, + { + "entropy": 1.9858911633491516, + "epoch": 0.07517287824551153, + "grad_norm": 10.95433521270752, + "learning_rate": 9.227223597918977e-06, + "loss": 0.6356, + "mean_token_accuracy": 0.8043408066034317, + "num_tokens": 29102658.0, + "step": 24250 + }, + { + "entropy": 1.9025256112217903, + "epoch": 0.07520387737056121, + "grad_norm": 4.23534631729126, + "learning_rate": 9.225321587386555e-06, + "loss": 0.5672, + "mean_token_accuracy": 0.8235425055027008, + "num_tokens": 29115098.0, + "step": 24260 + }, + { + "entropy": 1.9474715083837508, + "epoch": 0.0752348764956109, + "grad_norm": 8.94451904296875, + "learning_rate": 9.22342075255555e-06, + "loss": 0.5524, + "mean_token_accuracy": 0.82459936439991, + "num_tokens": 29127073.0, + "step": 24270 + }, + { + "entropy": 2.0239467322826385, + "epoch": 0.0752658756206606, + "grad_norm": 9.617466926574707, + "learning_rate": 9.221521092215226e-06, + "loss": 0.6738, + "mean_token_accuracy": 0.8019678488373756, + "num_tokens": 29137792.0, + "step": 24280 + }, + { + "entropy": 1.9810176610946655, + "epoch": 0.0752968747457103, + "grad_norm": 9.926680564880371, + "learning_rate": 9.219622605156583e-06, + "loss": 0.6936, + "mean_token_accuracy": 0.8082947731018066, + "num_tokens": 29148654.0, + "step": 24290 + }, + { + "entropy": 1.9755667805671693, + "epoch": 0.07532787387076, + "grad_norm": 9.290923118591309, + "learning_rate": 9.217725290172373e-06, + "loss": 0.6299, + "mean_token_accuracy": 0.8143254667520523, + "num_tokens": 29159751.0, + "step": 24300 + }, + { + "entropy": 1.9506172388792038, + "epoch": 0.07535887299580969, + "grad_norm": 4.7398362159729, + "learning_rate": 9.215829146057074e-06, + "loss": 0.6397, + "mean_token_accuracy": 0.8142923697829246, + "num_tokens": 29171860.0, + "step": 24310 + }, + { + "entropy": 1.964981135725975, + "epoch": 0.07538987212085939, + "grad_norm": 10.409062385559082, + "learning_rate": 9.213934171606911e-06, + "loss": 0.585, + "mean_token_accuracy": 0.8187711656093597, + "num_tokens": 29183756.0, + "step": 24320 + }, + { + "entropy": 1.8626610353589057, + "epoch": 0.07542087124590909, + "grad_norm": 16.499589920043945, + "learning_rate": 9.212040365619834e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8303863450884819, + "num_tokens": 29196691.0, + "step": 24330 + }, + { + "entropy": 1.9400402843952178, + "epoch": 0.07545187037095878, + "grad_norm": 10.061908721923828, + "learning_rate": 9.210147726895522e-06, + "loss": 0.6201, + "mean_token_accuracy": 0.8127806216478348, + "num_tokens": 29208279.0, + "step": 24340 + }, + { + "entropy": 1.9271167308092116, + "epoch": 0.07548286949600848, + "grad_norm": 4.295543193817139, + "learning_rate": 9.208256254235383e-06, + "loss": 0.5521, + "mean_token_accuracy": 0.8224424123764038, + "num_tokens": 29219803.0, + "step": 24350 + }, + { + "entropy": 1.9891034051775933, + "epoch": 0.07551386862105817, + "grad_norm": 12.840479850769043, + "learning_rate": 9.206365946442545e-06, + "loss": 0.6032, + "mean_token_accuracy": 0.8187379658222198, + "num_tokens": 29230787.0, + "step": 24360 + }, + { + "entropy": 1.9009444043040276, + "epoch": 0.07554486774610787, + "grad_norm": 10.091814041137695, + "learning_rate": 9.204476802321853e-06, + "loss": 0.5682, + "mean_token_accuracy": 0.8159004956483841, + "num_tokens": 29243031.0, + "step": 24370 + }, + { + "entropy": 1.9343391075730323, + "epoch": 0.07557586687115757, + "grad_norm": 11.57731819152832, + "learning_rate": 9.202588820679873e-06, + "loss": 0.5871, + "mean_token_accuracy": 0.8243474781513214, + "num_tokens": 29254685.0, + "step": 24380 + }, + { + "entropy": 1.795560409873724, + "epoch": 0.07560686599620725, + "grad_norm": 2.2424561977386475, + "learning_rate": 9.20070200032488e-06, + "loss": 0.5271, + "mean_token_accuracy": 0.8258658021688461, + "num_tokens": 29268202.0, + "step": 24390 + }, + { + "entropy": 1.896498514711857, + "epoch": 0.07563786512125695, + "grad_norm": 11.616437911987305, + "learning_rate": 9.198816340066862e-06, + "loss": 0.6379, + "mean_token_accuracy": 0.8165041267871856, + "num_tokens": 29280463.0, + "step": 24400 + }, + { + "entropy": 1.9246143698692322, + "epoch": 0.07566886424630664, + "grad_norm": 10.278308868408203, + "learning_rate": 9.19693183871751e-06, + "loss": 0.5943, + "mean_token_accuracy": 0.8151498556137085, + "num_tokens": 29292665.0, + "step": 24410 + }, + { + "entropy": 2.0228449046611785, + "epoch": 0.07569986337135634, + "grad_norm": 9.317980766296387, + "learning_rate": 9.19504849509022e-06, + "loss": 0.6758, + "mean_token_accuracy": 0.7997980415821075, + "num_tokens": 29304324.0, + "step": 24420 + }, + { + "entropy": 1.9563984453678132, + "epoch": 0.07573086249640604, + "grad_norm": 4.3756422996521, + "learning_rate": 9.19316630800009e-06, + "loss": 0.6041, + "mean_token_accuracy": 0.8073103711009025, + "num_tokens": 29316485.0, + "step": 24430 + }, + { + "entropy": 1.9286874875426292, + "epoch": 0.07576186162145573, + "grad_norm": 9.686562538146973, + "learning_rate": 9.191285276263909e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.8244198396801948, + "num_tokens": 29329320.0, + "step": 24440 + }, + { + "entropy": 1.9644469603896142, + "epoch": 0.07579286074650543, + "grad_norm": 8.78291130065918, + "learning_rate": 9.18940539870017e-06, + "loss": 0.5696, + "mean_token_accuracy": 0.8178132340312004, + "num_tokens": 29340967.0, + "step": 24450 + }, + { + "entropy": 1.981376151740551, + "epoch": 0.07582385987155513, + "grad_norm": 10.671409606933594, + "learning_rate": 9.187526674129046e-06, + "loss": 0.6293, + "mean_token_accuracy": 0.807844452559948, + "num_tokens": 29353256.0, + "step": 24460 + }, + { + "entropy": 1.9730531215667724, + "epoch": 0.07585485899660482, + "grad_norm": 9.958746910095215, + "learning_rate": 9.185649101372406e-06, + "loss": 0.5865, + "mean_token_accuracy": 0.8276485189795494, + "num_tokens": 29363717.0, + "step": 24470 + }, + { + "entropy": 1.9136406406760216, + "epoch": 0.07588585812165452, + "grad_norm": 5.672607898712158, + "learning_rate": 9.1837726792538e-06, + "loss": 0.5799, + "mean_token_accuracy": 0.8139689520001412, + "num_tokens": 29375747.0, + "step": 24480 + }, + { + "entropy": 1.8405794501304626, + "epoch": 0.07591685724670422, + "grad_norm": 9.598546028137207, + "learning_rate": 9.18189740659846e-06, + "loss": 0.5324, + "mean_token_accuracy": 0.825704300403595, + "num_tokens": 29388350.0, + "step": 24490 + }, + { + "entropy": 1.9111091524362565, + "epoch": 0.07594785637175391, + "grad_norm": 11.881855964660645, + "learning_rate": 9.180023282233297e-06, + "loss": 0.6201, + "mean_token_accuracy": 0.8082716703414917, + "num_tokens": 29400465.0, + "step": 24500 + }, + { + "entropy": 1.9688442632555962, + "epoch": 0.0759788554968036, + "grad_norm": 9.742838859558105, + "learning_rate": 9.178150304986897e-06, + "loss": 0.5825, + "mean_token_accuracy": 0.8214067354798317, + "num_tokens": 29411568.0, + "step": 24510 + }, + { + "entropy": 1.8937147334218025, + "epoch": 0.07600985462185329, + "grad_norm": 10.891999244689941, + "learning_rate": 9.17627847368952e-06, + "loss": 0.5447, + "mean_token_accuracy": 0.8176762789487839, + "num_tokens": 29424053.0, + "step": 24520 + }, + { + "entropy": 1.831982211768627, + "epoch": 0.07604085374690299, + "grad_norm": 9.699090003967285, + "learning_rate": 9.174407787173092e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8441127195954323, + "num_tokens": 29436498.0, + "step": 24530 + }, + { + "entropy": 1.829953595995903, + "epoch": 0.07607185287195269, + "grad_norm": 10.976648330688477, + "learning_rate": 9.172538244271205e-06, + "loss": 0.5677, + "mean_token_accuracy": 0.8100987046957016, + "num_tokens": 29449302.0, + "step": 24540 + }, + { + "entropy": 1.8781290456652642, + "epoch": 0.07610285199700238, + "grad_norm": 10.623594284057617, + "learning_rate": 9.170669843819118e-06, + "loss": 0.6207, + "mean_token_accuracy": 0.8065455496311188, + "num_tokens": 29461309.0, + "step": 24550 + }, + { + "entropy": 1.9357499971985817, + "epoch": 0.07613385112205208, + "grad_norm": 5.401109218597412, + "learning_rate": 9.16880258465375e-06, + "loss": 0.6238, + "mean_token_accuracy": 0.809730452299118, + "num_tokens": 29473381.0, + "step": 24560 + }, + { + "entropy": 1.9581341043114662, + "epoch": 0.07616485024710178, + "grad_norm": 9.8287935256958, + "learning_rate": 9.166936465613671e-06, + "loss": 0.606, + "mean_token_accuracy": 0.8168662905693054, + "num_tokens": 29484514.0, + "step": 24570 + }, + { + "entropy": 1.8992894351482392, + "epoch": 0.07619584937215147, + "grad_norm": 10.469278335571289, + "learning_rate": 9.165071485539113e-06, + "loss": 0.5448, + "mean_token_accuracy": 0.8262490287423134, + "num_tokens": 29496684.0, + "step": 24580 + }, + { + "entropy": 1.856746034324169, + "epoch": 0.07622684849720117, + "grad_norm": 7.977566719055176, + "learning_rate": 9.163207643271953e-06, + "loss": 0.5472, + "mean_token_accuracy": 0.8371643677353859, + "num_tokens": 29508911.0, + "step": 24590 + }, + { + "entropy": 1.8318717867136, + "epoch": 0.07625784762225087, + "grad_norm": 7.776499271392822, + "learning_rate": 9.161344937655717e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.8247979655861855, + "num_tokens": 29522385.0, + "step": 24600 + }, + { + "entropy": 1.9623229920864105, + "epoch": 0.07628884674730056, + "grad_norm": 11.194042205810547, + "learning_rate": 9.159483367535581e-06, + "loss": 0.5956, + "mean_token_accuracy": 0.8133369132876396, + "num_tokens": 29533515.0, + "step": 24610 + }, + { + "entropy": 1.8496048122644424, + "epoch": 0.07631984587235026, + "grad_norm": 9.94607162475586, + "learning_rate": 9.157622931758355e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8247012510895729, + "num_tokens": 29546981.0, + "step": 24620 + }, + { + "entropy": 1.8561659142374993, + "epoch": 0.07635084499739994, + "grad_norm": 4.343288421630859, + "learning_rate": 9.155763629172494e-06, + "loss": 0.5875, + "mean_token_accuracy": 0.8228087961673737, + "num_tokens": 29559906.0, + "step": 24630 + }, + { + "entropy": 1.8643113687634467, + "epoch": 0.07638184412244964, + "grad_norm": 3.739152193069458, + "learning_rate": 9.153905458628086e-06, + "loss": 0.5515, + "mean_token_accuracy": 0.8250072717666626, + "num_tokens": 29572877.0, + "step": 24640 + }, + { + "entropy": 1.9850792646408082, + "epoch": 0.07641284324749933, + "grad_norm": 11.358891487121582, + "learning_rate": 9.152048418976852e-06, + "loss": 0.656, + "mean_token_accuracy": 0.8045062452554703, + "num_tokens": 29584254.0, + "step": 24650 + }, + { + "entropy": 1.930943602323532, + "epoch": 0.07644384237254903, + "grad_norm": 9.818647384643555, + "learning_rate": 9.150192509072147e-06, + "loss": 0.5624, + "mean_token_accuracy": 0.811825430393219, + "num_tokens": 29596694.0, + "step": 24660 + }, + { + "entropy": 1.963192854821682, + "epoch": 0.07647484149759873, + "grad_norm": 10.888368606567383, + "learning_rate": 9.148337727768948e-06, + "loss": 0.6148, + "mean_token_accuracy": 0.8172325447201729, + "num_tokens": 29607626.0, + "step": 24670 + }, + { + "entropy": 1.8728375658392906, + "epoch": 0.07650584062264842, + "grad_norm": 9.656998634338379, + "learning_rate": 9.146484073923858e-06, + "loss": 0.6045, + "mean_token_accuracy": 0.8192928373813629, + "num_tokens": 29620518.0, + "step": 24680 + }, + { + "entropy": 1.9853510811924935, + "epoch": 0.07653683974769812, + "grad_norm": 9.287243843078613, + "learning_rate": 9.144631546395098e-06, + "loss": 0.6036, + "mean_token_accuracy": 0.814111416041851, + "num_tokens": 29632155.0, + "step": 24690 + }, + { + "entropy": 1.974936306476593, + "epoch": 0.07656783887274782, + "grad_norm": 9.676168441772461, + "learning_rate": 9.142780144042515e-06, + "loss": 0.6892, + "mean_token_accuracy": 0.8018039375543594, + "num_tokens": 29643855.0, + "step": 24700 + }, + { + "entropy": 1.9820726066827774, + "epoch": 0.07659883799779751, + "grad_norm": 10.02213191986084, + "learning_rate": 9.140929865727566e-06, + "loss": 0.5892, + "mean_token_accuracy": 0.8112702906131745, + "num_tokens": 29655657.0, + "step": 24710 + }, + { + "entropy": 1.909799675643444, + "epoch": 0.07662983712284721, + "grad_norm": 9.104336738586426, + "learning_rate": 9.139080710313316e-06, + "loss": 0.5391, + "mean_token_accuracy": 0.8222946509718895, + "num_tokens": 29667862.0, + "step": 24720 + }, + { + "entropy": 1.992139181494713, + "epoch": 0.07666083624789691, + "grad_norm": 4.088809013366699, + "learning_rate": 9.137232676664449e-06, + "loss": 0.66, + "mean_token_accuracy": 0.7951893359422684, + "num_tokens": 29679790.0, + "step": 24730 + }, + { + "entropy": 1.9576111137866974, + "epoch": 0.0766918353729466, + "grad_norm": 11.400434494018555, + "learning_rate": 9.135385763647246e-06, + "loss": 0.6328, + "mean_token_accuracy": 0.8115091070532798, + "num_tokens": 29691260.0, + "step": 24740 + }, + { + "entropy": 1.8499963372945785, + "epoch": 0.0767228344979963, + "grad_norm": 9.526920318603516, + "learning_rate": 9.1335399701296e-06, + "loss": 0.5439, + "mean_token_accuracy": 0.8311129599809647, + "num_tokens": 29704733.0, + "step": 24750 + }, + { + "entropy": 1.848706914484501, + "epoch": 0.07675383362304598, + "grad_norm": 9.934120178222656, + "learning_rate": 9.131695294980995e-06, + "loss": 0.5474, + "mean_token_accuracy": 0.831969042122364, + "num_tokens": 29717478.0, + "step": 24760 + }, + { + "entropy": 1.912044520676136, + "epoch": 0.07678483274809568, + "grad_norm": 10.17774772644043, + "learning_rate": 9.129851737072522e-06, + "loss": 0.63, + "mean_token_accuracy": 0.8104033455252647, + "num_tokens": 29728251.0, + "step": 24770 + }, + { + "entropy": 2.015045040845871, + "epoch": 0.07681583187314538, + "grad_norm": 7.955008506774902, + "learning_rate": 9.128009295276862e-06, + "loss": 0.6655, + "mean_token_accuracy": 0.8075503215193749, + "num_tokens": 29739613.0, + "step": 24780 + }, + { + "entropy": 1.8380137085914612, + "epoch": 0.07684683099819507, + "grad_norm": 6.014031887054443, + "learning_rate": 9.126167968468289e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.8174324378371238, + "num_tokens": 29752547.0, + "step": 24790 + }, + { + "entropy": 2.001574045419693, + "epoch": 0.07687783012324477, + "grad_norm": 10.860368728637695, + "learning_rate": 9.124327755522661e-06, + "loss": 0.6676, + "mean_token_accuracy": 0.808248932659626, + "num_tokens": 29762867.0, + "step": 24800 + }, + { + "entropy": 2.0134127497673036, + "epoch": 0.07690882924829447, + "grad_norm": 9.547008514404297, + "learning_rate": 9.122488655317434e-06, + "loss": 0.7027, + "mean_token_accuracy": 0.804673321545124, + "num_tokens": 29773812.0, + "step": 24810 + }, + { + "entropy": 1.8855068862438202, + "epoch": 0.07693982837334416, + "grad_norm": 11.452570915222168, + "learning_rate": 9.120650666731632e-06, + "loss": 0.56, + "mean_token_accuracy": 0.8221283480525017, + "num_tokens": 29785964.0, + "step": 24820 + }, + { + "entropy": 1.8937954485416413, + "epoch": 0.07697082749839386, + "grad_norm": 9.164377212524414, + "learning_rate": 9.118813788645872e-06, + "loss": 0.6028, + "mean_token_accuracy": 0.8206618830561638, + "num_tokens": 29797873.0, + "step": 24830 + }, + { + "entropy": 1.8926649376749993, + "epoch": 0.07700182662344356, + "grad_norm": 10.76089096069336, + "learning_rate": 9.116978019942341e-06, + "loss": 0.5414, + "mean_token_accuracy": 0.8190899342298508, + "num_tokens": 29809898.0, + "step": 24840 + }, + { + "entropy": 1.739300973713398, + "epoch": 0.07703282574849325, + "grad_norm": 4.010213851928711, + "learning_rate": 9.115143359504806e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8370007425546646, + "num_tokens": 29824298.0, + "step": 24850 + }, + { + "entropy": 1.8673224583268166, + "epoch": 0.07706382487354295, + "grad_norm": 7.60786247253418, + "learning_rate": 9.113309806218598e-06, + "loss": 0.5623, + "mean_token_accuracy": 0.8228920385241508, + "num_tokens": 29836384.0, + "step": 24860 + }, + { + "entropy": 1.8416576564311982, + "epoch": 0.07709482399859265, + "grad_norm": 10.18614387512207, + "learning_rate": 9.111477358970625e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.8315700188279151, + "num_tokens": 29848465.0, + "step": 24870 + }, + { + "entropy": 1.833722086250782, + "epoch": 0.07712582312364233, + "grad_norm": 9.657119750976562, + "learning_rate": 9.109646016649358e-06, + "loss": 0.5351, + "mean_token_accuracy": 0.822961401939392, + "num_tokens": 29860315.0, + "step": 24880 + }, + { + "entropy": 1.9155391678214073, + "epoch": 0.07715682224869203, + "grad_norm": 9.408291816711426, + "learning_rate": 9.107815778144829e-06, + "loss": 0.6015, + "mean_token_accuracy": 0.8233883902430534, + "num_tokens": 29871950.0, + "step": 24890 + }, + { + "entropy": 1.8746279999613762, + "epoch": 0.07718782137374172, + "grad_norm": 9.93213176727295, + "learning_rate": 9.105986642348637e-06, + "loss": 0.6101, + "mean_token_accuracy": 0.8215365827083587, + "num_tokens": 29883850.0, + "step": 24900 + }, + { + "entropy": 1.997341087460518, + "epoch": 0.07721882049879142, + "grad_norm": 10.22244930267334, + "learning_rate": 9.104158608153925e-06, + "loss": 0.6552, + "mean_token_accuracy": 0.8083247482776642, + "num_tokens": 29894752.0, + "step": 24910 + }, + { + "entropy": 1.9021727859973907, + "epoch": 0.07724981962384112, + "grad_norm": 10.58688735961914, + "learning_rate": 9.10233167445541e-06, + "loss": 0.5976, + "mean_token_accuracy": 0.8210710018873215, + "num_tokens": 29906184.0, + "step": 24920 + }, + { + "entropy": 1.9085007548332213, + "epoch": 0.07728081874889081, + "grad_norm": 10.130495071411133, + "learning_rate": 9.100505840149343e-06, + "loss": 0.5777, + "mean_token_accuracy": 0.8177463442087174, + "num_tokens": 29918911.0, + "step": 24930 + }, + { + "entropy": 1.949122653901577, + "epoch": 0.07731181787394051, + "grad_norm": 10.247212409973145, + "learning_rate": 9.09868110413354e-06, + "loss": 0.5698, + "mean_token_accuracy": 0.8257071673870087, + "num_tokens": 29930418.0, + "step": 24940 + }, + { + "entropy": 1.8943613111972808, + "epoch": 0.0773428169989902, + "grad_norm": 8.908804893493652, + "learning_rate": 9.096857465307348e-06, + "loss": 0.5782, + "mean_token_accuracy": 0.8173254355788231, + "num_tokens": 29942796.0, + "step": 24950 + }, + { + "entropy": 1.8852560982108115, + "epoch": 0.0773738161240399, + "grad_norm": 9.335540771484375, + "learning_rate": 9.095034922571667e-06, + "loss": 0.5485, + "mean_token_accuracy": 0.8281083196401596, + "num_tokens": 29954762.0, + "step": 24960 + }, + { + "entropy": 1.9339411437511445, + "epoch": 0.0774048152490896, + "grad_norm": 8.73486042022705, + "learning_rate": 9.093213474828937e-06, + "loss": 0.6839, + "mean_token_accuracy": 0.8056422159075737, + "num_tokens": 29966209.0, + "step": 24970 + }, + { + "entropy": 1.8483040496706962, + "epoch": 0.0774358143741393, + "grad_norm": 16.429473876953125, + "learning_rate": 9.091393120983134e-06, + "loss": 0.5784, + "mean_token_accuracy": 0.8213301405310631, + "num_tokens": 29979277.0, + "step": 24980 + }, + { + "entropy": 1.9246578827500342, + "epoch": 0.07746681349918899, + "grad_norm": 11.819490432739258, + "learning_rate": 9.089573859939769e-06, + "loss": 0.6234, + "mean_token_accuracy": 0.8150523856282235, + "num_tokens": 29991214.0, + "step": 24990 + }, + { + "entropy": 1.8754414036870002, + "epoch": 0.07749781262423869, + "grad_norm": 3.053318500518799, + "learning_rate": 9.087755690605889e-06, + "loss": 0.5299, + "mean_token_accuracy": 0.8337288603186608, + "num_tokens": 30004088.0, + "step": 25000 + }, + { + "entropy": 1.958650803565979, + "epoch": 0.07752881174928837, + "grad_norm": 9.075653076171875, + "learning_rate": 9.085938611890065e-06, + "loss": 0.6506, + "mean_token_accuracy": 0.8138946458697319, + "num_tokens": 30015387.0, + "step": 25010 + }, + { + "entropy": 1.831257238984108, + "epoch": 0.07755981087433807, + "grad_norm": 7.205061912536621, + "learning_rate": 9.084122622702402e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.8284579709172248, + "num_tokens": 30028069.0, + "step": 25020 + }, + { + "entropy": 1.921176479756832, + "epoch": 0.07759080999938776, + "grad_norm": 9.383119583129883, + "learning_rate": 9.082307721954523e-06, + "loss": 0.5628, + "mean_token_accuracy": 0.8308942928910256, + "num_tokens": 30039720.0, + "step": 25030 + }, + { + "entropy": 1.881229367852211, + "epoch": 0.07762180912443746, + "grad_norm": 9.154582023620605, + "learning_rate": 9.080493908559574e-06, + "loss": 0.5715, + "mean_token_accuracy": 0.8158265233039856, + "num_tokens": 30052310.0, + "step": 25040 + }, + { + "entropy": 1.9333179131150247, + "epoch": 0.07765280824948716, + "grad_norm": 10.319096565246582, + "learning_rate": 9.078681181432226e-06, + "loss": 0.656, + "mean_token_accuracy": 0.8071909308433532, + "num_tokens": 30063377.0, + "step": 25050 + }, + { + "entropy": 1.9245691820979118, + "epoch": 0.07768380737453685, + "grad_norm": 9.286566734313965, + "learning_rate": 9.076869539488652e-06, + "loss": 0.5793, + "mean_token_accuracy": 0.8230229288339614, + "num_tokens": 30074509.0, + "step": 25060 + }, + { + "entropy": 1.9185208335518837, + "epoch": 0.07771480649958655, + "grad_norm": 10.67673110961914, + "learning_rate": 9.075058981646555e-06, + "loss": 0.6388, + "mean_token_accuracy": 0.810713978111744, + "num_tokens": 30086905.0, + "step": 25070 + }, + { + "entropy": 1.9412970080971719, + "epoch": 0.07774580562463625, + "grad_norm": 8.668137550354004, + "learning_rate": 9.073249506825138e-06, + "loss": 0.5973, + "mean_token_accuracy": 0.8281246423721313, + "num_tokens": 30098285.0, + "step": 25080 + }, + { + "entropy": 1.9020920038223266, + "epoch": 0.07777680474968594, + "grad_norm": 4.553186416625977, + "learning_rate": 9.071441113945115e-06, + "loss": 0.5749, + "mean_token_accuracy": 0.8125508442521095, + "num_tokens": 30109763.0, + "step": 25090 + }, + { + "entropy": 1.9090612187981606, + "epoch": 0.07780780387473564, + "grad_norm": 9.096549987792969, + "learning_rate": 9.069633801928704e-06, + "loss": 0.5595, + "mean_token_accuracy": 0.8202937185764313, + "num_tokens": 30121903.0, + "step": 25100 + }, + { + "entropy": 1.9104878604412079, + "epoch": 0.07783880299978534, + "grad_norm": 10.058632850646973, + "learning_rate": 9.067827569699626e-06, + "loss": 0.6086, + "mean_token_accuracy": 0.8223781570792198, + "num_tokens": 30134136.0, + "step": 25110 + }, + { + "entropy": 1.9066944912075996, + "epoch": 0.07786980212483503, + "grad_norm": 7.3937835693359375, + "learning_rate": 9.066022416183104e-06, + "loss": 0.6247, + "mean_token_accuracy": 0.8184296682476997, + "num_tokens": 30145673.0, + "step": 25120 + }, + { + "entropy": 1.977784439921379, + "epoch": 0.07790080124988472, + "grad_norm": 8.76796817779541, + "learning_rate": 9.064218340305854e-06, + "loss": 0.6401, + "mean_token_accuracy": 0.8132604837417603, + "num_tokens": 30156471.0, + "step": 25130 + }, + { + "entropy": 1.886358195543289, + "epoch": 0.07793180037493441, + "grad_norm": 8.512279510498047, + "learning_rate": 9.06241534099609e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.8291906744241715, + "num_tokens": 30168512.0, + "step": 25140 + }, + { + "entropy": 1.9338295564055443, + "epoch": 0.07796279949998411, + "grad_norm": 8.981732368469238, + "learning_rate": 9.060613417183516e-06, + "loss": 0.6373, + "mean_token_accuracy": 0.8155599415302277, + "num_tokens": 30180559.0, + "step": 25150 + }, + { + "entropy": 1.851930246502161, + "epoch": 0.0779937986250338, + "grad_norm": 11.297672271728516, + "learning_rate": 9.058812567799327e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8301180645823478, + "num_tokens": 30194350.0, + "step": 25160 + }, + { + "entropy": 1.932929702103138, + "epoch": 0.0780247977500835, + "grad_norm": 4.530979156494141, + "learning_rate": 9.0570127917762e-06, + "loss": 0.5372, + "mean_token_accuracy": 0.823866055905819, + "num_tokens": 30206582.0, + "step": 25170 + }, + { + "entropy": 1.9074819549918174, + "epoch": 0.0780557968751332, + "grad_norm": 4.458383560180664, + "learning_rate": 9.055214088048302e-06, + "loss": 0.5628, + "mean_token_accuracy": 0.8161739498376847, + "num_tokens": 30218782.0, + "step": 25180 + }, + { + "entropy": 1.9132570594549179, + "epoch": 0.0780867960001829, + "grad_norm": Infinity, + "learning_rate": 9.053416455551274e-06, + "loss": 0.5894, + "mean_token_accuracy": 0.8196236163377761, + "num_tokens": 30230586.0, + "step": 25190 + }, + { + "entropy": 1.8898057445883751, + "epoch": 0.07811779512523259, + "grad_norm": 10.808612823486328, + "learning_rate": 9.051619893222242e-06, + "loss": 0.5555, + "mean_token_accuracy": 0.8312712505459785, + "num_tokens": 30242839.0, + "step": 25200 + }, + { + "entropy": 1.8855190083384514, + "epoch": 0.07814879425028229, + "grad_norm": 10.371384620666504, + "learning_rate": 9.049824399999807e-06, + "loss": 0.5713, + "mean_token_accuracy": 0.8088703706860543, + "num_tokens": 30255525.0, + "step": 25210 + }, + { + "entropy": 1.8693858250975608, + "epoch": 0.07817979337533199, + "grad_norm": 11.21320629119873, + "learning_rate": 9.048029974824037e-06, + "loss": 0.5543, + "mean_token_accuracy": 0.826404581964016, + "num_tokens": 30268078.0, + "step": 25220 + }, + { + "entropy": 1.9001759216189384, + "epoch": 0.07821079250038168, + "grad_norm": 8.04379940032959, + "learning_rate": 9.046236616636477e-06, + "loss": 0.6188, + "mean_token_accuracy": 0.8241765573620796, + "num_tokens": 30280040.0, + "step": 25230 + }, + { + "entropy": 1.9273234009742737, + "epoch": 0.07824179162543138, + "grad_norm": 11.019646644592285, + "learning_rate": 9.044444324380139e-06, + "loss": 0.5832, + "mean_token_accuracy": 0.8227365925908089, + "num_tokens": 30290832.0, + "step": 25240 + }, + { + "entropy": 1.9361531496047975, + "epoch": 0.07827279075048106, + "grad_norm": 3.8891756534576416, + "learning_rate": 9.042653096999496e-06, + "loss": 0.6237, + "mean_token_accuracy": 0.8081121280789375, + "num_tokens": 30302299.0, + "step": 25250 + }, + { + "entropy": 1.96910317838192, + "epoch": 0.07830378987553076, + "grad_norm": 8.430257797241211, + "learning_rate": 9.04086293344049e-06, + "loss": 0.5955, + "mean_token_accuracy": 0.8174209460616112, + "num_tokens": 30313745.0, + "step": 25260 + }, + { + "entropy": 1.8507313832640648, + "epoch": 0.07833478900058045, + "grad_norm": 10.676153182983398, + "learning_rate": 9.039073832650518e-06, + "loss": 0.5944, + "mean_token_accuracy": 0.8205841019749641, + "num_tokens": 30326883.0, + "step": 25270 + }, + { + "entropy": 1.963607743382454, + "epoch": 0.07836578812563015, + "grad_norm": 10.588327407836914, + "learning_rate": 9.037285793578439e-06, + "loss": 0.6089, + "mean_token_accuracy": 0.8108539909124375, + "num_tokens": 30338102.0, + "step": 25280 + }, + { + "entropy": 1.9715974554419518, + "epoch": 0.07839678725067985, + "grad_norm": 9.336774826049805, + "learning_rate": 9.035498815174564e-06, + "loss": 0.6256, + "mean_token_accuracy": 0.8076756104826928, + "num_tokens": 30349489.0, + "step": 25290 + }, + { + "entropy": 1.9550383672118188, + "epoch": 0.07842778637572954, + "grad_norm": 9.92357063293457, + "learning_rate": 9.033712896390654e-06, + "loss": 0.6597, + "mean_token_accuracy": 0.803634025156498, + "num_tokens": 30361182.0, + "step": 25300 + }, + { + "entropy": 1.9395698383450508, + "epoch": 0.07845878550077924, + "grad_norm": 8.71617603302002, + "learning_rate": 9.031928036179925e-06, + "loss": 0.631, + "mean_token_accuracy": 0.8065466418862343, + "num_tokens": 30372996.0, + "step": 25310 + }, + { + "entropy": 1.847324576973915, + "epoch": 0.07848978462582894, + "grad_norm": 5.6480584144592285, + "learning_rate": 9.030144233497038e-06, + "loss": 0.5279, + "mean_token_accuracy": 0.8144370660185813, + "num_tokens": 30386024.0, + "step": 25320 + }, + { + "entropy": 1.8323638439178467, + "epoch": 0.07852078375087863, + "grad_norm": 10.176698684692383, + "learning_rate": 9.028361487298097e-06, + "loss": 0.5583, + "mean_token_accuracy": 0.8260697081685067, + "num_tokens": 30399072.0, + "step": 25330 + }, + { + "entropy": 1.9654901623725891, + "epoch": 0.07855178287592833, + "grad_norm": 10.919507026672363, + "learning_rate": 9.026579796540651e-06, + "loss": 0.6562, + "mean_token_accuracy": 0.8032170712947846, + "num_tokens": 30410445.0, + "step": 25340 + }, + { + "entropy": 1.8728495821356774, + "epoch": 0.07858278200097803, + "grad_norm": 9.558503150939941, + "learning_rate": 9.024799160183686e-06, + "loss": 0.5595, + "mean_token_accuracy": 0.82089733928442, + "num_tokens": 30422438.0, + "step": 25350 + }, + { + "entropy": 1.8767936065793038, + "epoch": 0.07861378112602772, + "grad_norm": 6.74028205871582, + "learning_rate": 9.023019577187625e-06, + "loss": 0.5351, + "mean_token_accuracy": 0.8307792738080024, + "num_tokens": 30435068.0, + "step": 25360 + }, + { + "entropy": 1.9264275386929512, + "epoch": 0.07864478025107742, + "grad_norm": 9.8104829788208, + "learning_rate": 9.021241046514326e-06, + "loss": 0.5865, + "mean_token_accuracy": 0.8222647801041603, + "num_tokens": 30446563.0, + "step": 25370 + }, + { + "entropy": 1.93300940990448, + "epoch": 0.0786757793761271, + "grad_norm": 9.438268661499023, + "learning_rate": 9.019463567127084e-06, + "loss": 0.6245, + "mean_token_accuracy": 0.813123632967472, + "num_tokens": 30457702.0, + "step": 25380 + }, + { + "entropy": 1.9117926597595214, + "epoch": 0.0787067785011768, + "grad_norm": 10.080045700073242, + "learning_rate": 9.017687137990611e-06, + "loss": 0.6594, + "mean_token_accuracy": 0.8094756618142128, + "num_tokens": 30469833.0, + "step": 25390 + }, + { + "entropy": 1.9238570496439933, + "epoch": 0.0787377776262265, + "grad_norm": 9.324179649353027, + "learning_rate": 9.01591175807106e-06, + "loss": 0.6385, + "mean_token_accuracy": 0.8032417684793473, + "num_tokens": 30481722.0, + "step": 25400 + }, + { + "entropy": 1.9335221163928509, + "epoch": 0.07876877675127619, + "grad_norm": 10.291169166564941, + "learning_rate": 9.014137426335997e-06, + "loss": 0.6627, + "mean_token_accuracy": 0.8041873052716255, + "num_tokens": 30494711.0, + "step": 25410 + }, + { + "entropy": 1.8992304280400276, + "epoch": 0.07879977587632589, + "grad_norm": 9.40573787689209, + "learning_rate": 9.012364141754415e-06, + "loss": 0.5454, + "mean_token_accuracy": 0.8209843635559082, + "num_tokens": 30506481.0, + "step": 25420 + }, + { + "entropy": 1.9901362270116807, + "epoch": 0.07883077500137559, + "grad_norm": 11.049663543701172, + "learning_rate": 9.010591903296726e-06, + "loss": 0.652, + "mean_token_accuracy": 0.8064600333571434, + "num_tokens": 30517738.0, + "step": 25430 + }, + { + "entropy": 1.9595788344740868, + "epoch": 0.07886177412642528, + "grad_norm": 9.368852615356445, + "learning_rate": 9.008820709934756e-06, + "loss": 0.6615, + "mean_token_accuracy": 0.7978012830018997, + "num_tokens": 30529230.0, + "step": 25440 + }, + { + "entropy": 1.8642796009778977, + "epoch": 0.07889277325147498, + "grad_norm": 5.745087146759033, + "learning_rate": 9.00705056064175e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8303910657763481, + "num_tokens": 30542462.0, + "step": 25450 + }, + { + "entropy": 1.890346498787403, + "epoch": 0.07892377237652468, + "grad_norm": 12.036408424377441, + "learning_rate": 9.00528145439236e-06, + "loss": 0.5508, + "mean_token_accuracy": 0.8230859890580178, + "num_tokens": 30554735.0, + "step": 25460 + }, + { + "entropy": 1.9690846800804138, + "epoch": 0.07895477150157437, + "grad_norm": 10.138855934143066, + "learning_rate": 9.00351339016265e-06, + "loss": 0.6098, + "mean_token_accuracy": 0.8199924737215042, + "num_tokens": 30566201.0, + "step": 25470 + }, + { + "entropy": 2.0111054688692094, + "epoch": 0.07898577062662407, + "grad_norm": 8.961263656616211, + "learning_rate": 9.001746366930088e-06, + "loss": 0.6215, + "mean_token_accuracy": 0.8159183651208878, + "num_tokens": 30577578.0, + "step": 25480 + }, + { + "entropy": 1.9971545487642288, + "epoch": 0.07901676975167377, + "grad_norm": 8.601164817810059, + "learning_rate": 8.99998038367355e-06, + "loss": 0.6461, + "mean_token_accuracy": 0.8124226480722427, + "num_tokens": 30589416.0, + "step": 25490 + }, + { + "entropy": 2.0272182136774064, + "epoch": 0.07904776887672345, + "grad_norm": 9.910305976867676, + "learning_rate": 8.99821543937331e-06, + "loss": 0.7675, + "mean_token_accuracy": 0.8009614288806916, + "num_tokens": 30601049.0, + "step": 25500 + }, + { + "entropy": 1.9942003041505814, + "epoch": 0.07907876800177314, + "grad_norm": 9.830111503601074, + "learning_rate": 8.996451533011044e-06, + "loss": 0.62, + "mean_token_accuracy": 0.8120215192437172, + "num_tokens": 30611769.0, + "step": 25510 + }, + { + "entropy": 1.929137869179249, + "epoch": 0.07910976712682284, + "grad_norm": 10.363052368164062, + "learning_rate": 8.994688663569825e-06, + "loss": 0.6134, + "mean_token_accuracy": 0.8164379015564919, + "num_tokens": 30623572.0, + "step": 25520 + }, + { + "entropy": 1.9560409665107727, + "epoch": 0.07914076625187254, + "grad_norm": 10.759134292602539, + "learning_rate": 8.992926830034117e-06, + "loss": 0.609, + "mean_token_accuracy": 0.8174306198954582, + "num_tokens": 30634807.0, + "step": 25530 + }, + { + "entropy": 2.0076246559619904, + "epoch": 0.07917176537692223, + "grad_norm": 10.531962394714355, + "learning_rate": 8.991166031389779e-06, + "loss": 0.7254, + "mean_token_accuracy": 0.7958659037947655, + "num_tokens": 30645607.0, + "step": 25540 + }, + { + "entropy": 1.8035003036260604, + "epoch": 0.07920276450197193, + "grad_norm": 8.862020492553711, + "learning_rate": 8.989406266624054e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8288759797811508, + "num_tokens": 30659160.0, + "step": 25550 + }, + { + "entropy": 1.9678311094641685, + "epoch": 0.07923376362702163, + "grad_norm": 9.012432098388672, + "learning_rate": 8.987647534725585e-06, + "loss": 0.6402, + "mean_token_accuracy": 0.8036282330751419, + "num_tokens": 30670411.0, + "step": 25560 + }, + { + "entropy": 1.9711192145943641, + "epoch": 0.07926476275207132, + "grad_norm": 10.310850143432617, + "learning_rate": 8.985889834684384e-06, + "loss": 0.6003, + "mean_token_accuracy": 0.8189939826726913, + "num_tokens": 30682267.0, + "step": 25570 + }, + { + "entropy": 1.792180709540844, + "epoch": 0.07929576187712102, + "grad_norm": 12.734272956848145, + "learning_rate": 8.984133165491855e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8344605669379235, + "num_tokens": 30695155.0, + "step": 25580 + }, + { + "entropy": 1.9018129274249076, + "epoch": 0.07932676100217072, + "grad_norm": 9.23828125, + "learning_rate": 8.982377526140776e-06, + "loss": 0.6423, + "mean_token_accuracy": 0.8115654811263084, + "num_tokens": 30708151.0, + "step": 25590 + }, + { + "entropy": 1.9214498043060302, + "epoch": 0.07935776012722041, + "grad_norm": 4.616427421569824, + "learning_rate": 8.98062291562531e-06, + "loss": 0.5874, + "mean_token_accuracy": 0.817217455804348, + "num_tokens": 30719730.0, + "step": 25600 + }, + { + "entropy": 1.8673983976244926, + "epoch": 0.07938875925227011, + "grad_norm": 9.74424934387207, + "learning_rate": 8.978869332940982e-06, + "loss": 0.5426, + "mean_token_accuracy": 0.8221541255712509, + "num_tokens": 30732691.0, + "step": 25610 + }, + { + "entropy": 1.8662855371832847, + "epoch": 0.0794197583773198, + "grad_norm": 10.79581069946289, + "learning_rate": 8.977116777084705e-06, + "loss": 0.5544, + "mean_token_accuracy": 0.8190583303570748, + "num_tokens": 30745114.0, + "step": 25620 + }, + { + "entropy": 2.0325272887945176, + "epoch": 0.07945075750236949, + "grad_norm": 10.473299026489258, + "learning_rate": 8.97536524705475e-06, + "loss": 0.677, + "mean_token_accuracy": 0.8067957848310471, + "num_tokens": 30756265.0, + "step": 25630 + }, + { + "entropy": 1.974963715672493, + "epoch": 0.07948175662741919, + "grad_norm": 8.578821182250977, + "learning_rate": 8.97361474185076e-06, + "loss": 0.5835, + "mean_token_accuracy": 0.8203337907791137, + "num_tokens": 30767729.0, + "step": 25640 + }, + { + "entropy": 1.9518730476498605, + "epoch": 0.07951275575246888, + "grad_norm": 9.881929397583008, + "learning_rate": 8.971865260473745e-06, + "loss": 0.6164, + "mean_token_accuracy": 0.8127556294202805, + "num_tokens": 30779093.0, + "step": 25650 + }, + { + "entropy": 1.9101403415203095, + "epoch": 0.07954375487751858, + "grad_norm": 9.09304428100586, + "learning_rate": 8.970116801926072e-06, + "loss": 0.5419, + "mean_token_accuracy": 0.833724494278431, + "num_tokens": 30790837.0, + "step": 25660 + }, + { + "entropy": 1.8745130628347397, + "epoch": 0.07957475400256828, + "grad_norm": 9.2150239944458, + "learning_rate": 8.968369365211478e-06, + "loss": 0.5464, + "mean_token_accuracy": 0.8339875742793084, + "num_tokens": 30802641.0, + "step": 25670 + }, + { + "entropy": 1.94149309694767, + "epoch": 0.07960575312761797, + "grad_norm": 9.093975067138672, + "learning_rate": 8.966622949335044e-06, + "loss": 0.6163, + "mean_token_accuracy": 0.8239960089325905, + "num_tokens": 30813720.0, + "step": 25680 + }, + { + "entropy": 1.8855563715100288, + "epoch": 0.07963675225266767, + "grad_norm": 4.550987243652344, + "learning_rate": 8.964877553303222e-06, + "loss": 0.5868, + "mean_token_accuracy": 0.8233138158917427, + "num_tokens": 30825831.0, + "step": 25690 + }, + { + "entropy": 1.9300659999251366, + "epoch": 0.07966775137771737, + "grad_norm": 10.610151290893555, + "learning_rate": 8.963133176123809e-06, + "loss": 0.5674, + "mean_token_accuracy": 0.8244445592164993, + "num_tokens": 30837291.0, + "step": 25700 + }, + { + "entropy": 1.9518328681588173, + "epoch": 0.07969875050276706, + "grad_norm": 10.140637397766113, + "learning_rate": 8.96138981680595e-06, + "loss": 0.6133, + "mean_token_accuracy": 0.8174855709075928, + "num_tokens": 30848725.0, + "step": 25710 + }, + { + "entropy": 1.9481216147542, + "epoch": 0.07972974962781676, + "grad_norm": 5.342643737792969, + "learning_rate": 8.959647474360146e-06, + "loss": 0.6286, + "mean_token_accuracy": 0.8104046359658241, + "num_tokens": 30860275.0, + "step": 25720 + }, + { + "entropy": 1.9668898478150367, + "epoch": 0.07976074875286646, + "grad_norm": 9.690641403198242, + "learning_rate": 8.95790614779824e-06, + "loss": 0.644, + "mean_token_accuracy": 0.8048096477985383, + "num_tokens": 30872020.0, + "step": 25730 + }, + { + "entropy": 1.895005388557911, + "epoch": 0.07979174787791615, + "grad_norm": 13.030941009521484, + "learning_rate": 8.956165836133419e-06, + "loss": 0.6495, + "mean_token_accuracy": 0.8069834470748901, + "num_tokens": 30884853.0, + "step": 25740 + }, + { + "entropy": 1.9691629260778427, + "epoch": 0.07982274700296584, + "grad_norm": 11.10064697265625, + "learning_rate": 8.954426538380212e-06, + "loss": 0.6367, + "mean_token_accuracy": 0.8148859232664108, + "num_tokens": 30895623.0, + "step": 25750 + }, + { + "entropy": 1.9379816070199012, + "epoch": 0.07985374612801553, + "grad_norm": 7.397519588470459, + "learning_rate": 8.952688253554488e-06, + "loss": 0.6139, + "mean_token_accuracy": 0.820967635512352, + "num_tokens": 30907044.0, + "step": 25760 + }, + { + "entropy": 1.8707928493618966, + "epoch": 0.07988474525306523, + "grad_norm": 8.89204216003418, + "learning_rate": 8.950950980673451e-06, + "loss": 0.5938, + "mean_token_accuracy": 0.8097473427653312, + "num_tokens": 30919270.0, + "step": 25770 + }, + { + "entropy": 1.8771313697099685, + "epoch": 0.07991574437811493, + "grad_norm": 9.017126083374023, + "learning_rate": 8.949214718755644e-06, + "loss": 0.6379, + "mean_token_accuracy": 0.8157517641782761, + "num_tokens": 30931930.0, + "step": 25780 + }, + { + "entropy": 1.9514517337083817, + "epoch": 0.07994674350316462, + "grad_norm": 9.412887573242188, + "learning_rate": 8.947479466820933e-06, + "loss": 0.6404, + "mean_token_accuracy": 0.807883444428444, + "num_tokens": 30943436.0, + "step": 25790 + }, + { + "entropy": 1.9551150798797607, + "epoch": 0.07997774262821432, + "grad_norm": 10.580997467041016, + "learning_rate": 8.945745223890525e-06, + "loss": 0.635, + "mean_token_accuracy": 0.8118805930018425, + "num_tokens": 30955093.0, + "step": 25800 + }, + { + "entropy": 1.9215492516756059, + "epoch": 0.08000874175326401, + "grad_norm": 4.973093509674072, + "learning_rate": 8.944011988986943e-06, + "loss": 0.6226, + "mean_token_accuracy": 0.8189863070845604, + "num_tokens": 30966468.0, + "step": 25810 + }, + { + "entropy": 1.9604627519845963, + "epoch": 0.08003974087831371, + "grad_norm": 11.397422790527344, + "learning_rate": 8.942279761134045e-06, + "loss": 0.6099, + "mean_token_accuracy": 0.8068236276507378, + "num_tokens": 30978392.0, + "step": 25820 + }, + { + "entropy": 1.8480182617902756, + "epoch": 0.08007074000336341, + "grad_norm": 4.0751261711120605, + "learning_rate": 8.940548539357008e-06, + "loss": 0.5905, + "mean_token_accuracy": 0.81248699426651, + "num_tokens": 30992369.0, + "step": 25830 + }, + { + "entropy": 1.8566480681300164, + "epoch": 0.0801017391284131, + "grad_norm": 6.9857916831970215, + "learning_rate": 8.938818322682328e-06, + "loss": 0.5375, + "mean_token_accuracy": 0.8251223266124725, + "num_tokens": 31005133.0, + "step": 25840 + }, + { + "entropy": 1.951548993587494, + "epoch": 0.0801327382534628, + "grad_norm": 9.003771781921387, + "learning_rate": 8.937089110137822e-06, + "loss": 0.6049, + "mean_token_accuracy": 0.812137958407402, + "num_tokens": 31017000.0, + "step": 25850 + }, + { + "entropy": 1.9318700328469276, + "epoch": 0.0801637373785125, + "grad_norm": 10.689835548400879, + "learning_rate": 8.935360900752618e-06, + "loss": 0.5753, + "mean_token_accuracy": 0.8234897822141647, + "num_tokens": 31028427.0, + "step": 25860 + }, + { + "entropy": 1.9848022490739823, + "epoch": 0.08019473650356218, + "grad_norm": 9.83168888092041, + "learning_rate": 8.933633693557168e-06, + "loss": 0.6557, + "mean_token_accuracy": 0.8102818682789803, + "num_tokens": 31039440.0, + "step": 25870 + }, + { + "entropy": 1.8783807411789895, + "epoch": 0.08022573562861188, + "grad_norm": 10.153984069824219, + "learning_rate": 8.931907487583224e-06, + "loss": 0.5498, + "mean_token_accuracy": 0.8286775290966034, + "num_tokens": 31051086.0, + "step": 25880 + }, + { + "entropy": 1.9235938981175422, + "epoch": 0.08025673475366157, + "grad_norm": 10.353632926940918, + "learning_rate": 8.930182281863854e-06, + "loss": 0.6119, + "mean_token_accuracy": 0.8167414903640747, + "num_tokens": 31062372.0, + "step": 25890 + }, + { + "entropy": 1.9412924006581307, + "epoch": 0.08028773387871127, + "grad_norm": 9.35558032989502, + "learning_rate": 8.928458075433428e-06, + "loss": 0.6887, + "mean_token_accuracy": 0.7980827406048775, + "num_tokens": 31074103.0, + "step": 25900 + }, + { + "entropy": 1.9117850810289383, + "epoch": 0.08031873300376097, + "grad_norm": 8.50671100616455, + "learning_rate": 8.926734867327626e-06, + "loss": 0.5881, + "mean_token_accuracy": 0.80945935100317, + "num_tokens": 31086274.0, + "step": 25910 + }, + { + "entropy": 1.9611097499728203, + "epoch": 0.08034973212881066, + "grad_norm": 8.947006225585938, + "learning_rate": 8.925012656583428e-06, + "loss": 0.6327, + "mean_token_accuracy": 0.8106518238782883, + "num_tokens": 31097609.0, + "step": 25920 + }, + { + "entropy": 1.941481387615204, + "epoch": 0.08038073125386036, + "grad_norm": 4.701364040374756, + "learning_rate": 8.923291442239114e-06, + "loss": 0.5451, + "mean_token_accuracy": 0.8272410377860069, + "num_tokens": 31109738.0, + "step": 25930 + }, + { + "entropy": 1.9850254267454148, + "epoch": 0.08041173037891006, + "grad_norm": 10.166258811950684, + "learning_rate": 8.921571223334262e-06, + "loss": 0.6038, + "mean_token_accuracy": 0.8136166602373123, + "num_tokens": 31120962.0, + "step": 25940 + }, + { + "entropy": 1.8603035241365433, + "epoch": 0.08044272950395975, + "grad_norm": 9.798973083496094, + "learning_rate": 8.919851998909738e-06, + "loss": 0.5282, + "mean_token_accuracy": 0.833336153626442, + "num_tokens": 31133382.0, + "step": 25950 + }, + { + "entropy": 1.9528677210211753, + "epoch": 0.08047372862900945, + "grad_norm": 10.009119033813477, + "learning_rate": 8.918133768007718e-06, + "loss": 0.6604, + "mean_token_accuracy": 0.806072898209095, + "num_tokens": 31144362.0, + "step": 25960 + }, + { + "entropy": 1.8188411056995393, + "epoch": 0.08050472775405915, + "grad_norm": 8.478353500366211, + "learning_rate": 8.916416529671652e-06, + "loss": 0.5355, + "mean_token_accuracy": 0.8259258419275284, + "num_tokens": 31157130.0, + "step": 25970 + }, + { + "entropy": 1.855710941553116, + "epoch": 0.08053572687910884, + "grad_norm": 4.531567096710205, + "learning_rate": 8.914700282946292e-06, + "loss": 0.5377, + "mean_token_accuracy": 0.826113897562027, + "num_tokens": 31169756.0, + "step": 25980 + }, + { + "entropy": 1.8293490082025528, + "epoch": 0.08056672600415853, + "grad_norm": 4.700442790985107, + "learning_rate": 8.912985026877668e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8252740427851677, + "num_tokens": 31182565.0, + "step": 25990 + }, + { + "entropy": 1.8819095849990846, + "epoch": 0.08059772512920822, + "grad_norm": 10.227608680725098, + "learning_rate": 8.911270760513097e-06, + "loss": 0.5533, + "mean_token_accuracy": 0.8177149787545204, + "num_tokens": 31194457.0, + "step": 26000 + }, + { + "entropy": 1.9495192840695381, + "epoch": 0.08062872425425792, + "grad_norm": 10.328841209411621, + "learning_rate": 8.909557482901179e-06, + "loss": 0.5876, + "mean_token_accuracy": 0.8159580215811729, + "num_tokens": 31205886.0, + "step": 26010 + }, + { + "entropy": 1.9724682122468948, + "epoch": 0.08065972337930762, + "grad_norm": 10.351836204528809, + "learning_rate": 8.907845193091793e-06, + "loss": 0.6892, + "mean_token_accuracy": 0.7933256924152374, + "num_tokens": 31216798.0, + "step": 26020 + }, + { + "entropy": 1.8941216632723807, + "epoch": 0.08069072250435731, + "grad_norm": 11.160825729370117, + "learning_rate": 8.906133890136095e-06, + "loss": 0.5391, + "mean_token_accuracy": 0.8280467942357064, + "num_tokens": 31229398.0, + "step": 26030 + }, + { + "entropy": 1.9749949797987938, + "epoch": 0.08072172162940701, + "grad_norm": 7.923883438110352, + "learning_rate": 8.90442357308652e-06, + "loss": 0.6518, + "mean_token_accuracy": 0.8020041942596435, + "num_tokens": 31240647.0, + "step": 26040 + }, + { + "entropy": 1.9157247439026832, + "epoch": 0.0807527207544567, + "grad_norm": 8.247703552246094, + "learning_rate": 8.902714240996773e-06, + "loss": 0.5723, + "mean_token_accuracy": 0.8275446742773056, + "num_tokens": 31253058.0, + "step": 26050 + }, + { + "entropy": 1.9141668871045112, + "epoch": 0.0807837198795064, + "grad_norm": 4.465513706207275, + "learning_rate": 8.901005892921827e-06, + "loss": 0.5543, + "mean_token_accuracy": 0.81632010191679, + "num_tokens": 31265597.0, + "step": 26060 + }, + { + "entropy": 1.998997524380684, + "epoch": 0.0808147190045561, + "grad_norm": 9.465398788452148, + "learning_rate": 8.899298527917932e-06, + "loss": 0.625, + "mean_token_accuracy": 0.8183861076831818, + "num_tokens": 31276394.0, + "step": 26070 + }, + { + "entropy": 1.9701352685689926, + "epoch": 0.0808457181296058, + "grad_norm": 9.879446029663086, + "learning_rate": 8.897592145042599e-06, + "loss": 0.6342, + "mean_token_accuracy": 0.8082750916481019, + "num_tokens": 31287446.0, + "step": 26080 + }, + { + "entropy": 1.8816271275281906, + "epoch": 0.08087671725465549, + "grad_norm": 10.568394660949707, + "learning_rate": 8.895886743354607e-06, + "loss": 0.538, + "mean_token_accuracy": 0.8256219759583473, + "num_tokens": 31299916.0, + "step": 26090 + }, + { + "entropy": 1.9000601902604104, + "epoch": 0.08090771637970519, + "grad_norm": 9.578839302062988, + "learning_rate": 8.89418232191399e-06, + "loss": 0.57, + "mean_token_accuracy": 0.8156951382756233, + "num_tokens": 31313478.0, + "step": 26100 + }, + { + "entropy": 1.9269771069288253, + "epoch": 0.08093871550475489, + "grad_norm": 5.241694927215576, + "learning_rate": 8.892478879782055e-06, + "loss": 0.5895, + "mean_token_accuracy": 0.822375500202179, + "num_tokens": 31325006.0, + "step": 26110 + }, + { + "entropy": 1.915390558540821, + "epoch": 0.08096971462980457, + "grad_norm": 10.392666816711426, + "learning_rate": 8.89077641602135e-06, + "loss": 0.5677, + "mean_token_accuracy": 0.8208850100636482, + "num_tokens": 31336358.0, + "step": 26120 + }, + { + "entropy": 1.8701285541057586, + "epoch": 0.08100071375485426, + "grad_norm": 8.39531135559082, + "learning_rate": 8.889074929695697e-06, + "loss": 0.5223, + "mean_token_accuracy": 0.8237580001354218, + "num_tokens": 31349262.0, + "step": 26130 + }, + { + "entropy": 1.9644879460334779, + "epoch": 0.08103171287990396, + "grad_norm": 8.174306869506836, + "learning_rate": 8.887374419870162e-06, + "loss": 0.6391, + "mean_token_accuracy": 0.8160557880997658, + "num_tokens": 31360978.0, + "step": 26140 + }, + { + "entropy": 1.870239832997322, + "epoch": 0.08106271200495366, + "grad_norm": 4.718651294708252, + "learning_rate": 8.885674885611059e-06, + "loss": 0.5204, + "mean_token_accuracy": 0.8182685613632202, + "num_tokens": 31374020.0, + "step": 26150 + }, + { + "entropy": 1.8591389670968055, + "epoch": 0.08109371113000335, + "grad_norm": 8.56635570526123, + "learning_rate": 8.883976325985959e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.8392164379358291, + "num_tokens": 31386295.0, + "step": 26160 + }, + { + "entropy": 2.010419914126396, + "epoch": 0.08112471025505305, + "grad_norm": 10.94079303741455, + "learning_rate": 8.882278740063671e-06, + "loss": 0.7017, + "mean_token_accuracy": 0.804059025645256, + "num_tokens": 31397492.0, + "step": 26170 + }, + { + "entropy": 1.9494641929864884, + "epoch": 0.08115570938010275, + "grad_norm": 3.204777717590332, + "learning_rate": 8.880582126914265e-06, + "loss": 0.6059, + "mean_token_accuracy": 0.8109059870243073, + "num_tokens": 31408976.0, + "step": 26180 + }, + { + "entropy": 1.9177653357386588, + "epoch": 0.08118670850515244, + "grad_norm": 9.458409309387207, + "learning_rate": 8.878886485609038e-06, + "loss": 0.5732, + "mean_token_accuracy": 0.8205662295222282, + "num_tokens": 31421691.0, + "step": 26190 + }, + { + "entropy": 1.941348561644554, + "epoch": 0.08121770763020214, + "grad_norm": 10.314962387084961, + "learning_rate": 8.877191815220537e-06, + "loss": 0.609, + "mean_token_accuracy": 0.8177227556705475, + "num_tokens": 31433019.0, + "step": 26200 + }, + { + "entropy": 1.8470569260418415, + "epoch": 0.08124870675525184, + "grad_norm": 10.691038131713867, + "learning_rate": 8.87549811482254e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8288433074951171, + "num_tokens": 31446595.0, + "step": 26210 + }, + { + "entropy": 1.9533916339278221, + "epoch": 0.08127970588030153, + "grad_norm": 4.70416784286499, + "learning_rate": 8.873805383490072e-06, + "loss": 0.5795, + "mean_token_accuracy": 0.8235921949148178, + "num_tokens": 31458289.0, + "step": 26220 + }, + { + "entropy": 1.9918930500745773, + "epoch": 0.08131070500535123, + "grad_norm": 10.034157752990723, + "learning_rate": 8.872113620299381e-06, + "loss": 0.6885, + "mean_token_accuracy": 0.8030604913830757, + "num_tokens": 31469163.0, + "step": 26230 + }, + { + "entropy": 1.8951246917247773, + "epoch": 0.08134170413040091, + "grad_norm": 8.459050178527832, + "learning_rate": 8.870422824327956e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8330509856343269, + "num_tokens": 31482597.0, + "step": 26240 + }, + { + "entropy": 1.901189935207367, + "epoch": 0.08137270325545061, + "grad_norm": 4.526102542877197, + "learning_rate": 8.868732994654511e-06, + "loss": 0.5578, + "mean_token_accuracy": 0.8234093725681305, + "num_tokens": 31495707.0, + "step": 26250 + }, + { + "entropy": 1.8694349378347397, + "epoch": 0.0814037023805003, + "grad_norm": 8.740363121032715, + "learning_rate": 8.86704413035899e-06, + "loss": 0.5515, + "mean_token_accuracy": 0.8234608098864555, + "num_tokens": 31507761.0, + "step": 26260 + }, + { + "entropy": 1.90776207447052, + "epoch": 0.08143470150555, + "grad_norm": 8.299092292785645, + "learning_rate": 8.865356230522566e-06, + "loss": 0.5869, + "mean_token_accuracy": 0.8166111215949059, + "num_tokens": 31519733.0, + "step": 26270 + }, + { + "entropy": 1.988849925994873, + "epoch": 0.0814657006305997, + "grad_norm": 10.71021842956543, + "learning_rate": 8.86366929422763e-06, + "loss": 0.6454, + "mean_token_accuracy": 0.8144510120153428, + "num_tokens": 31530442.0, + "step": 26280 + }, + { + "entropy": 1.9403239041566849, + "epoch": 0.0814966997556494, + "grad_norm": 10.783156394958496, + "learning_rate": 8.861983320557797e-06, + "loss": 0.619, + "mean_token_accuracy": 0.8142760202288628, + "num_tokens": 31541855.0, + "step": 26290 + }, + { + "entropy": 1.9194020926952362, + "epoch": 0.08152769888069909, + "grad_norm": 5.271920204162598, + "learning_rate": 8.860298308597903e-06, + "loss": 0.6006, + "mean_token_accuracy": 0.8099767789244652, + "num_tokens": 31553683.0, + "step": 26300 + }, + { + "entropy": 1.9054380610585213, + "epoch": 0.08155869800574879, + "grad_norm": 12.495100975036621, + "learning_rate": 8.858614257434004e-06, + "loss": 0.5515, + "mean_token_accuracy": 0.8234122693538666, + "num_tokens": 31566209.0, + "step": 26310 + }, + { + "entropy": 1.9283027410507203, + "epoch": 0.08158969713079849, + "grad_norm": 10.337760925292969, + "learning_rate": 8.856931166153366e-06, + "loss": 0.557, + "mean_token_accuracy": 0.827317263185978, + "num_tokens": 31578676.0, + "step": 26320 + }, + { + "entropy": 1.9200524538755417, + "epoch": 0.08162069625584818, + "grad_norm": 9.985169410705566, + "learning_rate": 8.855249033844469e-06, + "loss": 0.5736, + "mean_token_accuracy": 0.8291199162602425, + "num_tokens": 31591513.0, + "step": 26330 + }, + { + "entropy": 1.8813577458262443, + "epoch": 0.08165169538089788, + "grad_norm": 10.70927906036377, + "learning_rate": 8.85356785959701e-06, + "loss": 0.5232, + "mean_token_accuracy": 0.8120563924312592, + "num_tokens": 31604610.0, + "step": 26340 + }, + { + "entropy": 1.9429839253425598, + "epoch": 0.08168269450594758, + "grad_norm": 7.395925998687744, + "learning_rate": 8.851887642501889e-06, + "loss": 0.5473, + "mean_token_accuracy": 0.8196773245930672, + "num_tokens": 31616846.0, + "step": 26350 + }, + { + "entropy": 1.8979680463671684, + "epoch": 0.08171369363099726, + "grad_norm": 10.052593231201172, + "learning_rate": 8.850208381651215e-06, + "loss": 0.5895, + "mean_token_accuracy": 0.8154531285166741, + "num_tokens": 31629253.0, + "step": 26360 + }, + { + "entropy": 1.894950045645237, + "epoch": 0.08174469275604695, + "grad_norm": 10.194618225097656, + "learning_rate": 8.848530076138306e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.827742937207222, + "num_tokens": 31642045.0, + "step": 26370 + }, + { + "entropy": 1.9756985664367677, + "epoch": 0.08177569188109665, + "grad_norm": 9.307488441467285, + "learning_rate": 8.846852725057679e-06, + "loss": 0.6164, + "mean_token_accuracy": 0.8118034496903419, + "num_tokens": 31653833.0, + "step": 26380 + }, + { + "entropy": 1.977025419473648, + "epoch": 0.08180669100614635, + "grad_norm": 8.45008659362793, + "learning_rate": 8.845176327505053e-06, + "loss": 0.5422, + "mean_token_accuracy": 0.8323731452226639, + "num_tokens": 31664531.0, + "step": 26390 + }, + { + "entropy": 1.8568702943623066, + "epoch": 0.08183769013119604, + "grad_norm": 9.432966232299805, + "learning_rate": 8.843500882577342e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.83240677267313, + "num_tokens": 31677924.0, + "step": 26400 + }, + { + "entropy": 1.9465472564101218, + "epoch": 0.08186868925624574, + "grad_norm": 2.6528499126434326, + "learning_rate": 8.841826389372667e-06, + "loss": 0.57, + "mean_token_accuracy": 0.819884067773819, + "num_tokens": 31689433.0, + "step": 26410 + }, + { + "entropy": 1.9157054662704467, + "epoch": 0.08189968838129544, + "grad_norm": 4.077538013458252, + "learning_rate": 8.840152846990336e-06, + "loss": 0.5505, + "mean_token_accuracy": 0.8310910239815712, + "num_tokens": 31701899.0, + "step": 26420 + }, + { + "entropy": 1.9014649242162704, + "epoch": 0.08193068750634513, + "grad_norm": 9.67369556427002, + "learning_rate": 8.838480254530852e-06, + "loss": 0.5669, + "mean_token_accuracy": 0.8307635292410851, + "num_tokens": 31713964.0, + "step": 26430 + }, + { + "entropy": 1.944985429942608, + "epoch": 0.08196168663139483, + "grad_norm": 10.318603515625, + "learning_rate": 8.836808611095908e-06, + "loss": 0.589, + "mean_token_accuracy": 0.8197949156165123, + "num_tokens": 31725749.0, + "step": 26440 + }, + { + "entropy": 1.9587625324726106, + "epoch": 0.08199268575644453, + "grad_norm": 9.986980438232422, + "learning_rate": 8.835137915788388e-06, + "loss": 0.6011, + "mean_token_accuracy": 0.816116102039814, + "num_tokens": 31738281.0, + "step": 26450 + }, + { + "entropy": 1.9987529665231705, + "epoch": 0.08202368488149422, + "grad_norm": 9.245450973510742, + "learning_rate": 8.83346816771236e-06, + "loss": 0.637, + "mean_token_accuracy": 0.8139676377177238, + "num_tokens": 31749103.0, + "step": 26460 + }, + { + "entropy": 1.9783283829689027, + "epoch": 0.08205468400654392, + "grad_norm": 13.349959373474121, + "learning_rate": 8.831799365973078e-06, + "loss": 0.6254, + "mean_token_accuracy": 0.8167249724268913, + "num_tokens": 31760327.0, + "step": 26470 + }, + { + "entropy": 1.9021371573209762, + "epoch": 0.08208568313159362, + "grad_norm": 4.939640998840332, + "learning_rate": 8.83013150967698e-06, + "loss": 0.5441, + "mean_token_accuracy": 0.8197105556726456, + "num_tokens": 31772545.0, + "step": 26480 + }, + { + "entropy": 1.8698833830654622, + "epoch": 0.0821166822566433, + "grad_norm": 9.22522258758545, + "learning_rate": 8.828464597931686e-06, + "loss": 0.5702, + "mean_token_accuracy": 0.8225023493170738, + "num_tokens": 31785337.0, + "step": 26490 + }, + { + "entropy": 1.8712293177843093, + "epoch": 0.082147681381693, + "grad_norm": 5.361429214477539, + "learning_rate": 8.82679862984599e-06, + "loss": 0.5483, + "mean_token_accuracy": 0.8150455951690674, + "num_tokens": 31797319.0, + "step": 26500 + }, + { + "entropy": 1.9659972831606864, + "epoch": 0.0821786805067427, + "grad_norm": 9.854397773742676, + "learning_rate": 8.825133604529864e-06, + "loss": 0.5934, + "mean_token_accuracy": 0.8131707355380058, + "num_tokens": 31808978.0, + "step": 26510 + }, + { + "entropy": 1.9501794785261155, + "epoch": 0.08220967963179239, + "grad_norm": 8.959293365478516, + "learning_rate": 8.823469521094459e-06, + "loss": 0.6272, + "mean_token_accuracy": 0.8212861150503159, + "num_tokens": 31821515.0, + "step": 26520 + }, + { + "entropy": 1.9314236760139465, + "epoch": 0.08224067875684209, + "grad_norm": 10.00307846069336, + "learning_rate": 8.821806378652095e-06, + "loss": 0.5826, + "mean_token_accuracy": 0.8202818527817726, + "num_tokens": 31832945.0, + "step": 26530 + }, + { + "entropy": 1.9181413248181343, + "epoch": 0.08227167788189178, + "grad_norm": 12.737542152404785, + "learning_rate": 8.820144176316263e-06, + "loss": 0.6233, + "mean_token_accuracy": 0.808757683634758, + "num_tokens": 31844982.0, + "step": 26540 + }, + { + "entropy": 1.9316234186291694, + "epoch": 0.08230267700694148, + "grad_norm": 4.668799877166748, + "learning_rate": 8.818482913201624e-06, + "loss": 0.5973, + "mean_token_accuracy": 0.81618000715971, + "num_tokens": 31857158.0, + "step": 26550 + }, + { + "entropy": 1.9605771273374557, + "epoch": 0.08233367613199118, + "grad_norm": 10.899618148803711, + "learning_rate": 8.816822588424007e-06, + "loss": 0.5998, + "mean_token_accuracy": 0.8206842705607414, + "num_tokens": 31868956.0, + "step": 26560 + }, + { + "entropy": 1.9107506170868873, + "epoch": 0.08236467525704087, + "grad_norm": 10.741750717163086, + "learning_rate": 8.815163201100404e-06, + "loss": 0.564, + "mean_token_accuracy": 0.8113860443234444, + "num_tokens": 31881384.0, + "step": 26570 + }, + { + "entropy": 1.940708489716053, + "epoch": 0.08239567438209057, + "grad_norm": 10.441813468933105, + "learning_rate": 8.813504750348967e-06, + "loss": 0.584, + "mean_token_accuracy": 0.8169033840298653, + "num_tokens": 31892982.0, + "step": 26580 + }, + { + "entropy": 1.9362832650542259, + "epoch": 0.08242667350714027, + "grad_norm": 9.666556358337402, + "learning_rate": 8.811847235289013e-06, + "loss": 0.6264, + "mean_token_accuracy": 0.8026597559452057, + "num_tokens": 31905412.0, + "step": 26590 + }, + { + "entropy": 1.9204092472791672, + "epoch": 0.08245767263218996, + "grad_norm": 8.493167877197266, + "learning_rate": 8.81019065504102e-06, + "loss": 0.6364, + "mean_token_accuracy": 0.8150411576032639, + "num_tokens": 31917332.0, + "step": 26600 + }, + { + "entropy": 1.9749287247657776, + "epoch": 0.08248867175723965, + "grad_norm": 9.949286460876465, + "learning_rate": 8.808535008726616e-06, + "loss": 0.6233, + "mean_token_accuracy": 0.8155123621225357, + "num_tokens": 31928762.0, + "step": 26610 + }, + { + "entropy": 2.0281356394290926, + "epoch": 0.08251967088228934, + "grad_norm": 8.458736419677734, + "learning_rate": 8.806880295468594e-06, + "loss": 0.6129, + "mean_token_accuracy": 0.817957803606987, + "num_tokens": 31939691.0, + "step": 26620 + }, + { + "entropy": 1.9947254791855813, + "epoch": 0.08255067000733904, + "grad_norm": 10.101909637451172, + "learning_rate": 8.805226514390884e-06, + "loss": 0.5895, + "mean_token_accuracy": 0.8237784549593925, + "num_tokens": 31950789.0, + "step": 26630 + }, + { + "entropy": 1.9266445934772491, + "epoch": 0.08258166913238874, + "grad_norm": 9.334656715393066, + "learning_rate": 8.803573664618587e-06, + "loss": 0.5351, + "mean_token_accuracy": 0.8326213672757149, + "num_tokens": 31962662.0, + "step": 26640 + }, + { + "entropy": 1.9811443358659744, + "epoch": 0.08261266825743843, + "grad_norm": 9.728004455566406, + "learning_rate": 8.801921745277938e-06, + "loss": 0.6228, + "mean_token_accuracy": 0.8104926988482475, + "num_tokens": 31973878.0, + "step": 26650 + }, + { + "entropy": 1.9071193888783455, + "epoch": 0.08264366738248813, + "grad_norm": 9.647929191589355, + "learning_rate": 8.800270755496327e-06, + "loss": 0.5784, + "mean_token_accuracy": 0.8240789666771888, + "num_tokens": 31985601.0, + "step": 26660 + }, + { + "entropy": 1.8988082259893417, + "epoch": 0.08267466650753783, + "grad_norm": 5.0942888259887695, + "learning_rate": 8.798620694402286e-06, + "loss": 0.5738, + "mean_token_accuracy": 0.819773106276989, + "num_tokens": 31999208.0, + "step": 26670 + }, + { + "entropy": 1.9712405875325203, + "epoch": 0.08270566563258752, + "grad_norm": 8.369913101196289, + "learning_rate": 8.796971561125492e-06, + "loss": 0.6086, + "mean_token_accuracy": 0.8169108390808105, + "num_tokens": 32010782.0, + "step": 26680 + }, + { + "entropy": 1.868271279335022, + "epoch": 0.08273666475763722, + "grad_norm": 8.779671669006348, + "learning_rate": 8.795323354796762e-06, + "loss": 0.5195, + "mean_token_accuracy": 0.8314287766814232, + "num_tokens": 32023787.0, + "step": 26690 + }, + { + "entropy": 1.9078305020928383, + "epoch": 0.08276766388268691, + "grad_norm": 9.038331985473633, + "learning_rate": 8.79367607454805e-06, + "loss": 0.5642, + "mean_token_accuracy": 0.8266749441623688, + "num_tokens": 32036579.0, + "step": 26700 + }, + { + "entropy": 1.9659512534737587, + "epoch": 0.08279866300773661, + "grad_norm": 10.663753509521484, + "learning_rate": 8.792029719512458e-06, + "loss": 0.5953, + "mean_token_accuracy": 0.8178058177232742, + "num_tokens": 32048734.0, + "step": 26710 + }, + { + "entropy": 1.8505152672529221, + "epoch": 0.08282966213278631, + "grad_norm": 3.9875552654266357, + "learning_rate": 8.79038428882421e-06, + "loss": 0.5194, + "mean_token_accuracy": 0.8262492626905441, + "num_tokens": 32061882.0, + "step": 26720 + }, + { + "entropy": 1.913353630900383, + "epoch": 0.08286066125783599, + "grad_norm": 10.27352523803711, + "learning_rate": 8.788739781618678e-06, + "loss": 0.5767, + "mean_token_accuracy": 0.8218828424811363, + "num_tokens": 32074037.0, + "step": 26730 + }, + { + "entropy": 1.9119721353054047, + "epoch": 0.08289166038288569, + "grad_norm": 9.0624361038208, + "learning_rate": 8.78709619703235e-06, + "loss": 0.5735, + "mean_token_accuracy": 0.8198665246367455, + "num_tokens": 32086164.0, + "step": 26740 + }, + { + "entropy": 1.8503568902611733, + "epoch": 0.08292265950793538, + "grad_norm": 5.034000396728516, + "learning_rate": 8.785453534202857e-06, + "loss": 0.5473, + "mean_token_accuracy": 0.8217618718743325, + "num_tokens": 32099236.0, + "step": 26750 + }, + { + "entropy": 1.9170492082834243, + "epoch": 0.08295365863298508, + "grad_norm": 11.386276245117188, + "learning_rate": 8.78381179226895e-06, + "loss": 0.5962, + "mean_token_accuracy": 0.8131098374724388, + "num_tokens": 32111036.0, + "step": 26760 + }, + { + "entropy": 2.001404981315136, + "epoch": 0.08298465775803478, + "grad_norm": 9.401607513427734, + "learning_rate": 8.782170970370514e-06, + "loss": 0.6513, + "mean_token_accuracy": 0.8121735200285911, + "num_tokens": 32122745.0, + "step": 26770 + }, + { + "entropy": 1.9728690326213836, + "epoch": 0.08301565688308447, + "grad_norm": 9.115731239318848, + "learning_rate": 8.78053106764855e-06, + "loss": 0.6129, + "mean_token_accuracy": 0.8155710816383361, + "num_tokens": 32133822.0, + "step": 26780 + }, + { + "entropy": 1.952053852379322, + "epoch": 0.08304665600813417, + "grad_norm": 9.419805526733398, + "learning_rate": 8.778892083245187e-06, + "loss": 0.6176, + "mean_token_accuracy": 0.814921198785305, + "num_tokens": 32144770.0, + "step": 26790 + }, + { + "entropy": 1.9340526655316352, + "epoch": 0.08307765513318387, + "grad_norm": 4.676142692565918, + "learning_rate": 8.77725401630367e-06, + "loss": 0.6304, + "mean_token_accuracy": 0.7994276747107506, + "num_tokens": 32156515.0, + "step": 26800 + }, + { + "entropy": 2.002475252747536, + "epoch": 0.08310865425823356, + "grad_norm": 5.354742527008057, + "learning_rate": 8.775616865968369e-06, + "loss": 0.7259, + "mean_token_accuracy": 0.7925671577453614, + "num_tokens": 32167752.0, + "step": 26810 + }, + { + "entropy": 1.8802199259400367, + "epoch": 0.08313965338328326, + "grad_norm": 8.681201934814453, + "learning_rate": 8.773980631384764e-06, + "loss": 0.5398, + "mean_token_accuracy": 0.8213910296559334, + "num_tokens": 32181313.0, + "step": 26820 + }, + { + "entropy": 1.873046538233757, + "epoch": 0.08317065250833296, + "grad_norm": 8.961077690124512, + "learning_rate": 8.772345311699455e-06, + "loss": 0.558, + "mean_token_accuracy": 0.8188267648220062, + "num_tokens": 32194510.0, + "step": 26830 + }, + { + "entropy": 1.8043782696127892, + "epoch": 0.08320165163338265, + "grad_norm": 10.20888614654541, + "learning_rate": 8.770710906060152e-06, + "loss": 0.5254, + "mean_token_accuracy": 0.8352167114615441, + "num_tokens": 32207477.0, + "step": 26840 + }, + { + "entropy": 1.962397077679634, + "epoch": 0.08323265075843235, + "grad_norm": 8.981500625610352, + "learning_rate": 8.769077413615676e-06, + "loss": 0.6166, + "mean_token_accuracy": 0.8194408491253853, + "num_tokens": 32218668.0, + "step": 26850 + }, + { + "entropy": 1.9099099516868592, + "epoch": 0.08326364988348203, + "grad_norm": 9.33209228515625, + "learning_rate": 8.76744483351596e-06, + "loss": 0.5514, + "mean_token_accuracy": 0.8329472854733467, + "num_tokens": 32230093.0, + "step": 26860 + }, + { + "entropy": 1.8509590789675712, + "epoch": 0.08329464900853173, + "grad_norm": 10.720943450927734, + "learning_rate": 8.76581316491204e-06, + "loss": 0.5368, + "mean_token_accuracy": 0.8222440704703331, + "num_tokens": 32242430.0, + "step": 26870 + }, + { + "entropy": 1.7675331860780716, + "epoch": 0.08332564813358143, + "grad_norm": 5.470025539398193, + "learning_rate": 8.764182406956064e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8318305298686027, + "num_tokens": 32255794.0, + "step": 26880 + }, + { + "entropy": 1.8293626666069032, + "epoch": 0.08335664725863112, + "grad_norm": 6.280379772186279, + "learning_rate": 8.762552558801276e-06, + "loss": 0.5219, + "mean_token_accuracy": 0.8191826656460762, + "num_tokens": 32269026.0, + "step": 26890 + }, + { + "entropy": 1.8529397904872895, + "epoch": 0.08338764638368082, + "grad_norm": 9.650927543640137, + "learning_rate": 8.760923619602028e-06, + "loss": 0.5539, + "mean_token_accuracy": 0.8250053748488426, + "num_tokens": 32282150.0, + "step": 26900 + }, + { + "entropy": 1.8897050678730012, + "epoch": 0.08341864550873052, + "grad_norm": 4.701704978942871, + "learning_rate": 8.75929558851377e-06, + "loss": 0.564, + "mean_token_accuracy": 0.8231826663017273, + "num_tokens": 32293837.0, + "step": 26910 + }, + { + "entropy": 1.8501434102654457, + "epoch": 0.08344964463378021, + "grad_norm": 10.15555191040039, + "learning_rate": 8.757668464693049e-06, + "loss": 0.5392, + "mean_token_accuracy": 0.8285914018750191, + "num_tokens": 32306371.0, + "step": 26920 + }, + { + "entropy": 1.9136781513690948, + "epoch": 0.08348064375882991, + "grad_norm": 9.14686107635498, + "learning_rate": 8.756042247297512e-06, + "loss": 0.5924, + "mean_token_accuracy": 0.8250029042363167, + "num_tokens": 32318441.0, + "step": 26930 + }, + { + "entropy": 1.9553735256195068, + "epoch": 0.0835116428838796, + "grad_norm": 4.363174915313721, + "learning_rate": 8.754416935485893e-06, + "loss": 0.6237, + "mean_token_accuracy": 0.8168588444590569, + "num_tokens": 32329443.0, + "step": 26940 + }, + { + "entropy": 1.839969304203987, + "epoch": 0.0835426420089293, + "grad_norm": 9.89714241027832, + "learning_rate": 8.75279252841803e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8351193264126777, + "num_tokens": 32341869.0, + "step": 26950 + }, + { + "entropy": 1.9383182168006896, + "epoch": 0.083573641133979, + "grad_norm": 10.119979858398438, + "learning_rate": 8.751169025254838e-06, + "loss": 0.593, + "mean_token_accuracy": 0.8233784183859825, + "num_tokens": 32352867.0, + "step": 26960 + }, + { + "entropy": 1.8974655985832214, + "epoch": 0.0836046402590287, + "grad_norm": 9.094542503356934, + "learning_rate": 8.749546425158334e-06, + "loss": 0.5511, + "mean_token_accuracy": 0.8213450491428376, + "num_tokens": 32365021.0, + "step": 26970 + }, + { + "entropy": 1.8861443296074867, + "epoch": 0.08363563938407838, + "grad_norm": 9.384647369384766, + "learning_rate": 8.747924727291615e-06, + "loss": 0.5612, + "mean_token_accuracy": 0.8222412168979645, + "num_tokens": 32376467.0, + "step": 26980 + }, + { + "entropy": 1.9021775022149086, + "epoch": 0.08366663850912807, + "grad_norm": 8.179243087768555, + "learning_rate": 8.746303930818864e-06, + "loss": 0.5683, + "mean_token_accuracy": 0.8159692242741585, + "num_tokens": 32388816.0, + "step": 26990 + }, + { + "entropy": 1.9401640132069589, + "epoch": 0.08369763763417777, + "grad_norm": 4.109061241149902, + "learning_rate": 8.744684034905353e-06, + "loss": 0.5723, + "mean_token_accuracy": 0.8248335152864457, + "num_tokens": 32400534.0, + "step": 27000 + }, + { + "entropy": 1.8398920968174934, + "epoch": 0.08372863675922747, + "grad_norm": 10.484107971191406, + "learning_rate": 8.743065038717426e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8244616940617562, + "num_tokens": 32413551.0, + "step": 27010 + }, + { + "entropy": 1.9067912593483924, + "epoch": 0.08375963588427716, + "grad_norm": 4.866901397705078, + "learning_rate": 8.741446941422514e-06, + "loss": 0.5775, + "mean_token_accuracy": 0.8243859261274338, + "num_tokens": 32425381.0, + "step": 27020 + }, + { + "entropy": 1.839262606203556, + "epoch": 0.08379063500932686, + "grad_norm": 11.031305313110352, + "learning_rate": 8.739829742189128e-06, + "loss": 0.5304, + "mean_token_accuracy": 0.8227956235408783, + "num_tokens": 32438069.0, + "step": 27030 + }, + { + "entropy": 1.8895979836583137, + "epoch": 0.08382163413437656, + "grad_norm": 8.922094345092773, + "learning_rate": 8.738213440186849e-06, + "loss": 0.5385, + "mean_token_accuracy": 0.8271268501877784, + "num_tokens": 32449380.0, + "step": 27040 + }, + { + "entropy": 1.945242629945278, + "epoch": 0.08385263325942625, + "grad_norm": 4.774547576904297, + "learning_rate": 8.736598034586335e-06, + "loss": 0.609, + "mean_token_accuracy": 0.8129522934556007, + "num_tokens": 32460665.0, + "step": 27050 + }, + { + "entropy": 1.9358567342162132, + "epoch": 0.08388363238447595, + "grad_norm": 5.142744064331055, + "learning_rate": 8.734983524559322e-06, + "loss": 0.6, + "mean_token_accuracy": 0.8112214103341102, + "num_tokens": 32472553.0, + "step": 27060 + }, + { + "entropy": 1.909812480211258, + "epoch": 0.08391463150952565, + "grad_norm": 8.57243824005127, + "learning_rate": 8.733369909278609e-06, + "loss": 0.5495, + "mean_token_accuracy": 0.8224716767668724, + "num_tokens": 32485563.0, + "step": 27070 + }, + { + "entropy": 2.0142262816429137, + "epoch": 0.08394563063457534, + "grad_norm": 10.417340278625488, + "learning_rate": 8.731757187918067e-06, + "loss": 0.6399, + "mean_token_accuracy": 0.8098721027374267, + "num_tokens": 32496322.0, + "step": 27080 + }, + { + "entropy": 1.8523387983441353, + "epoch": 0.08397662975962504, + "grad_norm": 4.585434436798096, + "learning_rate": 8.730145359652638e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8351857841014863, + "num_tokens": 32509323.0, + "step": 27090 + }, + { + "entropy": 1.8732295900583267, + "epoch": 0.08400762888467472, + "grad_norm": 4.835305690765381, + "learning_rate": 8.728534423658325e-06, + "loss": 0.5411, + "mean_token_accuracy": 0.8177412390708924, + "num_tokens": 32521503.0, + "step": 27100 + }, + { + "entropy": 2.0055402636528017, + "epoch": 0.08403862800972442, + "grad_norm": 13.625639915466309, + "learning_rate": 8.726924379112201e-06, + "loss": 0.6815, + "mean_token_accuracy": 0.8050232946872711, + "num_tokens": 32532199.0, + "step": 27110 + }, + { + "entropy": 1.8963544055819512, + "epoch": 0.08406962713477412, + "grad_norm": 9.827751159667969, + "learning_rate": 8.725315225192391e-06, + "loss": 0.5774, + "mean_token_accuracy": 0.8158974751830101, + "num_tokens": 32544370.0, + "step": 27120 + }, + { + "entropy": 1.957877866923809, + "epoch": 0.08410062625982381, + "grad_norm": 4.588069915771484, + "learning_rate": 8.723706961078094e-06, + "loss": 0.5492, + "mean_token_accuracy": 0.8149573966860771, + "num_tokens": 32555644.0, + "step": 27130 + }, + { + "entropy": 1.9230869844555856, + "epoch": 0.08413162538487351, + "grad_norm": 8.736885070800781, + "learning_rate": 8.722099585949552e-06, + "loss": 0.5671, + "mean_token_accuracy": 0.8175855800509453, + "num_tokens": 32568234.0, + "step": 27140 + }, + { + "entropy": 1.9423416420817374, + "epoch": 0.0841626245099232, + "grad_norm": 10.53701400756836, + "learning_rate": 8.720493098988078e-06, + "loss": 0.6455, + "mean_token_accuracy": 0.8082656994462013, + "num_tokens": 32580063.0, + "step": 27150 + }, + { + "entropy": 2.0061039671301844, + "epoch": 0.0841936236349729, + "grad_norm": 9.278496742248535, + "learning_rate": 8.718887499376033e-06, + "loss": 0.6392, + "mean_token_accuracy": 0.8064600452780724, + "num_tokens": 32591372.0, + "step": 27160 + }, + { + "entropy": 1.9022212833166123, + "epoch": 0.0842246227600226, + "grad_norm": 4.704682350158691, + "learning_rate": 8.717282786296834e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.8284071788191796, + "num_tokens": 32604362.0, + "step": 27170 + }, + { + "entropy": 1.858892096579075, + "epoch": 0.0842556218850723, + "grad_norm": 10.126176834106445, + "learning_rate": 8.715678958934944e-06, + "loss": 0.5473, + "mean_token_accuracy": 0.8314953356981277, + "num_tokens": 32616980.0, + "step": 27180 + }, + { + "entropy": 1.8593590274453162, + "epoch": 0.08428662101012199, + "grad_norm": 10.302115440368652, + "learning_rate": 8.714076016475885e-06, + "loss": 0.5496, + "mean_token_accuracy": 0.8241918250918389, + "num_tokens": 32629349.0, + "step": 27190 + }, + { + "entropy": 1.8986007198691368, + "epoch": 0.08431762013517169, + "grad_norm": 3.36916446685791, + "learning_rate": 8.712473958106222e-06, + "loss": 0.5957, + "mean_token_accuracy": 0.8168155118823052, + "num_tokens": 32641755.0, + "step": 27200 + }, + { + "entropy": 1.8978888988494873, + "epoch": 0.08434861926022139, + "grad_norm": 9.608877182006836, + "learning_rate": 8.710872783013563e-06, + "loss": 0.5693, + "mean_token_accuracy": 0.8153728187084198, + "num_tokens": 32653282.0, + "step": 27210 + }, + { + "entropy": 1.9542479366064072, + "epoch": 0.08437961838527108, + "grad_norm": 4.4257307052612305, + "learning_rate": 8.709272490386569e-06, + "loss": 0.6091, + "mean_token_accuracy": 0.806995865702629, + "num_tokens": 32664764.0, + "step": 27220 + }, + { + "entropy": 1.9367131859064102, + "epoch": 0.08441061751032077, + "grad_norm": 8.415755271911621, + "learning_rate": 8.707673079414937e-06, + "loss": 0.5859, + "mean_token_accuracy": 0.8194612860679626, + "num_tokens": 32675941.0, + "step": 27230 + }, + { + "entropy": 1.9739065006375314, + "epoch": 0.08444161663537046, + "grad_norm": 9.644390106201172, + "learning_rate": 8.706074549289411e-06, + "loss": 0.6278, + "mean_token_accuracy": 0.8121867910027504, + "num_tokens": 32687272.0, + "step": 27240 + }, + { + "entropy": 1.9072382494807243, + "epoch": 0.08447261576042016, + "grad_norm": 8.06834888458252, + "learning_rate": 8.704476899201766e-06, + "loss": 0.5399, + "mean_token_accuracy": 0.8260136678814888, + "num_tokens": 32698817.0, + "step": 27250 + }, + { + "entropy": 1.9281651645898819, + "epoch": 0.08450361488546985, + "grad_norm": 10.1871976852417, + "learning_rate": 8.702880128344827e-06, + "loss": 0.5762, + "mean_token_accuracy": 0.82305498868227, + "num_tokens": 32710416.0, + "step": 27260 + }, + { + "entropy": 1.9203084349632262, + "epoch": 0.08453461401051955, + "grad_norm": 11.28444766998291, + "learning_rate": 8.701284235912444e-06, + "loss": 0.5659, + "mean_token_accuracy": 0.8235008075833321, + "num_tokens": 32722557.0, + "step": 27270 + }, + { + "entropy": 1.8537453413009644, + "epoch": 0.08456561313556925, + "grad_norm": 10.914976119995117, + "learning_rate": 8.699689221099508e-06, + "loss": 0.6454, + "mean_token_accuracy": 0.8067585110664368, + "num_tokens": 32735577.0, + "step": 27280 + }, + { + "entropy": 1.9035761684179306, + "epoch": 0.08459661226061894, + "grad_norm": 11.605010986328125, + "learning_rate": 8.698095083101939e-06, + "loss": 0.5898, + "mean_token_accuracy": 0.8235433489084244, + "num_tokens": 32747673.0, + "step": 27290 + }, + { + "entropy": 1.8419725641608238, + "epoch": 0.08462761138566864, + "grad_norm": 7.603212833404541, + "learning_rate": 8.69650182111669e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8381339445710182, + "num_tokens": 32760397.0, + "step": 27300 + }, + { + "entropy": 1.9057568103075027, + "epoch": 0.08465861051071834, + "grad_norm": 7.779855251312256, + "learning_rate": 8.694909434341743e-06, + "loss": 0.5506, + "mean_token_accuracy": 0.8320439457893372, + "num_tokens": 32772961.0, + "step": 27310 + }, + { + "entropy": 1.9533362567424775, + "epoch": 0.08468960963576803, + "grad_norm": 9.11958122253418, + "learning_rate": 8.693317921976107e-06, + "loss": 0.6301, + "mean_token_accuracy": 0.8056516513228417, + "num_tokens": 32785213.0, + "step": 27320 + }, + { + "entropy": 1.9149364829063416, + "epoch": 0.08472060876081773, + "grad_norm": 8.80299186706543, + "learning_rate": 8.69172728321982e-06, + "loss": 0.5535, + "mean_token_accuracy": 0.8212979912757874, + "num_tokens": 32798167.0, + "step": 27330 + }, + { + "entropy": 1.8609929516911508, + "epoch": 0.08475160788586743, + "grad_norm": 4.60377311706543, + "learning_rate": 8.690137517273937e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.8336883813142777, + "num_tokens": 32810453.0, + "step": 27340 + }, + { + "entropy": 1.9059410750865937, + "epoch": 0.08478260701091711, + "grad_norm": 10.732145309448242, + "learning_rate": 8.688548623340543e-06, + "loss": 0.6174, + "mean_token_accuracy": 0.8180224969983101, + "num_tokens": 32823220.0, + "step": 27350 + }, + { + "entropy": 1.9536007195711136, + "epoch": 0.08481360613596681, + "grad_norm": 8.611103057861328, + "learning_rate": 8.68696060062274e-06, + "loss": 0.6304, + "mean_token_accuracy": 0.8148555681109428, + "num_tokens": 32834092.0, + "step": 27360 + }, + { + "entropy": 1.8823547706007957, + "epoch": 0.0848446052610165, + "grad_norm": 8.003419876098633, + "learning_rate": 8.685373448324655e-06, + "loss": 0.5494, + "mean_token_accuracy": 0.8131691709160804, + "num_tokens": 32846184.0, + "step": 27370 + }, + { + "entropy": 1.8589939393103123, + "epoch": 0.0848756043860662, + "grad_norm": 9.513988494873047, + "learning_rate": 8.683787165651419e-06, + "loss": 0.5296, + "mean_token_accuracy": 0.8292522057890892, + "num_tokens": 32859312.0, + "step": 27380 + }, + { + "entropy": 1.9123133316636085, + "epoch": 0.0849066035111159, + "grad_norm": 9.80207633972168, + "learning_rate": 8.682201751809196e-06, + "loss": 0.6113, + "mean_token_accuracy": 0.8205790624022484, + "num_tokens": 32870081.0, + "step": 27390 + }, + { + "entropy": 1.8975965559482575, + "epoch": 0.0849376026361656, + "grad_norm": 9.128314971923828, + "learning_rate": 8.680617206005148e-06, + "loss": 0.5771, + "mean_token_accuracy": 0.8289038851857186, + "num_tokens": 32881096.0, + "step": 27400 + }, + { + "entropy": 1.9172597080469131, + "epoch": 0.08496860176121529, + "grad_norm": 10.668315887451172, + "learning_rate": 8.679033527447462e-06, + "loss": 0.5846, + "mean_token_accuracy": 0.8238224640488625, + "num_tokens": 32893038.0, + "step": 27410 + }, + { + "entropy": 1.8307318970561028, + "epoch": 0.08499960088626499, + "grad_norm": 4.935861587524414, + "learning_rate": 8.67745071534533e-06, + "loss": 0.531, + "mean_token_accuracy": 0.8222571074962616, + "num_tokens": 32906388.0, + "step": 27420 + }, + { + "entropy": 1.9008280768990518, + "epoch": 0.08503060001131468, + "grad_norm": 4.503681182861328, + "learning_rate": 8.675868768908956e-06, + "loss": 0.5765, + "mean_token_accuracy": 0.8253033936023713, + "num_tokens": 32918054.0, + "step": 27430 + }, + { + "entropy": 1.8841553077101707, + "epoch": 0.08506159913636438, + "grad_norm": 8.340595245361328, + "learning_rate": 8.674287687349546e-06, + "loss": 0.5451, + "mean_token_accuracy": 0.8274075031280518, + "num_tokens": 32929483.0, + "step": 27440 + }, + { + "entropy": 1.8089099466800689, + "epoch": 0.08509259826141408, + "grad_norm": 4.920588493347168, + "learning_rate": 8.672707469879315e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8309017539024353, + "num_tokens": 32942288.0, + "step": 27450 + }, + { + "entropy": 1.8523821212351321, + "epoch": 0.08512359738646377, + "grad_norm": 10.191000938415527, + "learning_rate": 8.67112811571149e-06, + "loss": 0.5767, + "mean_token_accuracy": 0.8139420121908187, + "num_tokens": 32954595.0, + "step": 27460 + }, + { + "entropy": 1.878308503329754, + "epoch": 0.08515459651151346, + "grad_norm": 5.282871246337891, + "learning_rate": 8.669549624060287e-06, + "loss": 0.5801, + "mean_token_accuracy": 0.8135267734527588, + "num_tokens": 32966387.0, + "step": 27470 + }, + { + "entropy": 1.8660302713513375, + "epoch": 0.08518559563656315, + "grad_norm": 7.17582893371582, + "learning_rate": 8.66797199414093e-06, + "loss": 0.6105, + "mean_token_accuracy": 0.8228353872895241, + "num_tokens": 32978739.0, + "step": 27480 + }, + { + "entropy": 1.855283497273922, + "epoch": 0.08521659476161285, + "grad_norm": 9.164436340332031, + "learning_rate": 8.666395225169643e-06, + "loss": 0.5588, + "mean_token_accuracy": 0.8205430552363395, + "num_tokens": 32991000.0, + "step": 27490 + }, + { + "entropy": 1.916937005519867, + "epoch": 0.08524759388666255, + "grad_norm": 5.297219276428223, + "learning_rate": 8.664819316363645e-06, + "loss": 0.6027, + "mean_token_accuracy": 0.8212128937244415, + "num_tokens": 33002576.0, + "step": 27500 + }, + { + "entropy": 1.9633919060230256, + "epoch": 0.08527859301171224, + "grad_norm": 10.939813613891602, + "learning_rate": 8.663244266941157e-06, + "loss": 0.6718, + "mean_token_accuracy": 0.8004844337701797, + "num_tokens": 33013283.0, + "step": 27510 + }, + { + "entropy": 1.8730249777436256, + "epoch": 0.08530959213676194, + "grad_norm": 5.108644962310791, + "learning_rate": 8.661670076121382e-06, + "loss": 0.6221, + "mean_token_accuracy": 0.810503962635994, + "num_tokens": 33025788.0, + "step": 27520 + }, + { + "entropy": 1.9476985320448876, + "epoch": 0.08534059126181164, + "grad_norm": 9.54196548461914, + "learning_rate": 8.66009674312453e-06, + "loss": 0.6355, + "mean_token_accuracy": 0.800178411602974, + "num_tokens": 33036893.0, + "step": 27530 + }, + { + "entropy": 1.9610710114240646, + "epoch": 0.08537159038686133, + "grad_norm": 8.600343704223633, + "learning_rate": 8.658524267171792e-06, + "loss": 0.6988, + "mean_token_accuracy": 0.8083438903093338, + "num_tokens": 33048666.0, + "step": 27540 + }, + { + "entropy": 1.8692133530974389, + "epoch": 0.08540258951191103, + "grad_norm": 10.251091957092285, + "learning_rate": 8.656952647485353e-06, + "loss": 0.561, + "mean_token_accuracy": 0.8302003368735313, + "num_tokens": 33061005.0, + "step": 27550 + }, + { + "entropy": 1.827210921049118, + "epoch": 0.08543358863696073, + "grad_norm": 9.993083000183105, + "learning_rate": 8.655381883288387e-06, + "loss": 0.6069, + "mean_token_accuracy": 0.8159284323453904, + "num_tokens": 33074475.0, + "step": 27560 + }, + { + "entropy": 1.8685495540499688, + "epoch": 0.08546458776201042, + "grad_norm": 10.750574111938477, + "learning_rate": 8.653811973805048e-06, + "loss": 0.5993, + "mean_token_accuracy": 0.8224136650562286, + "num_tokens": 33086089.0, + "step": 27570 + }, + { + "entropy": 1.7942854017019272, + "epoch": 0.08549558688706012, + "grad_norm": 4.533649921417236, + "learning_rate": 8.652242918260485e-06, + "loss": 0.5276, + "mean_token_accuracy": 0.8248818814754486, + "num_tokens": 33100203.0, + "step": 27580 + }, + { + "entropy": 1.8375580973923207, + "epoch": 0.08552658601210981, + "grad_norm": 4.471881866455078, + "learning_rate": 8.650674715880821e-06, + "loss": 0.5732, + "mean_token_accuracy": 0.8313728928565979, + "num_tokens": 33113514.0, + "step": 27590 + }, + { + "entropy": 1.9055291563272476, + "epoch": 0.0855575851371595, + "grad_norm": 8.167454719543457, + "learning_rate": 8.649107365893162e-06, + "loss": 0.5533, + "mean_token_accuracy": 0.8212564826011658, + "num_tokens": 33125377.0, + "step": 27600 + }, + { + "entropy": 1.882322046160698, + "epoch": 0.0855885842622092, + "grad_norm": 9.761778831481934, + "learning_rate": 8.647540867525599e-06, + "loss": 0.5904, + "mean_token_accuracy": 0.8162475794553756, + "num_tokens": 33137354.0, + "step": 27610 + }, + { + "entropy": 1.8892990604043007, + "epoch": 0.08561958338725889, + "grad_norm": 9.397979736328125, + "learning_rate": 8.645975220007197e-06, + "loss": 0.6297, + "mean_token_accuracy": 0.8177220180630684, + "num_tokens": 33149211.0, + "step": 27620 + }, + { + "entropy": 1.9387479245662689, + "epoch": 0.08565058251230859, + "grad_norm": 10.300222396850586, + "learning_rate": 8.644410422567995e-06, + "loss": 0.6018, + "mean_token_accuracy": 0.8117999002337456, + "num_tokens": 33160821.0, + "step": 27630 + }, + { + "entropy": 1.942291297018528, + "epoch": 0.08568158163735828, + "grad_norm": 11.509066581726074, + "learning_rate": 8.642846474439016e-06, + "loss": 0.5863, + "mean_token_accuracy": 0.8200451254844665, + "num_tokens": 33172491.0, + "step": 27640 + }, + { + "entropy": 1.954495507478714, + "epoch": 0.08571258076240798, + "grad_norm": 8.897340774536133, + "learning_rate": 8.641283374852245e-06, + "loss": 0.5856, + "mean_token_accuracy": 0.8219591468572617, + "num_tokens": 33184078.0, + "step": 27650 + }, + { + "entropy": 1.8060628071427345, + "epoch": 0.08574357988745768, + "grad_norm": 8.12181282043457, + "learning_rate": 8.639721123040653e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8268592938780784, + "num_tokens": 33198036.0, + "step": 27660 + }, + { + "entropy": 1.7999069422483445, + "epoch": 0.08577457901250737, + "grad_norm": 4.471950054168701, + "learning_rate": 8.638159718238167e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8397438317537308, + "num_tokens": 33212045.0, + "step": 27670 + }, + { + "entropy": 1.9364047437906264, + "epoch": 0.08580557813755707, + "grad_norm": 13.346952438354492, + "learning_rate": 8.636599159679694e-06, + "loss": 0.6085, + "mean_token_accuracy": 0.8085330918431282, + "num_tokens": 33223752.0, + "step": 27680 + }, + { + "entropy": 1.8672024756669998, + "epoch": 0.08583657726260677, + "grad_norm": 5.810976028442383, + "learning_rate": 8.635039446601096e-06, + "loss": 0.5647, + "mean_token_accuracy": 0.8230971753597259, + "num_tokens": 33236114.0, + "step": 27690 + }, + { + "entropy": 1.8852705493569375, + "epoch": 0.08586757638765646, + "grad_norm": 5.571743011474609, + "learning_rate": 8.633480578239217e-06, + "loss": 0.5606, + "mean_token_accuracy": 0.8273131296038627, + "num_tokens": 33248369.0, + "step": 27700 + }, + { + "entropy": 1.9336923122406007, + "epoch": 0.08589857551270616, + "grad_norm": 11.164706230163574, + "learning_rate": 8.63192255383185e-06, + "loss": 0.5876, + "mean_token_accuracy": 0.8112567231059075, + "num_tokens": 33260442.0, + "step": 27710 + }, + { + "entropy": 1.9060131937265397, + "epoch": 0.08592957463775584, + "grad_norm": 10.689841270446777, + "learning_rate": 8.630365372617761e-06, + "loss": 0.5697, + "mean_token_accuracy": 0.8298930734395981, + "num_tokens": 33272270.0, + "step": 27720 + }, + { + "entropy": 1.9362266033887863, + "epoch": 0.08596057376280554, + "grad_norm": 9.383460998535156, + "learning_rate": 8.62880903383667e-06, + "loss": 0.6129, + "mean_token_accuracy": 0.8200540199875832, + "num_tokens": 33283981.0, + "step": 27730 + }, + { + "entropy": 1.9314474761486053, + "epoch": 0.08599157288785524, + "grad_norm": 10.130805969238281, + "learning_rate": 8.627253536729257e-06, + "loss": 0.5401, + "mean_token_accuracy": 0.8344287112355232, + "num_tokens": 33296450.0, + "step": 27740 + }, + { + "entropy": 1.944339656829834, + "epoch": 0.08602257201290493, + "grad_norm": 8.83008861541748, + "learning_rate": 8.625698880537165e-06, + "loss": 0.6563, + "mean_token_accuracy": 0.8038821011781693, + "num_tokens": 33307571.0, + "step": 27750 + }, + { + "entropy": 1.9081848710775375, + "epoch": 0.08605357113795463, + "grad_norm": 10.480369567871094, + "learning_rate": 8.62414506450299e-06, + "loss": 0.5913, + "mean_token_accuracy": 0.817870119214058, + "num_tokens": 33319700.0, + "step": 27760 + }, + { + "entropy": 1.84724163711071, + "epoch": 0.08608457026300433, + "grad_norm": 3.8803551197052, + "learning_rate": 8.622592087870282e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.8268263578414917, + "num_tokens": 33331971.0, + "step": 27770 + }, + { + "entropy": 1.934218955039978, + "epoch": 0.08611556938805402, + "grad_norm": 9.977314949035645, + "learning_rate": 8.621039949883543e-06, + "loss": 0.6326, + "mean_token_accuracy": 0.8152559250593185, + "num_tokens": 33342967.0, + "step": 27780 + }, + { + "entropy": 1.8323631912469864, + "epoch": 0.08614656851310372, + "grad_norm": 12.091215133666992, + "learning_rate": 8.619488649788232e-06, + "loss": 0.5562, + "mean_token_accuracy": 0.8163023605942726, + "num_tokens": 33356235.0, + "step": 27790 + }, + { + "entropy": 1.8991770133376122, + "epoch": 0.08617756763815342, + "grad_norm": 7.5659074783325195, + "learning_rate": 8.617938186830752e-06, + "loss": 0.5869, + "mean_token_accuracy": 0.828110148012638, + "num_tokens": 33368988.0, + "step": 27800 + }, + { + "entropy": 1.8398756310343742, + "epoch": 0.08620856676320311, + "grad_norm": 8.638795852661133, + "learning_rate": 8.616388560258459e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.8359197080135345, + "num_tokens": 33381666.0, + "step": 27810 + }, + { + "entropy": 1.9264942169189454, + "epoch": 0.08623956588825281, + "grad_norm": 9.415760040283203, + "learning_rate": 8.61483976931965e-06, + "loss": 0.5762, + "mean_token_accuracy": 0.8242600187659264, + "num_tokens": 33393125.0, + "step": 27820 + }, + { + "entropy": 1.841173852980137, + "epoch": 0.0862705650133025, + "grad_norm": 6.242795944213867, + "learning_rate": 8.613291813263578e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8402754053473472, + "num_tokens": 33405715.0, + "step": 27830 + }, + { + "entropy": 1.9486567616462707, + "epoch": 0.08630156413835219, + "grad_norm": 8.530654907226562, + "learning_rate": 8.61174469134043e-06, + "loss": 0.621, + "mean_token_accuracy": 0.8173292249441146, + "num_tokens": 33417314.0, + "step": 27840 + }, + { + "entropy": 1.8448371395468712, + "epoch": 0.08633256326340188, + "grad_norm": 5.01255464553833, + "learning_rate": 8.610198402801338e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8294866099953652, + "num_tokens": 33431319.0, + "step": 27850 + }, + { + "entropy": 1.8705769084393977, + "epoch": 0.08636356238845158, + "grad_norm": 12.166780471801758, + "learning_rate": 8.60865294689838e-06, + "loss": 0.6069, + "mean_token_accuracy": 0.8117830008268356, + "num_tokens": 33444973.0, + "step": 27860 + }, + { + "entropy": 1.9853614136576652, + "epoch": 0.08639456151350128, + "grad_norm": 9.717790603637695, + "learning_rate": 8.607108322884566e-06, + "loss": 0.61, + "mean_token_accuracy": 0.8108811169862747, + "num_tokens": 33456479.0, + "step": 27870 + }, + { + "entropy": 1.918572799861431, + "epoch": 0.08642556063855097, + "grad_norm": 4.174164295196533, + "learning_rate": 8.605564530013847e-06, + "loss": 0.5333, + "mean_token_accuracy": 0.829804676771164, + "num_tokens": 33468303.0, + "step": 27880 + }, + { + "entropy": 1.9406263917684554, + "epoch": 0.08645655976360067, + "grad_norm": 8.830784797668457, + "learning_rate": 8.604021567541113e-06, + "loss": 0.6364, + "mean_token_accuracy": 0.8110644370317459, + "num_tokens": 33480915.0, + "step": 27890 + }, + { + "entropy": 1.9681098356842994, + "epoch": 0.08648755888865037, + "grad_norm": 9.740206718444824, + "learning_rate": 8.602479434722184e-06, + "loss": 0.623, + "mean_token_accuracy": 0.8138311117887497, + "num_tokens": 33491964.0, + "step": 27900 + }, + { + "entropy": 1.9589456513524055, + "epoch": 0.08651855801370006, + "grad_norm": 9.058309555053711, + "learning_rate": 8.600938130813817e-06, + "loss": 0.6245, + "mean_token_accuracy": 0.814877088367939, + "num_tokens": 33503440.0, + "step": 27910 + }, + { + "entropy": 1.787692840397358, + "epoch": 0.08654955713874976, + "grad_norm": 9.325605392456055, + "learning_rate": 8.5993976550737e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8466595977544784, + "num_tokens": 33517709.0, + "step": 27920 + }, + { + "entropy": 1.9136761993169784, + "epoch": 0.08658055626379946, + "grad_norm": 11.386701583862305, + "learning_rate": 8.59785800676045e-06, + "loss": 0.5628, + "mean_token_accuracy": 0.8248209357261658, + "num_tokens": 33529539.0, + "step": 27930 + }, + { + "entropy": 1.8507066249847413, + "epoch": 0.08661155538884915, + "grad_norm": 9.556991577148438, + "learning_rate": 8.596319185133614e-06, + "loss": 0.521, + "mean_token_accuracy": 0.8304374098777771, + "num_tokens": 33542002.0, + "step": 27940 + }, + { + "entropy": 1.9005508705973626, + "epoch": 0.08664255451389885, + "grad_norm": 4.965198516845703, + "learning_rate": 8.594781189453666e-06, + "loss": 0.5964, + "mean_token_accuracy": 0.8208310827612877, + "num_tokens": 33553517.0, + "step": 27950 + }, + { + "entropy": 1.9146973922848702, + "epoch": 0.08667355363894855, + "grad_norm": 4.9398932456970215, + "learning_rate": 8.593244018982006e-06, + "loss": 0.5812, + "mean_token_accuracy": 0.8191220477223397, + "num_tokens": 33565833.0, + "step": 27960 + }, + { + "entropy": 1.8798900321125984, + "epoch": 0.08670455276399823, + "grad_norm": 8.898138046264648, + "learning_rate": 8.59170767298096e-06, + "loss": 0.6295, + "mean_token_accuracy": 0.8155390247702599, + "num_tokens": 33578565.0, + "step": 27970 + }, + { + "entropy": 1.8256605304777622, + "epoch": 0.08673555188904793, + "grad_norm": 9.652932167053223, + "learning_rate": 8.590172150713773e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8326058983802795, + "num_tokens": 33591102.0, + "step": 27980 + }, + { + "entropy": 1.8813355445861817, + "epoch": 0.08676655101409762, + "grad_norm": 9.704791069030762, + "learning_rate": 8.588637451444612e-06, + "loss": 0.5626, + "mean_token_accuracy": 0.8161522090435028, + "num_tokens": 33603187.0, + "step": 27990 + }, + { + "entropy": 1.926676908135414, + "epoch": 0.08679755013914732, + "grad_norm": 10.068709373474121, + "learning_rate": 8.587103574438569e-06, + "loss": 0.644, + "mean_token_accuracy": 0.8118028625845909, + "num_tokens": 33614612.0, + "step": 28000 + }, + { + "entropy": 1.880669642984867, + "epoch": 0.08682854926419702, + "grad_norm": 8.847538948059082, + "learning_rate": 8.585570518961651e-06, + "loss": 0.5402, + "mean_token_accuracy": 0.827701772749424, + "num_tokens": 33626936.0, + "step": 28010 + }, + { + "entropy": 1.9508138015866279, + "epoch": 0.08685954838924671, + "grad_norm": 11.043745994567871, + "learning_rate": 8.58403828428078e-06, + "loss": 0.5966, + "mean_token_accuracy": 0.8156369537115097, + "num_tokens": 33638555.0, + "step": 28020 + }, + { + "entropy": 1.9032750859856606, + "epoch": 0.08689054751429641, + "grad_norm": 10.193146705627441, + "learning_rate": 8.582506869663793e-06, + "loss": 0.5462, + "mean_token_accuracy": 0.8273995086550713, + "num_tokens": 33650029.0, + "step": 28030 + }, + { + "entropy": 1.8772203177213669, + "epoch": 0.0869215466393461, + "grad_norm": 4.957283020019531, + "learning_rate": 8.580976274379448e-06, + "loss": 0.5746, + "mean_token_accuracy": 0.8220570385456085, + "num_tokens": 33662687.0, + "step": 28040 + }, + { + "entropy": 2.005272647738457, + "epoch": 0.0869525457643958, + "grad_norm": 8.876986503601074, + "learning_rate": 8.579446497697407e-06, + "loss": 0.6347, + "mean_token_accuracy": 0.809257960319519, + "num_tokens": 33673133.0, + "step": 28050 + }, + { + "entropy": 1.897057008743286, + "epoch": 0.0869835448894455, + "grad_norm": 4.687375068664551, + "learning_rate": 8.57791753888825e-06, + "loss": 0.5549, + "mean_token_accuracy": 0.8260800316929817, + "num_tokens": 33685382.0, + "step": 28060 + }, + { + "entropy": 1.890799196064472, + "epoch": 0.0870145440144952, + "grad_norm": 11.013267517089844, + "learning_rate": 8.576389397223463e-06, + "loss": 0.5619, + "mean_token_accuracy": 0.8162667453289032, + "num_tokens": 33698473.0, + "step": 28070 + }, + { + "entropy": 1.8947896853089332, + "epoch": 0.08704554313954489, + "grad_norm": 9.438980102539062, + "learning_rate": 8.574862071975438e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8390587836503982, + "num_tokens": 33710540.0, + "step": 28080 + }, + { + "entropy": 1.833718155324459, + "epoch": 0.08707654226459458, + "grad_norm": 9.037769317626953, + "learning_rate": 8.57333556241748e-06, + "loss": 0.5269, + "mean_token_accuracy": 0.8308623015880585, + "num_tokens": 33723318.0, + "step": 28090 + }, + { + "entropy": 1.9327234029769897, + "epoch": 0.08710754138964427, + "grad_norm": 8.257326126098633, + "learning_rate": 8.571809867823794e-06, + "loss": 0.6263, + "mean_token_accuracy": 0.8134735986590386, + "num_tokens": 33734540.0, + "step": 28100 + }, + { + "entropy": 1.9399518385529517, + "epoch": 0.08713854051469397, + "grad_norm": 8.866934776306152, + "learning_rate": 8.57028498746949e-06, + "loss": 0.5604, + "mean_token_accuracy": 0.8291549518704414, + "num_tokens": 33745641.0, + "step": 28110 + }, + { + "entropy": 1.8807589188218117, + "epoch": 0.08716953963974367, + "grad_norm": 9.743132591247559, + "learning_rate": 8.568760920630582e-06, + "loss": 0.5934, + "mean_token_accuracy": 0.8171397164463997, + "num_tokens": 33757675.0, + "step": 28120 + }, + { + "entropy": 1.8112970426678658, + "epoch": 0.08720053876479336, + "grad_norm": 12.117599487304688, + "learning_rate": 8.567237666583983e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.8401974871754646, + "num_tokens": 33769971.0, + "step": 28130 + }, + { + "entropy": 1.8710318520665168, + "epoch": 0.08723153788984306, + "grad_norm": 10.758975982666016, + "learning_rate": 8.565715224607507e-06, + "loss": 0.5972, + "mean_token_accuracy": 0.8087448701262474, + "num_tokens": 33782332.0, + "step": 28140 + }, + { + "entropy": 1.9059083193540574, + "epoch": 0.08726253701489275, + "grad_norm": 9.423166275024414, + "learning_rate": 8.564193593979863e-06, + "loss": 0.566, + "mean_token_accuracy": 0.8211286991834641, + "num_tokens": 33794157.0, + "step": 28150 + }, + { + "entropy": 1.8057139664888382, + "epoch": 0.08729353613994245, + "grad_norm": 4.398457050323486, + "learning_rate": 8.562672773980662e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.8441807180643082, + "num_tokens": 33806432.0, + "step": 28160 + }, + { + "entropy": 1.8262229442596436, + "epoch": 0.08732453526499215, + "grad_norm": 8.604774475097656, + "learning_rate": 8.561152763890406e-06, + "loss": 0.519, + "mean_token_accuracy": 0.8298747554421425, + "num_tokens": 33818953.0, + "step": 28170 + }, + { + "entropy": 1.856579264998436, + "epoch": 0.08735553439004184, + "grad_norm": 9.146932601928711, + "learning_rate": 8.559633562990491e-06, + "loss": 0.5448, + "mean_token_accuracy": 0.8260318920016289, + "num_tokens": 33831302.0, + "step": 28180 + }, + { + "entropy": 1.9090891823172569, + "epoch": 0.08738653351509154, + "grad_norm": 10.917224884033203, + "learning_rate": 8.558115170563206e-06, + "loss": 0.6262, + "mean_token_accuracy": 0.7991869211196899, + "num_tokens": 33843087.0, + "step": 28190 + }, + { + "entropy": 1.8630158439278603, + "epoch": 0.08741753264014124, + "grad_norm": 8.141376495361328, + "learning_rate": 8.556597585891731e-06, + "loss": 0.5709, + "mean_token_accuracy": 0.8259263783693314, + "num_tokens": 33855443.0, + "step": 28200 + }, + { + "entropy": 1.9804678469896317, + "epoch": 0.08744853176519093, + "grad_norm": 9.60744857788086, + "learning_rate": 8.555080808260135e-06, + "loss": 0.6753, + "mean_token_accuracy": 0.8023278743028641, + "num_tokens": 33866719.0, + "step": 28210 + }, + { + "entropy": 1.933399637043476, + "epoch": 0.08747953089024062, + "grad_norm": 7.662554740905762, + "learning_rate": 8.55356483695338e-06, + "loss": 0.5787, + "mean_token_accuracy": 0.8204610332846641, + "num_tokens": 33877680.0, + "step": 28220 + }, + { + "entropy": 1.936586219072342, + "epoch": 0.08751053001529031, + "grad_norm": 9.148261070251465, + "learning_rate": 8.552049671257301e-06, + "loss": 0.6002, + "mean_token_accuracy": 0.810947285592556, + "num_tokens": 33890103.0, + "step": 28230 + }, + { + "entropy": 1.9918580889701842, + "epoch": 0.08754152914034001, + "grad_norm": 9.620087623596191, + "learning_rate": 8.550535310458639e-06, + "loss": 0.6424, + "mean_token_accuracy": 0.8145117297768593, + "num_tokens": 33901085.0, + "step": 28240 + }, + { + "entropy": 1.9204299062490464, + "epoch": 0.0875725282653897, + "grad_norm": 9.934686660766602, + "learning_rate": 8.549021753844996e-06, + "loss": 0.5643, + "mean_token_accuracy": 0.82352195084095, + "num_tokens": 33913338.0, + "step": 28250 + }, + { + "entropy": 1.8327893808484077, + "epoch": 0.0876035273904394, + "grad_norm": 5.132232666015625, + "learning_rate": 8.547509000704874e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8444669589400291, + "num_tokens": 33926686.0, + "step": 28260 + }, + { + "entropy": 1.8552611097693443, + "epoch": 0.0876345265154891, + "grad_norm": 10.373604774475098, + "learning_rate": 8.545997050327648e-06, + "loss": 0.5785, + "mean_token_accuracy": 0.8225721046328545, + "num_tokens": 33939038.0, + "step": 28270 + }, + { + "entropy": 1.9178115367889403, + "epoch": 0.0876655256405388, + "grad_norm": 11.540125846862793, + "learning_rate": 8.544485902003573e-06, + "loss": 0.6322, + "mean_token_accuracy": 0.8020992532372475, + "num_tokens": 33950992.0, + "step": 28280 + }, + { + "entropy": 1.852571301162243, + "epoch": 0.0876965247655885, + "grad_norm": 10.074957847595215, + "learning_rate": 8.542975555023786e-06, + "loss": 0.5433, + "mean_token_accuracy": 0.8297949224710465, + "num_tokens": 33963043.0, + "step": 28290 + }, + { + "entropy": 1.815762387216091, + "epoch": 0.08772752389063819, + "grad_norm": 5.080463886260986, + "learning_rate": 8.541466008680297e-06, + "loss": 0.5416, + "mean_token_accuracy": 0.8305084735155106, + "num_tokens": 33976042.0, + "step": 28300 + }, + { + "entropy": 1.7797249928116798, + "epoch": 0.08775852301568789, + "grad_norm": 11.101983070373535, + "learning_rate": 8.539957262265988e-06, + "loss": 0.5854, + "mean_token_accuracy": 0.8253014713525773, + "num_tokens": 33989173.0, + "step": 28310 + }, + { + "entropy": 1.9091511026024819, + "epoch": 0.08778952214073758, + "grad_norm": 6.8226141929626465, + "learning_rate": 8.538449315074628e-06, + "loss": 0.6012, + "mean_token_accuracy": 0.8207783192396164, + "num_tokens": 34000588.0, + "step": 28320 + }, + { + "entropy": 1.8104485064744948, + "epoch": 0.08782052126578728, + "grad_norm": 8.398052215576172, + "learning_rate": 8.536942166400845e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8447774976491929, + "num_tokens": 34013462.0, + "step": 28330 + }, + { + "entropy": 1.8076317757368088, + "epoch": 0.08785152039083696, + "grad_norm": 8.194868087768555, + "learning_rate": 8.535435815540142e-06, + "loss": 0.5373, + "mean_token_accuracy": 0.8241015315055847, + "num_tokens": 34025542.0, + "step": 28340 + }, + { + "entropy": 1.8683214783668518, + "epoch": 0.08788251951588666, + "grad_norm": 9.149380683898926, + "learning_rate": 8.533930261788897e-06, + "loss": 0.5971, + "mean_token_accuracy": 0.8132390677928925, + "num_tokens": 34037377.0, + "step": 28350 + }, + { + "entropy": 1.8220268458127975, + "epoch": 0.08791351864093636, + "grad_norm": 4.427048683166504, + "learning_rate": 8.532425504444351e-06, + "loss": 0.5305, + "mean_token_accuracy": 0.82462307959795, + "num_tokens": 34050269.0, + "step": 28360 + }, + { + "entropy": 1.804908536374569, + "epoch": 0.08794451776598605, + "grad_norm": 4.890988349914551, + "learning_rate": 8.530921542804612e-06, + "loss": 0.5341, + "mean_token_accuracy": 0.8353535622358322, + "num_tokens": 34062946.0, + "step": 28370 + }, + { + "entropy": 1.832678309082985, + "epoch": 0.08797551689103575, + "grad_norm": 9.909698486328125, + "learning_rate": 8.52941837616866e-06, + "loss": 0.5177, + "mean_token_accuracy": 0.8312997654080391, + "num_tokens": 34075013.0, + "step": 28380 + }, + { + "entropy": 1.7965776398777962, + "epoch": 0.08800651601608545, + "grad_norm": 2.5267937183380127, + "learning_rate": 8.527916003836331e-06, + "loss": 0.5494, + "mean_token_accuracy": 0.8231059461832047, + "num_tokens": 34087851.0, + "step": 28390 + }, + { + "entropy": 1.8382470428943634, + "epoch": 0.08803751514113514, + "grad_norm": 9.315332412719727, + "learning_rate": 8.52641442510833e-06, + "loss": 0.5739, + "mean_token_accuracy": 0.8272110924124718, + "num_tokens": 34099777.0, + "step": 28400 + }, + { + "entropy": 1.7765842095017432, + "epoch": 0.08806851426618484, + "grad_norm": 9.057583808898926, + "learning_rate": 8.524913639286219e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8396439641714096, + "num_tokens": 34111989.0, + "step": 28410 + }, + { + "entropy": 1.9059796720743178, + "epoch": 0.08809951339123454, + "grad_norm": 8.058761596679688, + "learning_rate": 8.523413645672424e-06, + "loss": 0.6247, + "mean_token_accuracy": 0.811768627166748, + "num_tokens": 34123482.0, + "step": 28420 + }, + { + "entropy": 1.8845535546541214, + "epoch": 0.08813051251628423, + "grad_norm": 7.467669486999512, + "learning_rate": 8.52191444357023e-06, + "loss": 0.6146, + "mean_token_accuracy": 0.8204743906855583, + "num_tokens": 34134946.0, + "step": 28430 + }, + { + "entropy": 1.789374603331089, + "epoch": 0.08816151164133393, + "grad_norm": 8.87814712524414, + "learning_rate": 8.520416032283778e-06, + "loss": 0.5555, + "mean_token_accuracy": 0.8291935101151466, + "num_tokens": 34147310.0, + "step": 28440 + }, + { + "entropy": 1.9232223629951477, + "epoch": 0.08819251076638362, + "grad_norm": 10.156554222106934, + "learning_rate": 8.518918411118063e-06, + "loss": 0.6747, + "mean_token_accuracy": 0.8002053126692772, + "num_tokens": 34157822.0, + "step": 28450 + }, + { + "entropy": 1.8766271471977234, + "epoch": 0.08822350989143331, + "grad_norm": 9.71645450592041, + "learning_rate": 8.51742157937894e-06, + "loss": 0.6254, + "mean_token_accuracy": 0.8169643610715867, + "num_tokens": 34168930.0, + "step": 28460 + }, + { + "entropy": 1.9275714561343193, + "epoch": 0.088254509016483, + "grad_norm": 10.583243370056152, + "learning_rate": 8.515925536373112e-06, + "loss": 0.6441, + "mean_token_accuracy": 0.8056027069687843, + "num_tokens": 34181366.0, + "step": 28470 + }, + { + "entropy": 1.8285674542188644, + "epoch": 0.0882855081415327, + "grad_norm": 12.124537467956543, + "learning_rate": 8.51443028140814e-06, + "loss": 0.5686, + "mean_token_accuracy": 0.81962631046772, + "num_tokens": 34193201.0, + "step": 28480 + }, + { + "entropy": 1.8924355298280715, + "epoch": 0.0883165072665824, + "grad_norm": 8.544037818908691, + "learning_rate": 8.512935813792427e-06, + "loss": 0.5901, + "mean_token_accuracy": 0.8254889070987701, + "num_tokens": 34203925.0, + "step": 28490 + }, + { + "entropy": 1.8146496564149857, + "epoch": 0.0883475063916321, + "grad_norm": 9.721524238586426, + "learning_rate": 8.511442132835237e-06, + "loss": 0.533, + "mean_token_accuracy": 0.8225750029087067, + "num_tokens": 34216079.0, + "step": 28500 + }, + { + "entropy": 1.8287780940532685, + "epoch": 0.08837850551668179, + "grad_norm": 10.420062065124512, + "learning_rate": 8.509949237846672e-06, + "loss": 0.6115, + "mean_token_accuracy": 0.8089086845517158, + "num_tokens": 34228708.0, + "step": 28510 + }, + { + "entropy": 1.8081783071160316, + "epoch": 0.08840950464173149, + "grad_norm": 5.157535076141357, + "learning_rate": 8.508457128137686e-06, + "loss": 0.5269, + "mean_token_accuracy": 0.8270225182175637, + "num_tokens": 34241771.0, + "step": 28520 + }, + { + "entropy": 1.8582287296652793, + "epoch": 0.08844050376678118, + "grad_norm": 4.54521369934082, + "learning_rate": 8.506965803020078e-06, + "loss": 0.5714, + "mean_token_accuracy": 0.8173837080597878, + "num_tokens": 34253510.0, + "step": 28530 + }, + { + "entropy": 1.769499270617962, + "epoch": 0.08847150289183088, + "grad_norm": 11.29238224029541, + "learning_rate": 8.50547526180649e-06, + "loss": 0.5211, + "mean_token_accuracy": 0.8185042932629585, + "num_tokens": 34266457.0, + "step": 28540 + }, + { + "entropy": 1.8402513667941094, + "epoch": 0.08850250201688058, + "grad_norm": 10.66517162322998, + "learning_rate": 8.503985503810404e-06, + "loss": 0.5411, + "mean_token_accuracy": 0.8161203816533089, + "num_tokens": 34278814.0, + "step": 28550 + }, + { + "entropy": 1.9259721651673316, + "epoch": 0.08853350114193027, + "grad_norm": 11.027581214904785, + "learning_rate": 8.502496528346151e-06, + "loss": 0.6318, + "mean_token_accuracy": 0.8031986460089684, + "num_tokens": 34290769.0, + "step": 28560 + }, + { + "entropy": 1.8921418815851212, + "epoch": 0.08856450026697997, + "grad_norm": 8.231386184692383, + "learning_rate": 8.501008334728893e-06, + "loss": 0.573, + "mean_token_accuracy": 0.8195232212543487, + "num_tokens": 34301680.0, + "step": 28570 + }, + { + "entropy": 1.9299334168434144, + "epoch": 0.08859549939202967, + "grad_norm": 12.117446899414062, + "learning_rate": 8.49952092227464e-06, + "loss": 0.6296, + "mean_token_accuracy": 0.8005195692181587, + "num_tokens": 34313378.0, + "step": 28580 + }, + { + "entropy": 1.8172454446554185, + "epoch": 0.08862649851707935, + "grad_norm": 8.979788780212402, + "learning_rate": 8.498034290300233e-06, + "loss": 0.5726, + "mean_token_accuracy": 0.8196427300572395, + "num_tokens": 34326259.0, + "step": 28590 + }, + { + "entropy": 1.8738806433975697, + "epoch": 0.08865749764212905, + "grad_norm": 9.108292579650879, + "learning_rate": 8.496548438123347e-06, + "loss": 0.5925, + "mean_token_accuracy": 0.8214808851480484, + "num_tokens": 34337849.0, + "step": 28600 + }, + { + "entropy": 1.8880690574645995, + "epoch": 0.08868849676717874, + "grad_norm": 10.18707275390625, + "learning_rate": 8.495063365062501e-06, + "loss": 0.5741, + "mean_token_accuracy": 0.8180713891983032, + "num_tokens": 34349609.0, + "step": 28610 + }, + { + "entropy": 1.7710354149341583, + "epoch": 0.08871949589222844, + "grad_norm": 2.5999815464019775, + "learning_rate": 8.493579070437038e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8418709754943847, + "num_tokens": 34363340.0, + "step": 28620 + }, + { + "entropy": 1.7987074330449104, + "epoch": 0.08875049501727814, + "grad_norm": 4.221140384674072, + "learning_rate": 8.492095553567142e-06, + "loss": 0.5227, + "mean_token_accuracy": 0.8264068886637688, + "num_tokens": 34376348.0, + "step": 28630 + }, + { + "entropy": 1.8191845625638963, + "epoch": 0.08878149414232783, + "grad_norm": 8.881943702697754, + "learning_rate": 8.49061281377382e-06, + "loss": 0.615, + "mean_token_accuracy": 0.8144074931740761, + "num_tokens": 34389472.0, + "step": 28640 + }, + { + "entropy": 1.910823555290699, + "epoch": 0.08881249326737753, + "grad_norm": 10.219226837158203, + "learning_rate": 8.489130850378912e-06, + "loss": 0.5835, + "mean_token_accuracy": 0.8214432463049889, + "num_tokens": 34400353.0, + "step": 28650 + }, + { + "entropy": 1.817768232524395, + "epoch": 0.08884349239242723, + "grad_norm": 9.067606925964355, + "learning_rate": 8.487649662705087e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8307969868183136, + "num_tokens": 34412601.0, + "step": 28660 + }, + { + "entropy": 1.7433785900473595, + "epoch": 0.08887449151747692, + "grad_norm": 9.326837539672852, + "learning_rate": 8.48616925007584e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8333866909146309, + "num_tokens": 34425813.0, + "step": 28670 + }, + { + "entropy": 1.8864496439695357, + "epoch": 0.08890549064252662, + "grad_norm": 8.599132537841797, + "learning_rate": 8.484689611815491e-06, + "loss": 0.6327, + "mean_token_accuracy": 0.8141407266259193, + "num_tokens": 34437369.0, + "step": 28680 + }, + { + "entropy": 1.8374801024794578, + "epoch": 0.08893648976757632, + "grad_norm": 9.512642860412598, + "learning_rate": 8.483210747249186e-06, + "loss": 0.5685, + "mean_token_accuracy": 0.8316513374447823, + "num_tokens": 34449662.0, + "step": 28690 + }, + { + "entropy": 1.7934413641691207, + "epoch": 0.08896748889262601, + "grad_norm": 5.296740531921387, + "learning_rate": 8.481732655702892e-06, + "loss": 0.5281, + "mean_token_accuracy": 0.8272898524999619, + "num_tokens": 34462445.0, + "step": 28700 + }, + { + "entropy": 1.8716158524155617, + "epoch": 0.0889984880176757, + "grad_norm": 9.011677742004395, + "learning_rate": 8.4802553365034e-06, + "loss": 0.5302, + "mean_token_accuracy": 0.8340117856860161, + "num_tokens": 34474062.0, + "step": 28710 + }, + { + "entropy": 1.801230075955391, + "epoch": 0.08902948714272539, + "grad_norm": 8.55329418182373, + "learning_rate": 8.478778788978323e-06, + "loss": 0.5594, + "mean_token_accuracy": 0.8217922821640968, + "num_tokens": 34487372.0, + "step": 28720 + }, + { + "entropy": 1.8363505557179451, + "epoch": 0.08906048626777509, + "grad_norm": 9.565589904785156, + "learning_rate": 8.477303012456088e-06, + "loss": 0.5462, + "mean_token_accuracy": 0.825613497197628, + "num_tokens": 34499424.0, + "step": 28730 + }, + { + "entropy": 1.8904439568519593, + "epoch": 0.08909148539282478, + "grad_norm": 9.289710998535156, + "learning_rate": 8.47582800626594e-06, + "loss": 0.5842, + "mean_token_accuracy": 0.8117494031786918, + "num_tokens": 34510549.0, + "step": 28740 + }, + { + "entropy": 1.876707810163498, + "epoch": 0.08912248451787448, + "grad_norm": 10.488570213317871, + "learning_rate": 8.474353769737951e-06, + "loss": 0.584, + "mean_token_accuracy": 0.8244671374559402, + "num_tokens": 34522326.0, + "step": 28750 + }, + { + "entropy": 1.8384212747216224, + "epoch": 0.08915348364292418, + "grad_norm": 4.727441787719727, + "learning_rate": 8.472880302202995e-06, + "loss": 0.5507, + "mean_token_accuracy": 0.8281288787722587, + "num_tokens": 34533723.0, + "step": 28760 + }, + { + "entropy": 1.8969373285770417, + "epoch": 0.08918448276797387, + "grad_norm": 10.660612106323242, + "learning_rate": 8.471407602992768e-06, + "loss": 0.6065, + "mean_token_accuracy": 0.8140498995780945, + "num_tokens": 34545247.0, + "step": 28770 + }, + { + "entropy": 1.8668588057160378, + "epoch": 0.08921548189302357, + "grad_norm": 5.187861919403076, + "learning_rate": 8.469935671439776e-06, + "loss": 0.5975, + "mean_token_accuracy": 0.8087451472878456, + "num_tokens": 34557397.0, + "step": 28780 + }, + { + "entropy": 1.9032288029789926, + "epoch": 0.08924648101807327, + "grad_norm": 5.01014518737793, + "learning_rate": 8.468464506877338e-06, + "loss": 0.5857, + "mean_token_accuracy": 0.8231427073478699, + "num_tokens": 34568813.0, + "step": 28790 + }, + { + "entropy": 1.83003838211298, + "epoch": 0.08927748014312296, + "grad_norm": 7.490168571472168, + "learning_rate": 8.46699410863958e-06, + "loss": 0.5445, + "mean_token_accuracy": 0.8283394366502762, + "num_tokens": 34581102.0, + "step": 28800 + }, + { + "entropy": 1.8458102241158485, + "epoch": 0.08930847926817266, + "grad_norm": 11.137908935546875, + "learning_rate": 8.465524476061445e-06, + "loss": 0.5456, + "mean_token_accuracy": 0.8272756159305572, + "num_tokens": 34593458.0, + "step": 28810 + }, + { + "entropy": 1.7978875115513802, + "epoch": 0.08933947839322236, + "grad_norm": 13.252009391784668, + "learning_rate": 8.464055608478673e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.8363189071416854, + "num_tokens": 34607214.0, + "step": 28820 + }, + { + "entropy": 1.8788623362779617, + "epoch": 0.08937047751827204, + "grad_norm": 8.798412322998047, + "learning_rate": 8.46258750522782e-06, + "loss": 0.5202, + "mean_token_accuracy": 0.8359053075313568, + "num_tokens": 34618731.0, + "step": 28830 + }, + { + "entropy": 1.8706136047840118, + "epoch": 0.08940147664332174, + "grad_norm": 10.804203033447266, + "learning_rate": 8.46112016564624e-06, + "loss": 0.6323, + "mean_token_accuracy": 0.8170384958386421, + "num_tokens": 34629767.0, + "step": 28840 + }, + { + "entropy": 1.9193976387381553, + "epoch": 0.08943247576837143, + "grad_norm": 9.608490943908691, + "learning_rate": 8.459653589072098e-06, + "loss": 0.5879, + "mean_token_accuracy": 0.823236158490181, + "num_tokens": 34641783.0, + "step": 28850 + }, + { + "entropy": 1.953177237510681, + "epoch": 0.08946347489342113, + "grad_norm": 9.958290100097656, + "learning_rate": 8.458187774844355e-06, + "loss": 0.6343, + "mean_token_accuracy": 0.8059092834591866, + "num_tokens": 34652879.0, + "step": 28860 + }, + { + "entropy": 1.9278223618865014, + "epoch": 0.08949447401847083, + "grad_norm": 9.370678901672363, + "learning_rate": 8.456722722302779e-06, + "loss": 0.5779, + "mean_token_accuracy": 0.8173175677657127, + "num_tokens": 34664517.0, + "step": 28870 + }, + { + "entropy": 1.743288952112198, + "epoch": 0.08952547314352052, + "grad_norm": 4.402005672454834, + "learning_rate": 8.45525843078793e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8388036683201789, + "num_tokens": 34678318.0, + "step": 28880 + }, + { + "entropy": 1.82712120115757, + "epoch": 0.08955647226857022, + "grad_norm": 5.668634414672852, + "learning_rate": 8.453794899641178e-06, + "loss": 0.5317, + "mean_token_accuracy": 0.8243148773908615, + "num_tokens": 34690574.0, + "step": 28890 + }, + { + "entropy": 1.794793924689293, + "epoch": 0.08958747139361992, + "grad_norm": 4.694520950317383, + "learning_rate": 8.452332128204687e-06, + "loss": 0.5341, + "mean_token_accuracy": 0.8249142169952393, + "num_tokens": 34702705.0, + "step": 28900 + }, + { + "entropy": 1.9069462090730667, + "epoch": 0.08961847051866961, + "grad_norm": 9.739960670471191, + "learning_rate": 8.450870115821412e-06, + "loss": 0.6399, + "mean_token_accuracy": 0.8125949770212173, + "num_tokens": 34713359.0, + "step": 28910 + }, + { + "entropy": 1.843550091981888, + "epoch": 0.08964946964371931, + "grad_norm": 9.711644172668457, + "learning_rate": 8.449408861835107e-06, + "loss": 0.5467, + "mean_token_accuracy": 0.8326569676399231, + "num_tokens": 34724846.0, + "step": 28920 + }, + { + "entropy": 1.8499679207801818, + "epoch": 0.089680468768769, + "grad_norm": 9.517023086547852, + "learning_rate": 8.447948365590324e-06, + "loss": 0.6048, + "mean_token_accuracy": 0.8209212452173233, + "num_tokens": 34736090.0, + "step": 28930 + }, + { + "entropy": 1.8778686568140983, + "epoch": 0.0897114678938187, + "grad_norm": 7.4227423667907715, + "learning_rate": 8.446488626432398e-06, + "loss": 0.5703, + "mean_token_accuracy": 0.8157725527882576, + "num_tokens": 34748597.0, + "step": 28940 + }, + { + "entropy": 1.901959116756916, + "epoch": 0.0897424670188684, + "grad_norm": 8.503968238830566, + "learning_rate": 8.445029643707466e-06, + "loss": 0.6077, + "mean_token_accuracy": 0.8245082676410675, + "num_tokens": 34760085.0, + "step": 28950 + }, + { + "entropy": 1.9186773270368576, + "epoch": 0.08977346614391808, + "grad_norm": 9.638235092163086, + "learning_rate": 8.443571416762454e-06, + "loss": 0.5811, + "mean_token_accuracy": 0.8147833585739136, + "num_tokens": 34771714.0, + "step": 28960 + }, + { + "entropy": 1.9039877220988273, + "epoch": 0.08980446526896778, + "grad_norm": 10.257258415222168, + "learning_rate": 8.442113944945066e-06, + "loss": 0.5929, + "mean_token_accuracy": 0.8272214874625206, + "num_tokens": 34783065.0, + "step": 28970 + }, + { + "entropy": 1.8442776799201965, + "epoch": 0.08983546439401748, + "grad_norm": 8.371413230895996, + "learning_rate": 8.440657227603809e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8354337051510811, + "num_tokens": 34796313.0, + "step": 28980 + }, + { + "entropy": 1.8650359570980073, + "epoch": 0.08986646351906717, + "grad_norm": 8.93613052368164, + "learning_rate": 8.439201264087966e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8383944362401963, + "num_tokens": 34809266.0, + "step": 28990 + }, + { + "entropy": 1.898645070195198, + "epoch": 0.08989746264411687, + "grad_norm": 4.809848785400391, + "learning_rate": 8.437746053747611e-06, + "loss": 0.5454, + "mean_token_accuracy": 0.8344621181488037, + "num_tokens": 34821442.0, + "step": 29000 + }, + { + "entropy": 1.8691908940672874, + "epoch": 0.08992846176916656, + "grad_norm": 4.506728649139404, + "learning_rate": 8.436291595933597e-06, + "loss": 0.5207, + "mean_token_accuracy": 0.8261950254440308, + "num_tokens": 34834092.0, + "step": 29010 + }, + { + "entropy": 1.989156812429428, + "epoch": 0.08995946089421626, + "grad_norm": 10.102865219116211, + "learning_rate": 8.434837889997567e-06, + "loss": 0.6544, + "mean_token_accuracy": 0.8023478895425796, + "num_tokens": 34844589.0, + "step": 29020 + }, + { + "entropy": 1.8639222919940948, + "epoch": 0.08999046001926596, + "grad_norm": 8.607808113098145, + "learning_rate": 8.433384935291941e-06, + "loss": 0.5202, + "mean_token_accuracy": 0.8342802464962006, + "num_tokens": 34856930.0, + "step": 29030 + }, + { + "entropy": 1.888324649631977, + "epoch": 0.09002145914431565, + "grad_norm": 15.61713695526123, + "learning_rate": 8.43193273116992e-06, + "loss": 0.5623, + "mean_token_accuracy": 0.8299177408218383, + "num_tokens": 34868535.0, + "step": 29040 + }, + { + "entropy": 1.9326095387339592, + "epoch": 0.09005245826936535, + "grad_norm": 13.876715660095215, + "learning_rate": 8.430481276985486e-06, + "loss": 0.606, + "mean_token_accuracy": 0.8128352046012879, + "num_tokens": 34879434.0, + "step": 29050 + }, + { + "entropy": 1.8903497770428657, + "epoch": 0.09008345739441505, + "grad_norm": 10.24538803100586, + "learning_rate": 8.429030572093397e-06, + "loss": 0.556, + "mean_token_accuracy": 0.8233076304197311, + "num_tokens": 34891052.0, + "step": 29060 + }, + { + "entropy": 1.8975743114948274, + "epoch": 0.09011445651946474, + "grad_norm": 11.500408172607422, + "learning_rate": 8.427580615849188e-06, + "loss": 0.5778, + "mean_token_accuracy": 0.8229522377252578, + "num_tokens": 34903040.0, + "step": 29070 + }, + { + "entropy": 1.8994533449411393, + "epoch": 0.09014545564451443, + "grad_norm": 11.263349533081055, + "learning_rate": 8.426131407609173e-06, + "loss": 0.5816, + "mean_token_accuracy": 0.820560471713543, + "num_tokens": 34914402.0, + "step": 29080 + }, + { + "entropy": 1.8676754549145698, + "epoch": 0.09017645476956412, + "grad_norm": 9.663436889648438, + "learning_rate": 8.42468294673044e-06, + "loss": 0.5948, + "mean_token_accuracy": 0.8182600021362305, + "num_tokens": 34926419.0, + "step": 29090 + }, + { + "entropy": 1.8573566570878028, + "epoch": 0.09020745389461382, + "grad_norm": 11.30569076538086, + "learning_rate": 8.423235232570846e-06, + "loss": 0.5602, + "mean_token_accuracy": 0.8217246904969215, + "num_tokens": 34938924.0, + "step": 29100 + }, + { + "entropy": 1.9108622312545775, + "epoch": 0.09023845301966352, + "grad_norm": 5.698962211608887, + "learning_rate": 8.421788264489021e-06, + "loss": 0.5796, + "mean_token_accuracy": 0.8229934841394424, + "num_tokens": 34950788.0, + "step": 29110 + }, + { + "entropy": 1.777287058532238, + "epoch": 0.09026945214471321, + "grad_norm": 9.155726432800293, + "learning_rate": 8.420342041844372e-06, + "loss": 0.518, + "mean_token_accuracy": 0.834735095500946, + "num_tokens": 34963772.0, + "step": 29120 + }, + { + "entropy": 1.8479400515556335, + "epoch": 0.09030045126976291, + "grad_norm": 4.175968170166016, + "learning_rate": 8.418896563997072e-06, + "loss": 0.5232, + "mean_token_accuracy": 0.8213765308260917, + "num_tokens": 34976155.0, + "step": 29130 + }, + { + "entropy": 1.8892390161752701, + "epoch": 0.0903314503948126, + "grad_norm": 10.586323738098145, + "learning_rate": 8.41745183030806e-06, + "loss": 0.5572, + "mean_token_accuracy": 0.8211356267333031, + "num_tokens": 34988223.0, + "step": 29140 + }, + { + "entropy": 1.8357836604118347, + "epoch": 0.0903624495198623, + "grad_norm": 10.024797439575195, + "learning_rate": 8.416007840139042e-06, + "loss": 0.5248, + "mean_token_accuracy": 0.8338177174329757, + "num_tokens": 35001045.0, + "step": 29150 + }, + { + "entropy": 1.887669287621975, + "epoch": 0.090393448644912, + "grad_norm": 9.929082870483398, + "learning_rate": 8.4145645928525e-06, + "loss": 0.6165, + "mean_token_accuracy": 0.8130573540925979, + "num_tokens": 35012268.0, + "step": 29160 + }, + { + "entropy": 1.8210702255368232, + "epoch": 0.0904244477699617, + "grad_norm": 4.600307464599609, + "learning_rate": 8.413122087811668e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.8326424300670624, + "num_tokens": 35025165.0, + "step": 29170 + }, + { + "entropy": 1.9049372583627702, + "epoch": 0.0904554468950114, + "grad_norm": 9.302005767822266, + "learning_rate": 8.411680324380554e-06, + "loss": 0.5807, + "mean_token_accuracy": 0.8166706204414368, + "num_tokens": 35036999.0, + "step": 29180 + }, + { + "entropy": 1.8811736673116684, + "epoch": 0.09048644602006109, + "grad_norm": 11.642059326171875, + "learning_rate": 8.410239301923921e-06, + "loss": 0.5831, + "mean_token_accuracy": 0.8235431507229805, + "num_tokens": 35048718.0, + "step": 29190 + }, + { + "entropy": 1.7818408221006394, + "epoch": 0.09051744514511077, + "grad_norm": 3.9471521377563477, + "learning_rate": 8.408799019807298e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8274667426943779, + "num_tokens": 35062426.0, + "step": 29200 + }, + { + "entropy": 1.9234388038516044, + "epoch": 0.09054844427016047, + "grad_norm": 9.527081489562988, + "learning_rate": 8.407359477396974e-06, + "loss": 0.5952, + "mean_token_accuracy": 0.824344064295292, + "num_tokens": 35073342.0, + "step": 29210 + }, + { + "entropy": 1.9060495615005493, + "epoch": 0.09057944339521017, + "grad_norm": 4.219698905944824, + "learning_rate": 8.405920674059997e-06, + "loss": 0.5991, + "mean_token_accuracy": 0.8162388294935227, + "num_tokens": 35085769.0, + "step": 29220 + }, + { + "entropy": 1.9729074537754059, + "epoch": 0.09061044252025986, + "grad_norm": 12.57748794555664, + "learning_rate": 8.404482609164172e-06, + "loss": 0.678, + "mean_token_accuracy": 0.8029696837067604, + "num_tokens": 35096711.0, + "step": 29230 + }, + { + "entropy": 1.9721817374229431, + "epoch": 0.09064144164530956, + "grad_norm": 9.833843231201172, + "learning_rate": 8.40304528207806e-06, + "loss": 0.5768, + "mean_token_accuracy": 0.8235157504677773, + "num_tokens": 35107468.0, + "step": 29240 + }, + { + "entropy": 1.8956667900085449, + "epoch": 0.09067244077035926, + "grad_norm": 8.549565315246582, + "learning_rate": 8.40160869217098e-06, + "loss": 0.5324, + "mean_token_accuracy": 0.8300123199820518, + "num_tokens": 35119143.0, + "step": 29250 + }, + { + "entropy": 1.856950616836548, + "epoch": 0.09070343989540895, + "grad_norm": 8.162071228027344, + "learning_rate": 8.400172838813004e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8311971753835679, + "num_tokens": 35131680.0, + "step": 29260 + }, + { + "entropy": 1.92002714574337, + "epoch": 0.09073443902045865, + "grad_norm": 9.52449893951416, + "learning_rate": 8.398737721374958e-06, + "loss": 0.5886, + "mean_token_accuracy": 0.8099859327077865, + "num_tokens": 35143630.0, + "step": 29270 + }, + { + "entropy": 1.8672298848628999, + "epoch": 0.09076543814550835, + "grad_norm": 10.40442180633545, + "learning_rate": 8.39730333922842e-06, + "loss": 0.5327, + "mean_token_accuracy": 0.8219729751348496, + "num_tokens": 35155669.0, + "step": 29280 + }, + { + "entropy": 1.8741114243865014, + "epoch": 0.09079643727055804, + "grad_norm": 9.638392448425293, + "learning_rate": 8.39586969174572e-06, + "loss": 0.6016, + "mean_token_accuracy": 0.8221320033073425, + "num_tokens": 35168182.0, + "step": 29290 + }, + { + "entropy": 1.8563538879156112, + "epoch": 0.09082743639560774, + "grad_norm": 11.128931999206543, + "learning_rate": 8.394436778299934e-06, + "loss": 0.5658, + "mean_token_accuracy": 0.8322840392589569, + "num_tokens": 35180920.0, + "step": 29300 + }, + { + "entropy": 1.940355482697487, + "epoch": 0.09085843552065744, + "grad_norm": 12.628369331359863, + "learning_rate": 8.393004598264892e-06, + "loss": 0.6027, + "mean_token_accuracy": 0.8121664762496948, + "num_tokens": 35191979.0, + "step": 29310 + }, + { + "entropy": 1.881583635509014, + "epoch": 0.09088943464570713, + "grad_norm": 10.32827091217041, + "learning_rate": 8.391573151015169e-06, + "loss": 0.5926, + "mean_token_accuracy": 0.8216746702790261, + "num_tokens": 35204154.0, + "step": 29320 + }, + { + "entropy": 1.9963495016098023, + "epoch": 0.09092043377075681, + "grad_norm": 10.914651870727539, + "learning_rate": 8.390142435926085e-06, + "loss": 0.6708, + "mean_token_accuracy": 0.802430622279644, + "num_tokens": 35215370.0, + "step": 29330 + }, + { + "entropy": 1.807043470442295, + "epoch": 0.09095143289580651, + "grad_norm": 6.392316818237305, + "learning_rate": 8.38871245237371e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.83642118871212, + "num_tokens": 35228636.0, + "step": 29340 + }, + { + "entropy": 1.8242978394031524, + "epoch": 0.09098243202085621, + "grad_norm": 8.627922058105469, + "learning_rate": 8.387283199734848e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.833967824280262, + "num_tokens": 35241156.0, + "step": 29350 + }, + { + "entropy": 1.937102773785591, + "epoch": 0.0910134311459059, + "grad_norm": 7.6476640701293945, + "learning_rate": 8.38585467738706e-06, + "loss": 0.6429, + "mean_token_accuracy": 0.810026279091835, + "num_tokens": 35252420.0, + "step": 29360 + }, + { + "entropy": 1.8407186821103096, + "epoch": 0.0910444302709556, + "grad_norm": 9.836170196533203, + "learning_rate": 8.38442688470864e-06, + "loss": 0.5336, + "mean_token_accuracy": 0.8297666862607003, + "num_tokens": 35264790.0, + "step": 29370 + }, + { + "entropy": 1.8077852353453636, + "epoch": 0.0910754293960053, + "grad_norm": 9.396247863769531, + "learning_rate": 8.382999821078624e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.8240919351577759, + "num_tokens": 35277676.0, + "step": 29380 + }, + { + "entropy": 1.8391135558485985, + "epoch": 0.091106428521055, + "grad_norm": 4.910935401916504, + "learning_rate": 8.381573485876786e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8325628146529198, + "num_tokens": 35290859.0, + "step": 29390 + }, + { + "entropy": 1.8129886105656623, + "epoch": 0.09113742764610469, + "grad_norm": 9.58067798614502, + "learning_rate": 8.380147878483645e-06, + "loss": 0.5201, + "mean_token_accuracy": 0.826962910592556, + "num_tokens": 35303398.0, + "step": 29400 + }, + { + "entropy": 1.85167246311903, + "epoch": 0.09116842677115439, + "grad_norm": 8.074714660644531, + "learning_rate": 8.378722998280448e-06, + "loss": 0.5415, + "mean_token_accuracy": 0.8280769392848015, + "num_tokens": 35315344.0, + "step": 29410 + }, + { + "entropy": 1.992096221446991, + "epoch": 0.09119942589620408, + "grad_norm": 9.145973205566406, + "learning_rate": 8.377298844649186e-06, + "loss": 0.7051, + "mean_token_accuracy": 0.7987665429711341, + "num_tokens": 35326107.0, + "step": 29420 + }, + { + "entropy": 1.8843266785144805, + "epoch": 0.09123042502125378, + "grad_norm": 9.272721290588379, + "learning_rate": 8.375875416972584e-06, + "loss": 0.5242, + "mean_token_accuracy": 0.83641587048769, + "num_tokens": 35338238.0, + "step": 29430 + }, + { + "entropy": 1.8167478024959565, + "epoch": 0.09126142414630348, + "grad_norm": 3.9986329078674316, + "learning_rate": 8.374452714634094e-06, + "loss": 0.5826, + "mean_token_accuracy": 0.8207257181406021, + "num_tokens": 35351268.0, + "step": 29440 + }, + { + "entropy": 1.9153858974575997, + "epoch": 0.09129242327135316, + "grad_norm": 10.753203392028809, + "learning_rate": 8.373030737017907e-06, + "loss": 0.5577, + "mean_token_accuracy": 0.8246575996279717, + "num_tokens": 35362594.0, + "step": 29450 + }, + { + "entropy": 1.9501359462738037, + "epoch": 0.09132342239640286, + "grad_norm": 9.741477966308594, + "learning_rate": 8.371609483508947e-06, + "loss": 0.6487, + "mean_token_accuracy": 0.8103729501366616, + "num_tokens": 35373596.0, + "step": 29460 + }, + { + "entropy": 1.932808554172516, + "epoch": 0.09135442152145255, + "grad_norm": 11.1822509765625, + "learning_rate": 8.370188953492866e-06, + "loss": 0.6056, + "mean_token_accuracy": 0.821463891863823, + "num_tokens": 35385127.0, + "step": 29470 + }, + { + "entropy": 1.8480829164385795, + "epoch": 0.09138542064650225, + "grad_norm": 8.782395362854004, + "learning_rate": 8.368769146356043e-06, + "loss": 0.5412, + "mean_token_accuracy": 0.8359311327338219, + "num_tokens": 35397793.0, + "step": 29480 + }, + { + "entropy": 1.9216597527265549, + "epoch": 0.09141641977155195, + "grad_norm": 10.21785831451416, + "learning_rate": 8.36735006148559e-06, + "loss": 0.6182, + "mean_token_accuracy": 0.8218403205275535, + "num_tokens": 35408887.0, + "step": 29490 + }, + { + "entropy": 1.854940627515316, + "epoch": 0.09144741889660164, + "grad_norm": 9.738880157470703, + "learning_rate": 8.365931698269346e-06, + "loss": 0.6001, + "mean_token_accuracy": 0.8242891728878021, + "num_tokens": 35421648.0, + "step": 29500 + }, + { + "entropy": 1.8743022330105306, + "epoch": 0.09147841802165134, + "grad_norm": 10.232392311096191, + "learning_rate": 8.36451405609587e-06, + "loss": 0.5753, + "mean_token_accuracy": 0.8169502720236779, + "num_tokens": 35434473.0, + "step": 29510 + }, + { + "entropy": 1.8739581793546676, + "epoch": 0.09150941714670104, + "grad_norm": 9.921628952026367, + "learning_rate": 8.363097134354453e-06, + "loss": 0.5563, + "mean_token_accuracy": 0.8249258384108543, + "num_tokens": 35446211.0, + "step": 29520 + }, + { + "entropy": 1.9153602778911591, + "epoch": 0.09154041627175073, + "grad_norm": 9.796987533569336, + "learning_rate": 8.361680932435107e-06, + "loss": 0.5789, + "mean_token_accuracy": 0.8256885960698128, + "num_tokens": 35457049.0, + "step": 29530 + }, + { + "entropy": 1.8282124564051627, + "epoch": 0.09157141539680043, + "grad_norm": 8.886029243469238, + "learning_rate": 8.360265449728567e-06, + "loss": 0.5452, + "mean_token_accuracy": 0.8246500983834266, + "num_tokens": 35470118.0, + "step": 29540 + }, + { + "entropy": 1.8801322914659977, + "epoch": 0.09160241452185013, + "grad_norm": 6.403395652770996, + "learning_rate": 8.358850685626288e-06, + "loss": 0.5331, + "mean_token_accuracy": 0.8278462216258049, + "num_tokens": 35483031.0, + "step": 29550 + }, + { + "entropy": 1.8667038604617119, + "epoch": 0.09163341364689982, + "grad_norm": 8.779369354248047, + "learning_rate": 8.357436639520454e-06, + "loss": 0.6143, + "mean_token_accuracy": 0.8238617271184921, + "num_tokens": 35494985.0, + "step": 29560 + }, + { + "entropy": 1.8358441174030304, + "epoch": 0.0916644127719495, + "grad_norm": 9.281476020812988, + "learning_rate": 8.356023310803953e-06, + "loss": 0.5642, + "mean_token_accuracy": 0.8221923336386681, + "num_tokens": 35506647.0, + "step": 29570 + }, + { + "entropy": 1.8791137009859085, + "epoch": 0.0916954118969992, + "grad_norm": 11.18396282196045, + "learning_rate": 8.354610698870407e-06, + "loss": 0.6371, + "mean_token_accuracy": 0.8211662247776985, + "num_tokens": 35518289.0, + "step": 29580 + }, + { + "entropy": 1.8068061396479607, + "epoch": 0.0917264110220489, + "grad_norm": 9.689407348632812, + "learning_rate": 8.353198803114144e-06, + "loss": 0.5216, + "mean_token_accuracy": 0.819252048432827, + "num_tokens": 35531769.0, + "step": 29590 + }, + { + "entropy": 1.8276824593544005, + "epoch": 0.0917574101470986, + "grad_norm": 8.703977584838867, + "learning_rate": 8.351787622930218e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8255561321973801, + "num_tokens": 35544219.0, + "step": 29600 + }, + { + "entropy": 1.9323107048869133, + "epoch": 0.09178840927214829, + "grad_norm": 9.766465187072754, + "learning_rate": 8.350377157714388e-06, + "loss": 0.6171, + "mean_token_accuracy": 0.8179709568619729, + "num_tokens": 35555186.0, + "step": 29610 + }, + { + "entropy": 1.896198531985283, + "epoch": 0.09181940839719799, + "grad_norm": 9.552749633789062, + "learning_rate": 8.348967406863137e-06, + "loss": 0.6246, + "mean_token_accuracy": 0.8194371804594993, + "num_tokens": 35567088.0, + "step": 29620 + }, + { + "entropy": 1.9005744218826295, + "epoch": 0.09185040752224768, + "grad_norm": 8.323497772216797, + "learning_rate": 8.347558369773652e-06, + "loss": 0.5541, + "mean_token_accuracy": 0.8280677005648613, + "num_tokens": 35578712.0, + "step": 29630 + }, + { + "entropy": 1.8314684435725213, + "epoch": 0.09188140664729738, + "grad_norm": 3.150182008743286, + "learning_rate": 8.346150045843839e-06, + "loss": 0.5522, + "mean_token_accuracy": 0.8305447429418564, + "num_tokens": 35591457.0, + "step": 29640 + }, + { + "entropy": 1.8687354385852815, + "epoch": 0.09191240577234708, + "grad_norm": 9.428099632263184, + "learning_rate": 8.344742434472313e-06, + "loss": 0.5739, + "mean_token_accuracy": 0.8308294847607612, + "num_tokens": 35603419.0, + "step": 29650 + }, + { + "entropy": 1.8598203748464583, + "epoch": 0.09194340489739677, + "grad_norm": 7.902471542358398, + "learning_rate": 8.343335535058393e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8371548101305961, + "num_tokens": 35615606.0, + "step": 29660 + }, + { + "entropy": 1.955969789624214, + "epoch": 0.09197440402244647, + "grad_norm": 9.46319580078125, + "learning_rate": 8.341929347002115e-06, + "loss": 0.6819, + "mean_token_accuracy": 0.8057120770215989, + "num_tokens": 35626376.0, + "step": 29670 + }, + { + "entropy": 1.9067956805229187, + "epoch": 0.09200540314749617, + "grad_norm": 4.286515235900879, + "learning_rate": 8.340523869704218e-06, + "loss": 0.5918, + "mean_token_accuracy": 0.8169622346758842, + "num_tokens": 35637722.0, + "step": 29680 + }, + { + "entropy": 1.824674019217491, + "epoch": 0.09203640227254586, + "grad_norm": 9.380338668823242, + "learning_rate": 8.33911910256615e-06, + "loss": 0.5286, + "mean_token_accuracy": 0.836339183151722, + "num_tokens": 35650536.0, + "step": 29690 + }, + { + "entropy": 1.8453435346484184, + "epoch": 0.09206740139759555, + "grad_norm": 4.688096523284912, + "learning_rate": 8.33771504499006e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8418554350733757, + "num_tokens": 35663421.0, + "step": 29700 + }, + { + "entropy": 1.9342348664999007, + "epoch": 0.09209840052264524, + "grad_norm": 8.24661922454834, + "learning_rate": 8.336311696378805e-06, + "loss": 0.579, + "mean_token_accuracy": 0.8277911230921745, + "num_tokens": 35674421.0, + "step": 29710 + }, + { + "entropy": 1.9771510928869247, + "epoch": 0.09212939964769494, + "grad_norm": 11.507468223571777, + "learning_rate": 8.334909056135947e-06, + "loss": 0.6472, + "mean_token_accuracy": 0.8096749007701873, + "num_tokens": 35685226.0, + "step": 29720 + }, + { + "entropy": 1.8233107894659042, + "epoch": 0.09216039877274464, + "grad_norm": 11.91437816619873, + "learning_rate": 8.333507123665745e-06, + "loss": 0.5913, + "mean_token_accuracy": 0.8291966646909714, + "num_tokens": 35697511.0, + "step": 29730 + }, + { + "entropy": 1.9061884060502052, + "epoch": 0.09219139789779433, + "grad_norm": 9.895060539245605, + "learning_rate": 8.332105898373162e-06, + "loss": 0.5925, + "mean_token_accuracy": 0.8209904283285141, + "num_tokens": 35709184.0, + "step": 29740 + }, + { + "entropy": 1.8389362052083016, + "epoch": 0.09222239702284403, + "grad_norm": 9.202600479125977, + "learning_rate": 8.330705379663864e-06, + "loss": 0.538, + "mean_token_accuracy": 0.8290344223380088, + "num_tokens": 35722082.0, + "step": 29750 + }, + { + "entropy": 1.836703784763813, + "epoch": 0.09225339614789373, + "grad_norm": 9.766961097717285, + "learning_rate": 8.32930556694421e-06, + "loss": 0.6326, + "mean_token_accuracy": 0.8234434753656388, + "num_tokens": 35735502.0, + "step": 29760 + }, + { + "entropy": 1.823054054379463, + "epoch": 0.09228439527294342, + "grad_norm": 10.460724830627441, + "learning_rate": 8.327906459621262e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.8367855966091156, + "num_tokens": 35747655.0, + "step": 29770 + }, + { + "entropy": 1.8504664957523347, + "epoch": 0.09231539439799312, + "grad_norm": 10.958568572998047, + "learning_rate": 8.32650805710278e-06, + "loss": 0.5747, + "mean_token_accuracy": 0.8216020077466964, + "num_tokens": 35759453.0, + "step": 29780 + }, + { + "entropy": 1.8954664573073388, + "epoch": 0.09234639352304282, + "grad_norm": 9.923591613769531, + "learning_rate": 8.32511035879721e-06, + "loss": 0.549, + "mean_token_accuracy": 0.8215544745326042, + "num_tokens": 35770958.0, + "step": 29790 + }, + { + "entropy": 1.8316803082823754, + "epoch": 0.09237739264809251, + "grad_norm": 9.00976848602295, + "learning_rate": 8.323713364113706e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8291282281279564, + "num_tokens": 35784494.0, + "step": 29800 + }, + { + "entropy": 1.8252049744129182, + "epoch": 0.09240839177314221, + "grad_norm": 4.551512241363525, + "learning_rate": 8.322317072462106e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8328192785382271, + "num_tokens": 35797555.0, + "step": 29810 + }, + { + "entropy": 1.86438340395689, + "epoch": 0.09243939089819189, + "grad_norm": 12.345170021057129, + "learning_rate": 8.320921483252948e-06, + "loss": 0.5755, + "mean_token_accuracy": 0.8219951093196869, + "num_tokens": 35810502.0, + "step": 29820 + }, + { + "entropy": 1.8994136035442353, + "epoch": 0.09247039002324159, + "grad_norm": 11.431099891662598, + "learning_rate": 8.319526595897457e-06, + "loss": 0.591, + "mean_token_accuracy": 0.8122029930353165, + "num_tokens": 35822303.0, + "step": 29830 + }, + { + "entropy": 1.8895697325468064, + "epoch": 0.09250138914829129, + "grad_norm": 9.15605354309082, + "learning_rate": 8.318132409807547e-06, + "loss": 0.5452, + "mean_token_accuracy": 0.8253117471933364, + "num_tokens": 35834691.0, + "step": 29840 + }, + { + "entropy": 1.8310220405459403, + "epoch": 0.09253238827334098, + "grad_norm": 9.539107322692871, + "learning_rate": 8.31673892439583e-06, + "loss": 0.5282, + "mean_token_accuracy": 0.8205739751458168, + "num_tokens": 35847122.0, + "step": 29850 + }, + { + "entropy": 1.8387802302837373, + "epoch": 0.09256338739839068, + "grad_norm": 9.641124725341797, + "learning_rate": 8.315346139075596e-06, + "loss": 0.5816, + "mean_token_accuracy": 0.8287546694278717, + "num_tokens": 35860015.0, + "step": 29860 + }, + { + "entropy": 1.8979364216327668, + "epoch": 0.09259438652344038, + "grad_norm": 9.997504234313965, + "learning_rate": 8.31395405326083e-06, + "loss": 0.5852, + "mean_token_accuracy": 0.8195781350135803, + "num_tokens": 35872240.0, + "step": 29870 + }, + { + "entropy": 1.9377481788396835, + "epoch": 0.09262538564849007, + "grad_norm": 9.89439868927002, + "learning_rate": 8.3125626663662e-06, + "loss": 0.6192, + "mean_token_accuracy": 0.8212395742535591, + "num_tokens": 35883875.0, + "step": 29880 + }, + { + "entropy": 1.9368197679519654, + "epoch": 0.09265638477353977, + "grad_norm": 9.990235328674316, + "learning_rate": 8.311171977807062e-06, + "loss": 0.5592, + "mean_token_accuracy": 0.8287534207105637, + "num_tokens": 35896237.0, + "step": 29890 + }, + { + "entropy": 1.9469063565135003, + "epoch": 0.09268738389858946, + "grad_norm": 8.493316650390625, + "learning_rate": 8.309781986999454e-06, + "loss": 0.5916, + "mean_token_accuracy": 0.813089169561863, + "num_tokens": 35907924.0, + "step": 29900 + }, + { + "entropy": 1.9415881425142287, + "epoch": 0.09271838302363916, + "grad_norm": 12.5552978515625, + "learning_rate": 8.3083926933601e-06, + "loss": 0.5415, + "mean_token_accuracy": 0.8337080240249634, + "num_tokens": 35919711.0, + "step": 29910 + }, + { + "entropy": 1.9274977430701257, + "epoch": 0.09274938214868886, + "grad_norm": 3.963390588760376, + "learning_rate": 8.307004096306404e-06, + "loss": 0.5286, + "mean_token_accuracy": 0.8345837160944939, + "num_tokens": 35931724.0, + "step": 29920 + }, + { + "entropy": 1.9158756092190743, + "epoch": 0.09278038127373855, + "grad_norm": 10.906941413879395, + "learning_rate": 8.30561619525645e-06, + "loss": 0.5781, + "mean_token_accuracy": 0.8259297773241997, + "num_tokens": 35943022.0, + "step": 29930 + }, + { + "entropy": 1.8155071794986726, + "epoch": 0.09281138039878824, + "grad_norm": 8.987025260925293, + "learning_rate": 8.304228989629007e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8335763946175575, + "num_tokens": 35955373.0, + "step": 29940 + }, + { + "entropy": 1.8580231979489326, + "epoch": 0.09284237952383793, + "grad_norm": 7.32357120513916, + "learning_rate": 8.302842478843522e-06, + "loss": 0.5705, + "mean_token_accuracy": 0.8255671873688698, + "num_tokens": 35968204.0, + "step": 29950 + }, + { + "entropy": 1.9050882533192635, + "epoch": 0.09287337864888763, + "grad_norm": 8.925579071044922, + "learning_rate": 8.301456662320118e-06, + "loss": 0.6012, + "mean_token_accuracy": 0.810367950797081, + "num_tokens": 35979573.0, + "step": 29960 + }, + { + "entropy": 1.836830762028694, + "epoch": 0.09290437777393733, + "grad_norm": 3.3085451126098633, + "learning_rate": 8.300071539479595e-06, + "loss": 0.5401, + "mean_token_accuracy": 0.8347239702939987, + "num_tokens": 35992865.0, + "step": 29970 + }, + { + "entropy": 1.9335076332092285, + "epoch": 0.09293537689898702, + "grad_norm": 4.406698226928711, + "learning_rate": 8.298687109743434e-06, + "loss": 0.5598, + "mean_token_accuracy": 0.8291699022054673, + "num_tokens": 36004840.0, + "step": 29980 + }, + { + "entropy": 1.9243319526314735, + "epoch": 0.09296637602403672, + "grad_norm": 9.231583595275879, + "learning_rate": 8.297303372533783e-06, + "loss": 0.5428, + "mean_token_accuracy": 0.8422666415572166, + "num_tokens": 36016239.0, + "step": 29990 + }, + { + "entropy": 1.9396521881222726, + "epoch": 0.09299737514908642, + "grad_norm": 10.043663024902344, + "learning_rate": 8.295920327273474e-06, + "loss": 0.6083, + "mean_token_accuracy": 0.8137664362788201, + "num_tokens": 36027621.0, + "step": 30000 + }, + { + "entropy": 1.8998918011784554, + "epoch": 0.09302837427413611, + "grad_norm": 3.658256769180298, + "learning_rate": 8.294537973386005e-06, + "loss": 0.5306, + "mean_token_accuracy": 0.8297856241464615, + "num_tokens": 36039304.0, + "step": 30010 + }, + { + "entropy": 1.905530793964863, + "epoch": 0.09305937339918581, + "grad_norm": 10.163437843322754, + "learning_rate": 8.29315631029555e-06, + "loss": 0.5719, + "mean_token_accuracy": 0.824944506585598, + "num_tokens": 36050882.0, + "step": 30020 + }, + { + "entropy": 1.7664498060941696, + "epoch": 0.0930903725242355, + "grad_norm": 11.35702896118164, + "learning_rate": 8.291775337426954e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8312182664871216, + "num_tokens": 36065249.0, + "step": 30030 + }, + { + "entropy": 1.8767417997121811, + "epoch": 0.0931213716492852, + "grad_norm": 8.986226081848145, + "learning_rate": 8.290395054205727e-06, + "loss": 0.5498, + "mean_token_accuracy": 0.8315106689929962, + "num_tokens": 36078427.0, + "step": 30040 + }, + { + "entropy": 1.9429257303476333, + "epoch": 0.0931523707743349, + "grad_norm": 5.139298915863037, + "learning_rate": 8.289015460058055e-06, + "loss": 0.6161, + "mean_token_accuracy": 0.8064095541834831, + "num_tokens": 36090554.0, + "step": 30050 + }, + { + "entropy": 1.8949994623661042, + "epoch": 0.0931833698993846, + "grad_norm": 3.8736157417297363, + "learning_rate": 8.28763655441079e-06, + "loss": 0.5681, + "mean_token_accuracy": 0.8159836381673813, + "num_tokens": 36103427.0, + "step": 30060 + }, + { + "entropy": 1.9144406855106353, + "epoch": 0.09321436902443428, + "grad_norm": 9.992815971374512, + "learning_rate": 8.286258336691447e-06, + "loss": 0.5257, + "mean_token_accuracy": 0.8263021633028984, + "num_tokens": 36116559.0, + "step": 30070 + }, + { + "entropy": 1.8954633638262748, + "epoch": 0.09324536814948398, + "grad_norm": 10.11132526397705, + "learning_rate": 8.284880806328216e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8312999933958054, + "num_tokens": 36129181.0, + "step": 30080 + }, + { + "entropy": 1.9233060747385025, + "epoch": 0.09327636727453367, + "grad_norm": 5.417168617248535, + "learning_rate": 8.283503962749944e-06, + "loss": 0.5479, + "mean_token_accuracy": 0.8156366124749184, + "num_tokens": 36141429.0, + "step": 30090 + }, + { + "entropy": 1.9522355869412422, + "epoch": 0.09330736639958337, + "grad_norm": 10.192994117736816, + "learning_rate": 8.282127805386145e-06, + "loss": 0.5551, + "mean_token_accuracy": 0.8214729785919189, + "num_tokens": 36153159.0, + "step": 30100 + }, + { + "entropy": 1.9816941738128662, + "epoch": 0.09333836552463307, + "grad_norm": 10.674017906188965, + "learning_rate": 8.280752333666999e-06, + "loss": 0.6031, + "mean_token_accuracy": 0.8102478966116905, + "num_tokens": 36164765.0, + "step": 30110 + }, + { + "entropy": 1.9644594475626946, + "epoch": 0.09336936464968276, + "grad_norm": 8.998485565185547, + "learning_rate": 8.279377547023342e-06, + "loss": 0.5509, + "mean_token_accuracy": 0.8299065142869949, + "num_tokens": 36176082.0, + "step": 30120 + }, + { + "entropy": 1.888843522965908, + "epoch": 0.09340036377473246, + "grad_norm": 4.065151691436768, + "learning_rate": 8.278003444886679e-06, + "loss": 0.5253, + "mean_token_accuracy": 0.8304722428321838, + "num_tokens": 36189223.0, + "step": 30130 + }, + { + "entropy": 1.8697800204157828, + "epoch": 0.09343136289978216, + "grad_norm": 9.428620338439941, + "learning_rate": 8.276630026689168e-06, + "loss": 0.5502, + "mean_token_accuracy": 0.8309311479330063, + "num_tokens": 36202150.0, + "step": 30140 + }, + { + "entropy": 1.933559250831604, + "epoch": 0.09346236202483185, + "grad_norm": 9.53989028930664, + "learning_rate": 8.275257291863631e-06, + "loss": 0.5439, + "mean_token_accuracy": 0.8235199645161628, + "num_tokens": 36213528.0, + "step": 30150 + }, + { + "entropy": 1.9191157951951028, + "epoch": 0.09349336114988155, + "grad_norm": 11.843420028686523, + "learning_rate": 8.273885239843545e-06, + "loss": 0.5874, + "mean_token_accuracy": 0.8172587484121323, + "num_tokens": 36225594.0, + "step": 30160 + }, + { + "entropy": 1.9522407323122024, + "epoch": 0.09352436027493125, + "grad_norm": 10.857767105102539, + "learning_rate": 8.272513870063048e-06, + "loss": 0.6238, + "mean_token_accuracy": 0.8080313906073571, + "num_tokens": 36236730.0, + "step": 30170 + }, + { + "entropy": 1.879507339000702, + "epoch": 0.09355535939998094, + "grad_norm": 4.612288951873779, + "learning_rate": 8.271143181956931e-06, + "loss": 0.561, + "mean_token_accuracy": 0.8234292283654213, + "num_tokens": 36249271.0, + "step": 30180 + }, + { + "entropy": 1.904217940568924, + "epoch": 0.09358635852503062, + "grad_norm": 9.96085262298584, + "learning_rate": 8.269773174960643e-06, + "loss": 0.6201, + "mean_token_accuracy": 0.816309979557991, + "num_tokens": 36261216.0, + "step": 30190 + }, + { + "entropy": 1.9194680109620095, + "epoch": 0.09361735765008032, + "grad_norm": 4.954514503479004, + "learning_rate": 8.268403848510283e-06, + "loss": 0.5507, + "mean_token_accuracy": 0.8259898975491524, + "num_tokens": 36272893.0, + "step": 30200 + }, + { + "entropy": 1.9340100079774856, + "epoch": 0.09364835677513002, + "grad_norm": 11.47993278503418, + "learning_rate": 8.267035202042611e-06, + "loss": 0.6081, + "mean_token_accuracy": 0.813405393064022, + "num_tokens": 36284689.0, + "step": 30210 + }, + { + "entropy": 1.904521045088768, + "epoch": 0.09367935590017971, + "grad_norm": 9.563363075256348, + "learning_rate": 8.265667234995031e-06, + "loss": 0.595, + "mean_token_accuracy": 0.8256731614470482, + "num_tokens": 36295838.0, + "step": 30220 + }, + { + "entropy": 1.8696922466158867, + "epoch": 0.09371035502522941, + "grad_norm": 11.36976432800293, + "learning_rate": 8.264299946805606e-06, + "loss": 0.5404, + "mean_token_accuracy": 0.8290983706712722, + "num_tokens": 36307929.0, + "step": 30230 + }, + { + "entropy": 1.9623483926057816, + "epoch": 0.09374135415027911, + "grad_norm": 9.148504257202148, + "learning_rate": 8.26293333691304e-06, + "loss": 0.6233, + "mean_token_accuracy": 0.8086713761091232, + "num_tokens": 36319244.0, + "step": 30240 + }, + { + "entropy": 1.9141168981790542, + "epoch": 0.0937723532753288, + "grad_norm": 8.628009796142578, + "learning_rate": 8.261567404756697e-06, + "loss": 0.5855, + "mean_token_accuracy": 0.8221844986081124, + "num_tokens": 36330692.0, + "step": 30250 + }, + { + "entropy": 1.898746033012867, + "epoch": 0.0938033524003785, + "grad_norm": 10.499122619628906, + "learning_rate": 8.260202149776582e-06, + "loss": 0.5483, + "mean_token_accuracy": 0.8232525825500489, + "num_tokens": 36342656.0, + "step": 30260 + }, + { + "entropy": 1.8459609940648078, + "epoch": 0.0938343515254282, + "grad_norm": 4.076352596282959, + "learning_rate": 8.258837571413353e-06, + "loss": 0.5287, + "mean_token_accuracy": 0.8242526456713677, + "num_tokens": 36355602.0, + "step": 30270 + }, + { + "entropy": 1.870631681382656, + "epoch": 0.0938653506504779, + "grad_norm": 9.26065731048584, + "learning_rate": 8.25747366910831e-06, + "loss": 0.5709, + "mean_token_accuracy": 0.8275643572211265, + "num_tokens": 36368347.0, + "step": 30280 + }, + { + "entropy": 1.9731937110424043, + "epoch": 0.09389634977552759, + "grad_norm": 10.137398719787598, + "learning_rate": 8.256110442303401e-06, + "loss": 0.6493, + "mean_token_accuracy": 0.8186086341738701, + "num_tokens": 36378870.0, + "step": 30290 + }, + { + "entropy": 1.9244423538446427, + "epoch": 0.09392734890057729, + "grad_norm": 8.93706226348877, + "learning_rate": 8.254747890441217e-06, + "loss": 0.5874, + "mean_token_accuracy": 0.8153250128030777, + "num_tokens": 36389925.0, + "step": 30300 + }, + { + "entropy": 1.956382469832897, + "epoch": 0.09395834802562697, + "grad_norm": 8.536420822143555, + "learning_rate": 8.253386012964996e-06, + "loss": 0.6129, + "mean_token_accuracy": 0.8204043418169021, + "num_tokens": 36401021.0, + "step": 30310 + }, + { + "entropy": 1.947486911714077, + "epoch": 0.09398934715067667, + "grad_norm": 9.407525062561035, + "learning_rate": 8.252024809318618e-06, + "loss": 0.6199, + "mean_token_accuracy": 0.8143399626016616, + "num_tokens": 36412021.0, + "step": 30320 + }, + { + "entropy": 1.9774963051080703, + "epoch": 0.09402034627572636, + "grad_norm": 9.66044807434082, + "learning_rate": 8.250664278946598e-06, + "loss": 0.6574, + "mean_token_accuracy": 0.8052627012133599, + "num_tokens": 36422783.0, + "step": 30330 + }, + { + "entropy": 1.9370567843317985, + "epoch": 0.09405134540077606, + "grad_norm": 10.304049491882324, + "learning_rate": 8.249304421294103e-06, + "loss": 0.617, + "mean_token_accuracy": 0.8105893135070801, + "num_tokens": 36434720.0, + "step": 30340 + }, + { + "entropy": 1.8703087165951728, + "epoch": 0.09408234452582576, + "grad_norm": 9.763945579528809, + "learning_rate": 8.247945235806933e-06, + "loss": 0.5499, + "mean_token_accuracy": 0.8189043670892715, + "num_tokens": 36447678.0, + "step": 30350 + }, + { + "entropy": 1.9350792229175569, + "epoch": 0.09411334365087545, + "grad_norm": 8.846423149108887, + "learning_rate": 8.246586721931527e-06, + "loss": 0.6098, + "mean_token_accuracy": 0.8202366888523102, + "num_tokens": 36459700.0, + "step": 30360 + }, + { + "entropy": 1.9018134266138076, + "epoch": 0.09414434277592515, + "grad_norm": 8.516942977905273, + "learning_rate": 8.245228879114964e-06, + "loss": 0.5191, + "mean_token_accuracy": 0.8454120337963105, + "num_tokens": 36471172.0, + "step": 30370 + }, + { + "entropy": 1.877271145582199, + "epoch": 0.09417534190097485, + "grad_norm": 9.023475646972656, + "learning_rate": 8.24387170680496e-06, + "loss": 0.5145, + "mean_token_accuracy": 0.8345570683479309, + "num_tokens": 36483107.0, + "step": 30380 + }, + { + "entropy": 1.8964007556438447, + "epoch": 0.09420634102602454, + "grad_norm": 8.33362102508545, + "learning_rate": 8.242515204449868e-06, + "loss": 0.5618, + "mean_token_accuracy": 0.8253947287797928, + "num_tokens": 36494970.0, + "step": 30390 + }, + { + "entropy": 2.000899037718773, + "epoch": 0.09423734015107424, + "grad_norm": 8.705028533935547, + "learning_rate": 8.241159371498669e-06, + "loss": 0.669, + "mean_token_accuracy": 0.8023026213049889, + "num_tokens": 36505728.0, + "step": 30400 + }, + { + "entropy": 1.9049580052495003, + "epoch": 0.09426833927612394, + "grad_norm": 9.36442756652832, + "learning_rate": 8.23980420740099e-06, + "loss": 0.5302, + "mean_token_accuracy": 0.8285997763276101, + "num_tokens": 36517575.0, + "step": 30410 + }, + { + "entropy": 1.9479586511850357, + "epoch": 0.09429933840117363, + "grad_norm": 4.493463039398193, + "learning_rate": 8.238449711607085e-06, + "loss": 0.6312, + "mean_token_accuracy": 0.8059987321496009, + "num_tokens": 36529609.0, + "step": 30420 + }, + { + "entropy": 1.8796984627842903, + "epoch": 0.09433033752622333, + "grad_norm": 5.25566291809082, + "learning_rate": 8.237095883567837e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.8322527810931206, + "num_tokens": 36541818.0, + "step": 30430 + }, + { + "entropy": 1.8748697608709335, + "epoch": 0.09436133665127301, + "grad_norm": 7.110718250274658, + "learning_rate": 8.235742722734768e-06, + "loss": 0.6065, + "mean_token_accuracy": 0.820853716135025, + "num_tokens": 36554237.0, + "step": 30440 + }, + { + "entropy": 1.991670235991478, + "epoch": 0.09439233577632271, + "grad_norm": 9.52397632598877, + "learning_rate": 8.234390228560024e-06, + "loss": 0.6539, + "mean_token_accuracy": 0.8077353879809379, + "num_tokens": 36565127.0, + "step": 30450 + }, + { + "entropy": 1.8927300944924355, + "epoch": 0.0944233349013724, + "grad_norm": 10.027509689331055, + "learning_rate": 8.233038400496384e-06, + "loss": 0.5793, + "mean_token_accuracy": 0.8261799916625023, + "num_tokens": 36576878.0, + "step": 30460 + }, + { + "entropy": 1.9108740240335464, + "epoch": 0.0944543340264221, + "grad_norm": 4.891687870025635, + "learning_rate": 8.231687237997258e-06, + "loss": 0.5431, + "mean_token_accuracy": 0.8279662787914276, + "num_tokens": 36588895.0, + "step": 30470 + }, + { + "entropy": 1.8933662503957749, + "epoch": 0.0944853331514718, + "grad_norm": 9.058094024658203, + "learning_rate": 8.230336740516675e-06, + "loss": 0.5779, + "mean_token_accuracy": 0.8237976714968681, + "num_tokens": 36600847.0, + "step": 30480 + }, + { + "entropy": 1.9511252835392952, + "epoch": 0.0945163322765215, + "grad_norm": 3.954777479171753, + "learning_rate": 8.228986907509301e-06, + "loss": 0.598, + "mean_token_accuracy": 0.8181298822164536, + "num_tokens": 36612439.0, + "step": 30490 + }, + { + "entropy": 1.9695148766040802, + "epoch": 0.09454733140157119, + "grad_norm": 9.158353805541992, + "learning_rate": 8.227637738430418e-06, + "loss": 0.6427, + "mean_token_accuracy": 0.812971468269825, + "num_tokens": 36623202.0, + "step": 30500 + }, + { + "entropy": 1.9591504886746407, + "epoch": 0.09457833052662089, + "grad_norm": 9.691153526306152, + "learning_rate": 8.226289232735947e-06, + "loss": 0.5988, + "mean_token_accuracy": 0.8101043596863746, + "num_tokens": 36635035.0, + "step": 30510 + }, + { + "entropy": 1.8547694399952888, + "epoch": 0.09460932965167058, + "grad_norm": 7.980339527130127, + "learning_rate": 8.224941389882417e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8369179427623749, + "num_tokens": 36647914.0, + "step": 30520 + }, + { + "entropy": 1.7833900511264802, + "epoch": 0.09464032877672028, + "grad_norm": 9.225784301757812, + "learning_rate": 8.223594209326989e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.8371155127882958, + "num_tokens": 36660801.0, + "step": 30530 + }, + { + "entropy": 1.966453444957733, + "epoch": 0.09467132790176998, + "grad_norm": 9.450788497924805, + "learning_rate": 8.222247690527445e-06, + "loss": 0.6112, + "mean_token_accuracy": 0.8205273166298866, + "num_tokens": 36671949.0, + "step": 30540 + }, + { + "entropy": 1.9008652031421662, + "epoch": 0.09470232702681967, + "grad_norm": 5.126152992248535, + "learning_rate": 8.220901832942189e-06, + "loss": 0.5533, + "mean_token_accuracy": 0.8248556137084961, + "num_tokens": 36683882.0, + "step": 30550 + }, + { + "entropy": 1.8853500545024873, + "epoch": 0.09473332615186936, + "grad_norm": 9.585368156433105, + "learning_rate": 8.219556636030243e-06, + "loss": 0.5535, + "mean_token_accuracy": 0.8217202216386795, + "num_tokens": 36695769.0, + "step": 30560 + }, + { + "entropy": 1.9309042051434517, + "epoch": 0.09476432527691905, + "grad_norm": 14.714326858520508, + "learning_rate": 8.21821209925125e-06, + "loss": 0.603, + "mean_token_accuracy": 0.8186698570847512, + "num_tokens": 36707233.0, + "step": 30570 + }, + { + "entropy": 1.7473974063992501, + "epoch": 0.09479532440196875, + "grad_norm": 10.051163673400879, + "learning_rate": 8.21686822206547e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8436778649687767, + "num_tokens": 36721694.0, + "step": 30580 + }, + { + "entropy": 1.8862526759505271, + "epoch": 0.09482632352701845, + "grad_norm": 11.326528549194336, + "learning_rate": 8.215525003933785e-06, + "loss": 0.5516, + "mean_token_accuracy": 0.8218036189675331, + "num_tokens": 36734217.0, + "step": 30590 + }, + { + "entropy": 1.9055613458156586, + "epoch": 0.09485732265206814, + "grad_norm": 9.264266014099121, + "learning_rate": 8.214182444317686e-06, + "loss": 0.5895, + "mean_token_accuracy": 0.821314187347889, + "num_tokens": 36746342.0, + "step": 30600 + }, + { + "entropy": 1.8917794667184353, + "epoch": 0.09488832177711784, + "grad_norm": 3.4011175632476807, + "learning_rate": 8.21284054267929e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.8364427119493485, + "num_tokens": 36758768.0, + "step": 30610 + }, + { + "entropy": 1.9201584607362747, + "epoch": 0.09491932090216754, + "grad_norm": 9.317156791687012, + "learning_rate": 8.211499298481317e-06, + "loss": 0.5849, + "mean_token_accuracy": 0.8274592280387878, + "num_tokens": 36770047.0, + "step": 30620 + }, + { + "entropy": 1.872726395726204, + "epoch": 0.09495032002721723, + "grad_norm": 4.236084938049316, + "learning_rate": 8.210158711187111e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8388981744647026, + "num_tokens": 36782594.0, + "step": 30630 + }, + { + "entropy": 1.9075644299387933, + "epoch": 0.09498131915226693, + "grad_norm": 8.899432182312012, + "learning_rate": 8.208818780260624e-06, + "loss": 0.5833, + "mean_token_accuracy": 0.8260029882192612, + "num_tokens": 36793757.0, + "step": 30640 + }, + { + "entropy": 1.878479714691639, + "epoch": 0.09501231827731663, + "grad_norm": 10.720561027526855, + "learning_rate": 8.207479505166421e-06, + "loss": 0.5665, + "mean_token_accuracy": 0.822782176733017, + "num_tokens": 36806341.0, + "step": 30650 + }, + { + "entropy": 1.7969744451344014, + "epoch": 0.09504331740236632, + "grad_norm": 1.9851399660110474, + "learning_rate": 8.206140885369683e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8433524951338768, + "num_tokens": 36819861.0, + "step": 30660 + }, + { + "entropy": 1.973240813612938, + "epoch": 0.09507431652741602, + "grad_norm": 10.891983985900879, + "learning_rate": 8.20480292033619e-06, + "loss": 0.6501, + "mean_token_accuracy": 0.8093028724193573, + "num_tokens": 36830803.0, + "step": 30670 + }, + { + "entropy": 1.9500004380941391, + "epoch": 0.0951053156524657, + "grad_norm": 8.56579303741455, + "learning_rate": 8.203465609532345e-06, + "loss": 0.641, + "mean_token_accuracy": 0.817388865351677, + "num_tokens": 36842015.0, + "step": 30680 + }, + { + "entropy": 1.945586933195591, + "epoch": 0.0951363147775154, + "grad_norm": 10.45765495300293, + "learning_rate": 8.20212895242515e-06, + "loss": 0.6136, + "mean_token_accuracy": 0.804017736017704, + "num_tokens": 36853267.0, + "step": 30690 + }, + { + "entropy": 1.921870057284832, + "epoch": 0.0951673139025651, + "grad_norm": 9.248764038085938, + "learning_rate": 8.20079294848222e-06, + "loss": 0.5519, + "mean_token_accuracy": 0.8254543885588645, + "num_tokens": 36864795.0, + "step": 30700 + }, + { + "entropy": 1.945303277671337, + "epoch": 0.09519831302761479, + "grad_norm": 8.114592552185059, + "learning_rate": 8.199457597171774e-06, + "loss": 0.5779, + "mean_token_accuracy": 0.825435708463192, + "num_tokens": 36875864.0, + "step": 30710 + }, + { + "entropy": 1.9745984852313996, + "epoch": 0.09522931215266449, + "grad_norm": 8.283961296081543, + "learning_rate": 8.198122897962637e-06, + "loss": 0.6276, + "mean_token_accuracy": 0.813007952272892, + "num_tokens": 36887565.0, + "step": 30720 + }, + { + "entropy": 1.971392062306404, + "epoch": 0.09526031127771419, + "grad_norm": 9.020630836486816, + "learning_rate": 8.19678885032424e-06, + "loss": 0.6255, + "mean_token_accuracy": 0.8155330777168274, + "num_tokens": 36899194.0, + "step": 30730 + }, + { + "entropy": 1.9447349101305007, + "epoch": 0.09529131040276388, + "grad_norm": 9.019769668579102, + "learning_rate": 8.195455453726619e-06, + "loss": 0.605, + "mean_token_accuracy": 0.8189174398779869, + "num_tokens": 36910579.0, + "step": 30740 + }, + { + "entropy": 1.9661620736122132, + "epoch": 0.09532230952781358, + "grad_norm": 5.993375778198242, + "learning_rate": 8.194122707640413e-06, + "loss": 0.6069, + "mean_token_accuracy": 0.8191442266106606, + "num_tokens": 36921589.0, + "step": 30750 + }, + { + "entropy": 1.8841305732727052, + "epoch": 0.09535330865286328, + "grad_norm": 3.033937692642212, + "learning_rate": 8.19279061153686e-06, + "loss": 0.5971, + "mean_token_accuracy": 0.8168802231550216, + "num_tokens": 36934205.0, + "step": 30760 + }, + { + "entropy": 1.8415063828229905, + "epoch": 0.09538430777791297, + "grad_norm": 4.099487781524658, + "learning_rate": 8.191459164887803e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.830090343952179, + "num_tokens": 36946800.0, + "step": 30770 + }, + { + "entropy": 1.9041469410061835, + "epoch": 0.09541530690296267, + "grad_norm": 9.430980682373047, + "learning_rate": 8.190128367165687e-06, + "loss": 0.6402, + "mean_token_accuracy": 0.8095077216625214, + "num_tokens": 36958615.0, + "step": 30780 + }, + { + "entropy": 1.8373140662908554, + "epoch": 0.09544630602801236, + "grad_norm": 9.46648120880127, + "learning_rate": 8.188798217843552e-06, + "loss": 0.5476, + "mean_token_accuracy": 0.8348923206329346, + "num_tokens": 36971891.0, + "step": 30790 + }, + { + "entropy": 1.8742023393511773, + "epoch": 0.09547730515306206, + "grad_norm": 3.767117738723755, + "learning_rate": 8.187468716395042e-06, + "loss": 0.522, + "mean_token_accuracy": 0.8293885499238968, + "num_tokens": 36984478.0, + "step": 30800 + }, + { + "entropy": 1.8809196785092355, + "epoch": 0.09550830427811174, + "grad_norm": 4.8539276123046875, + "learning_rate": 8.186139862294395e-06, + "loss": 0.576, + "mean_token_accuracy": 0.818726259469986, + "num_tokens": 36996951.0, + "step": 30810 + }, + { + "entropy": 1.835391464829445, + "epoch": 0.09553930340316144, + "grad_norm": 9.504169464111328, + "learning_rate": 8.184811655016448e-06, + "loss": 0.52, + "mean_token_accuracy": 0.8270650163292885, + "num_tokens": 37010328.0, + "step": 30820 + }, + { + "entropy": 1.9224170967936516, + "epoch": 0.09557030252821114, + "grad_norm": 8.346292495727539, + "learning_rate": 8.183484094036632e-06, + "loss": 0.5438, + "mean_token_accuracy": 0.8288520753383637, + "num_tokens": 37021973.0, + "step": 30830 + }, + { + "entropy": 1.9015969276428222, + "epoch": 0.09560130165326083, + "grad_norm": 11.159916877746582, + "learning_rate": 8.182157178830978e-06, + "loss": 0.5541, + "mean_token_accuracy": 0.8173329204320907, + "num_tokens": 37034374.0, + "step": 30840 + }, + { + "entropy": 1.914992319047451, + "epoch": 0.09563230077831053, + "grad_norm": 11.362371444702148, + "learning_rate": 8.180830908876107e-06, + "loss": 0.598, + "mean_token_accuracy": 0.8112172558903694, + "num_tokens": 37046586.0, + "step": 30850 + }, + { + "entropy": 1.869801890850067, + "epoch": 0.09566329990336023, + "grad_norm": 9.398642539978027, + "learning_rate": 8.179505283649239e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8326647847890853, + "num_tokens": 37059363.0, + "step": 30860 + }, + { + "entropy": 1.8593883782625198, + "epoch": 0.09569429902840992, + "grad_norm": 8.429418563842773, + "learning_rate": 8.178180302628178e-06, + "loss": 0.5187, + "mean_token_accuracy": 0.833593225479126, + "num_tokens": 37071727.0, + "step": 30870 + }, + { + "entropy": 1.8928765952587128, + "epoch": 0.09572529815345962, + "grad_norm": 8.581494331359863, + "learning_rate": 8.176855965291328e-06, + "loss": 0.499, + "mean_token_accuracy": 0.836057162284851, + "num_tokens": 37083853.0, + "step": 30880 + }, + { + "entropy": 1.854266819357872, + "epoch": 0.09575629727850932, + "grad_norm": 4.337390422821045, + "learning_rate": 8.175532271117681e-06, + "loss": 0.5466, + "mean_token_accuracy": 0.8306478515267373, + "num_tokens": 37096495.0, + "step": 30890 + }, + { + "entropy": 1.7374467805027962, + "epoch": 0.09578729640355901, + "grad_norm": 8.350436210632324, + "learning_rate": 8.17420921958682e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.843256051838398, + "num_tokens": 37110413.0, + "step": 30900 + }, + { + "entropy": 1.956564149260521, + "epoch": 0.09581829552860871, + "grad_norm": 9.846826553344727, + "learning_rate": 8.172886810178917e-06, + "loss": 0.6155, + "mean_token_accuracy": 0.8104242667555809, + "num_tokens": 37121734.0, + "step": 30910 + }, + { + "entropy": 1.8714107498526573, + "epoch": 0.0958492946536584, + "grad_norm": 9.786186218261719, + "learning_rate": 8.171565042374731e-06, + "loss": 0.5514, + "mean_token_accuracy": 0.8231972604990005, + "num_tokens": 37133847.0, + "step": 30920 + }, + { + "entropy": 1.92191222012043, + "epoch": 0.09588029377870809, + "grad_norm": 9.510786056518555, + "learning_rate": 8.17024391565561e-06, + "loss": 0.5702, + "mean_token_accuracy": 0.825206808745861, + "num_tokens": 37145862.0, + "step": 30930 + }, + { + "entropy": 1.9616800487041473, + "epoch": 0.09591129290375779, + "grad_norm": 10.04150676727295, + "learning_rate": 8.168923429503489e-06, + "loss": 0.5974, + "mean_token_accuracy": 0.8210392817854881, + "num_tokens": 37156951.0, + "step": 30940 + }, + { + "entropy": 1.920756246894598, + "epoch": 0.09594229202880748, + "grad_norm": 3.579078435897827, + "learning_rate": 8.167603583400891e-06, + "loss": 0.5547, + "mean_token_accuracy": 0.8260960400104522, + "num_tokens": 37169346.0, + "step": 30950 + }, + { + "entropy": 1.9267018765211106, + "epoch": 0.09597329115385718, + "grad_norm": 9.2460355758667, + "learning_rate": 8.166284376830917e-06, + "loss": 0.5963, + "mean_token_accuracy": 0.8267333999276161, + "num_tokens": 37181057.0, + "step": 30960 + }, + { + "entropy": 1.9822622686624527, + "epoch": 0.09600429027890688, + "grad_norm": 12.592954635620117, + "learning_rate": 8.164965809277262e-06, + "loss": 0.6087, + "mean_token_accuracy": 0.8141214087605476, + "num_tokens": 37192122.0, + "step": 30970 + }, + { + "entropy": 1.9343804091215133, + "epoch": 0.09603528940395657, + "grad_norm": 3.9561359882354736, + "learning_rate": 8.163647880224195e-06, + "loss": 0.5621, + "mean_token_accuracy": 0.8185720920562745, + "num_tokens": 37203777.0, + "step": 30980 + }, + { + "entropy": 1.9694799184799194, + "epoch": 0.09606628852900627, + "grad_norm": 9.182045936584473, + "learning_rate": 8.162330589156574e-06, + "loss": 0.5635, + "mean_token_accuracy": 0.8282784789800643, + "num_tokens": 37215743.0, + "step": 30990 + }, + { + "entropy": 1.8368911847472191, + "epoch": 0.09609728765405597, + "grad_norm": 4.183026313781738, + "learning_rate": 8.161013935559836e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8347128361463547, + "num_tokens": 37229304.0, + "step": 31000 + }, + { + "entropy": 1.8615291342139244, + "epoch": 0.09612828677910566, + "grad_norm": 10.485788345336914, + "learning_rate": 8.15969791892e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8415818125009537, + "num_tokens": 37241842.0, + "step": 31010 + }, + { + "entropy": 1.8837064534425736, + "epoch": 0.09615928590415536, + "grad_norm": 9.813204765319824, + "learning_rate": 8.158382538723663e-06, + "loss": 0.5599, + "mean_token_accuracy": 0.8298415571451188, + "num_tokens": 37253894.0, + "step": 31020 + }, + { + "entropy": 1.9101071432232857, + "epoch": 0.09619028502920506, + "grad_norm": 9.026832580566406, + "learning_rate": 8.157067794458002e-06, + "loss": 0.5352, + "mean_token_accuracy": 0.8250282123684883, + "num_tokens": 37266281.0, + "step": 31030 + }, + { + "entropy": 1.9916753947734833, + "epoch": 0.09622128415425475, + "grad_norm": 8.812485694885254, + "learning_rate": 8.155753685610777e-06, + "loss": 0.6106, + "mean_token_accuracy": 0.8189787149429322, + "num_tokens": 37277277.0, + "step": 31040 + }, + { + "entropy": 1.8934721648693085, + "epoch": 0.09625228327930443, + "grad_norm": 4.702004909515381, + "learning_rate": 8.154440211670315e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.8383319437503814, + "num_tokens": 37288790.0, + "step": 31050 + }, + { + "entropy": 1.8943874284625053, + "epoch": 0.09628328240435413, + "grad_norm": 5.008129119873047, + "learning_rate": 8.153127372125532e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.8305976986885071, + "num_tokens": 37301541.0, + "step": 31060 + }, + { + "entropy": 1.9754459619522096, + "epoch": 0.09631428152940383, + "grad_norm": 8.563139915466309, + "learning_rate": 8.151815166465911e-06, + "loss": 0.6077, + "mean_token_accuracy": 0.82650166451931, + "num_tokens": 37313145.0, + "step": 31070 + }, + { + "entropy": 1.9236046463251113, + "epoch": 0.09634528065445352, + "grad_norm": 9.933487892150879, + "learning_rate": 8.150503594181513e-06, + "loss": 0.5484, + "mean_token_accuracy": 0.8255937352776528, + "num_tokens": 37325171.0, + "step": 31080 + }, + { + "entropy": 1.8460835695266724, + "epoch": 0.09637627977950322, + "grad_norm": 4.579317569732666, + "learning_rate": 8.149192654762971e-06, + "loss": 0.5314, + "mean_token_accuracy": 0.828292778134346, + "num_tokens": 37338677.0, + "step": 31090 + }, + { + "entropy": 1.9498460546135903, + "epoch": 0.09640727890455292, + "grad_norm": 10.442451477050781, + "learning_rate": 8.147882347701493e-06, + "loss": 0.6204, + "mean_token_accuracy": 0.818555298447609, + "num_tokens": 37350287.0, + "step": 31100 + }, + { + "entropy": 1.890355022251606, + "epoch": 0.09643827802960261, + "grad_norm": 9.938260078430176, + "learning_rate": 8.146572672488863e-06, + "loss": 0.535, + "mean_token_accuracy": 0.828145657479763, + "num_tokens": 37361789.0, + "step": 31110 + }, + { + "entropy": 1.9762079477310182, + "epoch": 0.09646927715465231, + "grad_norm": 10.554993629455566, + "learning_rate": 8.145263628617433e-06, + "loss": 0.6357, + "mean_token_accuracy": 0.8193916127085685, + "num_tokens": 37372191.0, + "step": 31120 + }, + { + "entropy": 1.93240787088871, + "epoch": 0.09650027627970201, + "grad_norm": 8.852706909179688, + "learning_rate": 8.143955215580123e-06, + "loss": 0.6071, + "mean_token_accuracy": 0.8182915225625038, + "num_tokens": 37384812.0, + "step": 31130 + }, + { + "entropy": 2.0304384171962737, + "epoch": 0.0965312754047517, + "grad_norm": 10.083441734313965, + "learning_rate": 8.142647432870427e-06, + "loss": 0.6445, + "mean_token_accuracy": 0.8097123354673386, + "num_tokens": 37395995.0, + "step": 31140 + }, + { + "entropy": 1.9195516496896743, + "epoch": 0.0965622745298014, + "grad_norm": 8.50387954711914, + "learning_rate": 8.141340279982408e-06, + "loss": 0.5442, + "mean_token_accuracy": 0.8346219673752785, + "num_tokens": 37407286.0, + "step": 31150 + }, + { + "entropy": 1.9149638175964356, + "epoch": 0.0965932736548511, + "grad_norm": 8.920450210571289, + "learning_rate": 8.140033756410697e-06, + "loss": 0.5605, + "mean_token_accuracy": 0.8223188519477844, + "num_tokens": 37419478.0, + "step": 31160 + }, + { + "entropy": 1.8281333968043327, + "epoch": 0.0966242727799008, + "grad_norm": 9.24412727355957, + "learning_rate": 8.138727861650492e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8335327923297882, + "num_tokens": 37433099.0, + "step": 31170 + }, + { + "entropy": 1.8802730202674867, + "epoch": 0.09665527190495048, + "grad_norm": 6.880424976348877, + "learning_rate": 8.137422595197554e-06, + "loss": 0.588, + "mean_token_accuracy": 0.8161317735910416, + "num_tokens": 37445489.0, + "step": 31180 + }, + { + "entropy": 1.8878979504108429, + "epoch": 0.09668627103000017, + "grad_norm": 5.010068893432617, + "learning_rate": 8.136117956548222e-06, + "loss": 0.5278, + "mean_token_accuracy": 0.8283527597784996, + "num_tokens": 37458005.0, + "step": 31190 + }, + { + "entropy": 1.88038859218359, + "epoch": 0.09671727015504987, + "grad_norm": 3.999337673187256, + "learning_rate": 8.134813945199384e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.8222674325108528, + "num_tokens": 37470320.0, + "step": 31200 + }, + { + "entropy": 1.926040168106556, + "epoch": 0.09674826928009957, + "grad_norm": 9.107236862182617, + "learning_rate": 8.133510560648504e-06, + "loss": 0.592, + "mean_token_accuracy": 0.8095137163996696, + "num_tokens": 37482370.0, + "step": 31210 + }, + { + "entropy": 1.8264732763171196, + "epoch": 0.09677926840514926, + "grad_norm": 8.244362831115723, + "learning_rate": 8.132207802393603e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.8364429995417595, + "num_tokens": 37495518.0, + "step": 31220 + }, + { + "entropy": 1.9302683904767037, + "epoch": 0.09681026753019896, + "grad_norm": 7.822775363922119, + "learning_rate": 8.13090566993327e-06, + "loss": 0.565, + "mean_token_accuracy": 0.8201239988207817, + "num_tokens": 37507302.0, + "step": 31230 + }, + { + "entropy": 1.8702651888132096, + "epoch": 0.09684126665524866, + "grad_norm": 4.487738132476807, + "learning_rate": 8.12960416276665e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8344218656420708, + "num_tokens": 37520315.0, + "step": 31240 + }, + { + "entropy": 1.9409600526094437, + "epoch": 0.09687226578029835, + "grad_norm": 9.113760948181152, + "learning_rate": 8.128303280393453e-06, + "loss": 0.5982, + "mean_token_accuracy": 0.815177421271801, + "num_tokens": 37532697.0, + "step": 31250 + }, + { + "entropy": 1.8489416658878326, + "epoch": 0.09690326490534805, + "grad_norm": 10.157899856567383, + "learning_rate": 8.12700302231395e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8288843706250191, + "num_tokens": 37545422.0, + "step": 31260 + }, + { + "entropy": 1.9021289080381394, + "epoch": 0.09693426403039775, + "grad_norm": 9.482295036315918, + "learning_rate": 8.125703388028969e-06, + "loss": 0.6059, + "mean_token_accuracy": 0.8187751770019531, + "num_tokens": 37557355.0, + "step": 31270 + }, + { + "entropy": 1.8654734954237937, + "epoch": 0.09696526315544744, + "grad_norm": 10.133862495422363, + "learning_rate": 8.124404377039897e-06, + "loss": 0.5901, + "mean_token_accuracy": 0.8247566968202591, + "num_tokens": 37570124.0, + "step": 31280 + }, + { + "entropy": 1.9067189067602157, + "epoch": 0.09699626228049714, + "grad_norm": 8.202488899230957, + "learning_rate": 8.123105988848677e-06, + "loss": 0.5681, + "mean_token_accuracy": 0.8185949176549911, + "num_tokens": 37582141.0, + "step": 31290 + }, + { + "entropy": 1.9307459384202956, + "epoch": 0.09702726140554682, + "grad_norm": 11.123476028442383, + "learning_rate": 8.121808222957812e-06, + "loss": 0.6175, + "mean_token_accuracy": 0.8119331106543541, + "num_tokens": 37592965.0, + "step": 31300 + }, + { + "entropy": 1.8705940045416356, + "epoch": 0.09705826053059652, + "grad_norm": 2.6256701946258545, + "learning_rate": 8.120511078870361e-06, + "loss": 0.5387, + "mean_token_accuracy": 0.8242152616381645, + "num_tokens": 37605456.0, + "step": 31310 + }, + { + "entropy": 1.9379503265023232, + "epoch": 0.09708925965564622, + "grad_norm": 3.4866576194763184, + "learning_rate": 8.119214556089939e-06, + "loss": 0.5883, + "mean_token_accuracy": 0.819600661098957, + "num_tokens": 37616988.0, + "step": 31320 + }, + { + "entropy": 1.8673909679055214, + "epoch": 0.09712025878069591, + "grad_norm": 8.800174713134766, + "learning_rate": 8.11791865412071e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8349416226148605, + "num_tokens": 37630013.0, + "step": 31330 + }, + { + "entropy": 1.9335881680250169, + "epoch": 0.09715125790574561, + "grad_norm": 9.899873733520508, + "learning_rate": 8.1166233724674e-06, + "loss": 0.6003, + "mean_token_accuracy": 0.8110765367746353, + "num_tokens": 37641637.0, + "step": 31340 + }, + { + "entropy": 1.8817772254347802, + "epoch": 0.0971822570307953, + "grad_norm": 5.787439346313477, + "learning_rate": 8.115328710635283e-06, + "loss": 0.5509, + "mean_token_accuracy": 0.8280811190605164, + "num_tokens": 37653917.0, + "step": 31350 + }, + { + "entropy": 1.8765314996242524, + "epoch": 0.097213256155845, + "grad_norm": 8.655739784240723, + "learning_rate": 8.114034668130184e-06, + "loss": 0.5687, + "mean_token_accuracy": 0.8303320676088333, + "num_tokens": 37665711.0, + "step": 31360 + }, + { + "entropy": 1.9583671048283577, + "epoch": 0.0972442552808947, + "grad_norm": 8.754752159118652, + "learning_rate": 8.112741244458482e-06, + "loss": 0.614, + "mean_token_accuracy": 0.8211699604988099, + "num_tokens": 37677018.0, + "step": 31370 + }, + { + "entropy": 1.8967312201857567, + "epoch": 0.0972752544059444, + "grad_norm": 4.7562575340271, + "learning_rate": 8.11144843912711e-06, + "loss": 0.569, + "mean_token_accuracy": 0.8239043042063713, + "num_tokens": 37689120.0, + "step": 31380 + }, + { + "entropy": 1.952524009346962, + "epoch": 0.09730625353099409, + "grad_norm": 9.356344223022461, + "learning_rate": 8.110156251643543e-06, + "loss": 0.6207, + "mean_token_accuracy": 0.8118321433663368, + "num_tokens": 37700583.0, + "step": 31390 + }, + { + "entropy": 1.912900096178055, + "epoch": 0.09733725265604379, + "grad_norm": 9.547168731689453, + "learning_rate": 8.10886468151581e-06, + "loss": 0.5595, + "mean_token_accuracy": 0.8243383869528771, + "num_tokens": 37712109.0, + "step": 31400 + }, + { + "entropy": 1.9011776894330978, + "epoch": 0.09736825178109348, + "grad_norm": 9.910781860351562, + "learning_rate": 8.107573728252488e-06, + "loss": 0.5361, + "mean_token_accuracy": 0.8336651280522347, + "num_tokens": 37724208.0, + "step": 31410 + }, + { + "entropy": 1.8022936284542084, + "epoch": 0.09739925090614318, + "grad_norm": 10.076736450195312, + "learning_rate": 8.106283391362702e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8385085701942444, + "num_tokens": 37738243.0, + "step": 31420 + }, + { + "entropy": 1.8668938994407653, + "epoch": 0.09743025003119286, + "grad_norm": 3.9053683280944824, + "learning_rate": 8.10499367035612e-06, + "loss": 0.5387, + "mean_token_accuracy": 0.8268329188227653, + "num_tokens": 37750652.0, + "step": 31430 + }, + { + "entropy": 1.989102879166603, + "epoch": 0.09746124915624256, + "grad_norm": 8.179647445678711, + "learning_rate": 8.10370456474296e-06, + "loss": 0.6414, + "mean_token_accuracy": 0.8093044266104699, + "num_tokens": 37761475.0, + "step": 31440 + }, + { + "entropy": 1.9298017874360085, + "epoch": 0.09749224828129226, + "grad_norm": 9.899726867675781, + "learning_rate": 8.102416074033986e-06, + "loss": 0.5832, + "mean_token_accuracy": 0.82451683729887, + "num_tokens": 37773158.0, + "step": 31450 + }, + { + "entropy": 1.926207932829857, + "epoch": 0.09752324740634195, + "grad_norm": 10.14471435546875, + "learning_rate": 8.101128197740498e-06, + "loss": 0.6074, + "mean_token_accuracy": 0.8211355611681939, + "num_tokens": 37784675.0, + "step": 31460 + }, + { + "entropy": 1.9524447560310363, + "epoch": 0.09755424653139165, + "grad_norm": 10.060158729553223, + "learning_rate": 8.09984093537435e-06, + "loss": 0.547, + "mean_token_accuracy": 0.8268745079636574, + "num_tokens": 37795692.0, + "step": 31470 + }, + { + "entropy": 1.900469544529915, + "epoch": 0.09758524565644135, + "grad_norm": 8.439093589782715, + "learning_rate": 8.098554286447932e-06, + "loss": 0.5246, + "mean_token_accuracy": 0.8321187406778335, + "num_tokens": 37808059.0, + "step": 31480 + }, + { + "entropy": 1.931046536564827, + "epoch": 0.09761624478149104, + "grad_norm": 9.36986255645752, + "learning_rate": 8.09726825047418e-06, + "loss": 0.5602, + "mean_token_accuracy": 0.8171745136380195, + "num_tokens": 37819692.0, + "step": 31490 + }, + { + "entropy": 1.9444507226347922, + "epoch": 0.09764724390654074, + "grad_norm": 10.288396835327148, + "learning_rate": 8.095982826966572e-06, + "loss": 0.5723, + "mean_token_accuracy": 0.835323378443718, + "num_tokens": 37831392.0, + "step": 31500 + }, + { + "entropy": 1.892457826435566, + "epoch": 0.09767824303159044, + "grad_norm": 9.328455924987793, + "learning_rate": 8.094698015439117e-06, + "loss": 0.5247, + "mean_token_accuracy": 0.8328243359923363, + "num_tokens": 37842993.0, + "step": 31510 + }, + { + "entropy": 1.9004325211048125, + "epoch": 0.09770924215664013, + "grad_norm": 10.092707633972168, + "learning_rate": 8.093413815406375e-06, + "loss": 0.5346, + "mean_token_accuracy": 0.8365309268236161, + "num_tokens": 37854723.0, + "step": 31520 + }, + { + "entropy": 1.8958807915449143, + "epoch": 0.09774024128168983, + "grad_norm": 10.082904815673828, + "learning_rate": 8.092130226383442e-06, + "loss": 0.5567, + "mean_token_accuracy": 0.8289808630943298, + "num_tokens": 37866020.0, + "step": 31530 + }, + { + "entropy": 1.8704974979162217, + "epoch": 0.09777124040673953, + "grad_norm": 9.165472984313965, + "learning_rate": 8.090847247885948e-06, + "loss": 0.5996, + "mean_token_accuracy": 0.8228276550769806, + "num_tokens": 37877097.0, + "step": 31540 + }, + { + "entropy": 1.9054939955472947, + "epoch": 0.09780223953178921, + "grad_norm": 7.8041205406188965, + "learning_rate": 8.089564879430065e-06, + "loss": 0.5535, + "mean_token_accuracy": 0.8295403525233269, + "num_tokens": 37887835.0, + "step": 31550 + }, + { + "entropy": 1.886386200785637, + "epoch": 0.0978332386568389, + "grad_norm": 8.552273750305176, + "learning_rate": 8.088283120532499e-06, + "loss": 0.5835, + "mean_token_accuracy": 0.830622710287571, + "num_tokens": 37898855.0, + "step": 31560 + }, + { + "entropy": 1.867349809408188, + "epoch": 0.0978642377818886, + "grad_norm": 4.293386936187744, + "learning_rate": 8.087001970710495e-06, + "loss": 0.6032, + "mean_token_accuracy": 0.8121476486325264, + "num_tokens": 37910879.0, + "step": 31570 + }, + { + "entropy": 1.9287435024976731, + "epoch": 0.0978952369069383, + "grad_norm": 10.854731559753418, + "learning_rate": 8.085721429481825e-06, + "loss": 0.6146, + "mean_token_accuracy": 0.8168462857604026, + "num_tokens": 37921745.0, + "step": 31580 + }, + { + "entropy": 1.9377621293067933, + "epoch": 0.097926236031988, + "grad_norm": 11.060909271240234, + "learning_rate": 8.084441496364808e-06, + "loss": 0.6164, + "mean_token_accuracy": 0.8070833861827851, + "num_tokens": 37933377.0, + "step": 31590 + }, + { + "entropy": 1.854214581847191, + "epoch": 0.09795723515703769, + "grad_norm": 8.883403778076172, + "learning_rate": 8.083162170878286e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8322727754712105, + "num_tokens": 37945413.0, + "step": 31600 + }, + { + "entropy": 1.8605719447135924, + "epoch": 0.09798823428208739, + "grad_norm": 8.319462776184082, + "learning_rate": 8.081883452541636e-06, + "loss": 0.5203, + "mean_token_accuracy": 0.8308003038167954, + "num_tokens": 37956712.0, + "step": 31610 + }, + { + "entropy": 1.8450894683599472, + "epoch": 0.09801923340713709, + "grad_norm": 4.5528388023376465, + "learning_rate": 8.080605340874773e-06, + "loss": 0.6098, + "mean_token_accuracy": 0.8137845560908318, + "num_tokens": 37968907.0, + "step": 31620 + }, + { + "entropy": 1.864371307194233, + "epoch": 0.09805023253218678, + "grad_norm": 10.319393157958984, + "learning_rate": 8.079327835398136e-06, + "loss": 0.6086, + "mean_token_accuracy": 0.8172882586717606, + "num_tokens": 37980451.0, + "step": 31630 + }, + { + "entropy": 1.8554281413555145, + "epoch": 0.09808123165723648, + "grad_norm": 8.902338981628418, + "learning_rate": 8.078050935632698e-06, + "loss": 0.5693, + "mean_token_accuracy": 0.831707838177681, + "num_tokens": 37992222.0, + "step": 31640 + }, + { + "entropy": 1.8842545390129088, + "epoch": 0.09811223078228618, + "grad_norm": 8.492579460144043, + "learning_rate": 8.076774641099962e-06, + "loss": 0.5778, + "mean_token_accuracy": 0.8209319055080414, + "num_tokens": 38003647.0, + "step": 31650 + }, + { + "entropy": 1.8265250965952873, + "epoch": 0.09814322990733587, + "grad_norm": 10.900943756103516, + "learning_rate": 8.075498951321958e-06, + "loss": 0.5486, + "mean_token_accuracy": 0.8293390095233917, + "num_tokens": 38016248.0, + "step": 31660 + }, + { + "entropy": 1.9135718867182732, + "epoch": 0.09817422903238555, + "grad_norm": 9.381340026855469, + "learning_rate": 8.074223865821245e-06, + "loss": 0.5587, + "mean_token_accuracy": 0.8256983861327172, + "num_tokens": 38027814.0, + "step": 31670 + }, + { + "entropy": 1.851310819387436, + "epoch": 0.09820522815743525, + "grad_norm": 11.365713119506836, + "learning_rate": 8.072949384120915e-06, + "loss": 0.5346, + "mean_token_accuracy": 0.8150163918733597, + "num_tokens": 38040446.0, + "step": 31680 + }, + { + "entropy": 1.9479857064783572, + "epoch": 0.09823622728248495, + "grad_norm": 10.72806167602539, + "learning_rate": 8.071675505744575e-06, + "loss": 0.6235, + "mean_token_accuracy": 0.8115170300006866, + "num_tokens": 38051965.0, + "step": 31690 + }, + { + "entropy": 1.9195650681853293, + "epoch": 0.09826722640753464, + "grad_norm": 8.423272132873535, + "learning_rate": 8.070402230216367e-06, + "loss": 0.5614, + "mean_token_accuracy": 0.8288802221417427, + "num_tokens": 38063124.0, + "step": 31700 + }, + { + "entropy": 1.8054373532533645, + "epoch": 0.09829822553258434, + "grad_norm": 9.05594253540039, + "learning_rate": 8.06912955706096e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8311830461025238, + "num_tokens": 38076115.0, + "step": 31710 + }, + { + "entropy": 1.80671064555645, + "epoch": 0.09832922465763404, + "grad_norm": 7.974825382232666, + "learning_rate": 8.067857485803538e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8289128035306931, + "num_tokens": 38089278.0, + "step": 31720 + }, + { + "entropy": 1.782078829407692, + "epoch": 0.09836022378268373, + "grad_norm": 11.095165252685547, + "learning_rate": 8.066586015969819e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8267631024122238, + "num_tokens": 38102671.0, + "step": 31730 + }, + { + "entropy": 1.9678689301013947, + "epoch": 0.09839122290773343, + "grad_norm": 11.549463272094727, + "learning_rate": 8.065315147086036e-06, + "loss": 0.666, + "mean_token_accuracy": 0.8048723593354226, + "num_tokens": 38113699.0, + "step": 31740 + }, + { + "entropy": 1.836279061436653, + "epoch": 0.09842222203278313, + "grad_norm": 9.380285263061523, + "learning_rate": 8.06404487867895e-06, + "loss": 0.5539, + "mean_token_accuracy": 0.8251174956560134, + "num_tokens": 38127036.0, + "step": 31750 + }, + { + "entropy": 1.887456201016903, + "epoch": 0.09845322115783282, + "grad_norm": 9.466583251953125, + "learning_rate": 8.062775210275841e-06, + "loss": 0.6013, + "mean_token_accuracy": 0.8264592796564102, + "num_tokens": 38137940.0, + "step": 31760 + }, + { + "entropy": 1.870304611325264, + "epoch": 0.09848422028288252, + "grad_norm": 9.614102363586426, + "learning_rate": 8.061506141404512e-06, + "loss": 0.5776, + "mean_token_accuracy": 0.813577763736248, + "num_tokens": 38150705.0, + "step": 31770 + }, + { + "entropy": 1.8906529873609543, + "epoch": 0.09851521940793222, + "grad_norm": 4.766526222229004, + "learning_rate": 8.060237671593283e-06, + "loss": 0.5561, + "mean_token_accuracy": 0.8281260386109353, + "num_tokens": 38162785.0, + "step": 31780 + }, + { + "entropy": 1.817509751021862, + "epoch": 0.09854621853298191, + "grad_norm": 8.861818313598633, + "learning_rate": 8.058969800370995e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8299407482147216, + "num_tokens": 38176287.0, + "step": 31790 + }, + { + "entropy": 1.9326122373342514, + "epoch": 0.0985772176580316, + "grad_norm": 8.393477439880371, + "learning_rate": 8.057702527267008e-06, + "loss": 0.586, + "mean_token_accuracy": 0.8213211745023727, + "num_tokens": 38187326.0, + "step": 31800 + }, + { + "entropy": 1.9347236022353171, + "epoch": 0.09860821678308129, + "grad_norm": 9.152666091918945, + "learning_rate": 8.056435851811194e-06, + "loss": 0.5519, + "mean_token_accuracy": 0.8280371204018593, + "num_tokens": 38198420.0, + "step": 31810 + }, + { + "entropy": 1.9405176371335984, + "epoch": 0.09863921590813099, + "grad_norm": 10.188054084777832, + "learning_rate": 8.055169773533956e-06, + "loss": 0.6435, + "mean_token_accuracy": 0.8134154111146927, + "num_tokens": 38209643.0, + "step": 31820 + }, + { + "entropy": 1.8945807382464408, + "epoch": 0.09867021503318069, + "grad_norm": 8.68061637878418, + "learning_rate": 8.053904291966199e-06, + "loss": 0.5581, + "mean_token_accuracy": 0.826158694922924, + "num_tokens": 38221808.0, + "step": 31830 + }, + { + "entropy": 1.8253241881728173, + "epoch": 0.09870121415823038, + "grad_norm": 10.65565299987793, + "learning_rate": 8.052639406639352e-06, + "loss": 0.5494, + "mean_token_accuracy": 0.8176590099930763, + "num_tokens": 38235228.0, + "step": 31840 + }, + { + "entropy": 1.9344543784856796, + "epoch": 0.09873221328328008, + "grad_norm": 8.646658897399902, + "learning_rate": 8.051375117085356e-06, + "loss": 0.6081, + "mean_token_accuracy": 0.8185088708996773, + "num_tokens": 38246052.0, + "step": 31850 + }, + { + "entropy": 1.8455456778407098, + "epoch": 0.09876321240832978, + "grad_norm": 8.64867115020752, + "learning_rate": 8.050111422836666e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.8226561039686203, + "num_tokens": 38258783.0, + "step": 31860 + }, + { + "entropy": 1.942963680624962, + "epoch": 0.09879421153337947, + "grad_norm": 9.261329650878906, + "learning_rate": 8.048848323426254e-06, + "loss": 0.5971, + "mean_token_accuracy": 0.8249636858701705, + "num_tokens": 38269528.0, + "step": 31870 + }, + { + "entropy": 1.9168901443481445, + "epoch": 0.09882521065842917, + "grad_norm": 9.246033668518066, + "learning_rate": 8.047585818387599e-06, + "loss": 0.5692, + "mean_token_accuracy": 0.8281188175082207, + "num_tokens": 38280554.0, + "step": 31880 + }, + { + "entropy": 1.8628426641225815, + "epoch": 0.09885620978347887, + "grad_norm": 8.739521026611328, + "learning_rate": 8.046323907254695e-06, + "loss": 0.524, + "mean_token_accuracy": 0.8306362301111221, + "num_tokens": 38292518.0, + "step": 31890 + }, + { + "entropy": 1.8454672649502755, + "epoch": 0.09888720890852856, + "grad_norm": 12.10193920135498, + "learning_rate": 8.045062589562051e-06, + "loss": 0.5709, + "mean_token_accuracy": 0.827955187857151, + "num_tokens": 38304577.0, + "step": 31900 + }, + { + "entropy": 1.9152088075876237, + "epoch": 0.09891820803357826, + "grad_norm": 9.613424301147461, + "learning_rate": 8.04380186484468e-06, + "loss": 0.6403, + "mean_token_accuracy": 0.8144320785999298, + "num_tokens": 38315941.0, + "step": 31910 + }, + { + "entropy": 1.9276456892490388, + "epoch": 0.09894920715862794, + "grad_norm": 9.09563159942627, + "learning_rate": 8.04254173263811e-06, + "loss": 0.557, + "mean_token_accuracy": 0.8385657519102097, + "num_tokens": 38326358.0, + "step": 31920 + }, + { + "entropy": 1.8148802295327187, + "epoch": 0.09898020628367764, + "grad_norm": 8.845330238342285, + "learning_rate": 8.041282192478376e-06, + "loss": 0.5189, + "mean_token_accuracy": 0.8373483180999756, + "num_tokens": 38339092.0, + "step": 31930 + }, + { + "entropy": 1.8389425054192543, + "epoch": 0.09901120540872733, + "grad_norm": 8.750908851623535, + "learning_rate": 8.040023243902018e-06, + "loss": 0.5495, + "mean_token_accuracy": 0.8151543363928795, + "num_tokens": 38351190.0, + "step": 31940 + }, + { + "entropy": 1.8402899622917175, + "epoch": 0.09904220453377703, + "grad_norm": 4.322279930114746, + "learning_rate": 8.038764886446095e-06, + "loss": 0.5531, + "mean_token_accuracy": 0.8239884585142135, + "num_tokens": 38363817.0, + "step": 31950 + }, + { + "entropy": 1.8671101585030556, + "epoch": 0.09907320365882673, + "grad_norm": 4.702272415161133, + "learning_rate": 8.037507119648157e-06, + "loss": 0.5213, + "mean_token_accuracy": 0.8226954385638237, + "num_tokens": 38376802.0, + "step": 31960 + }, + { + "entropy": 1.9011655792593956, + "epoch": 0.09910420278387642, + "grad_norm": 9.228616714477539, + "learning_rate": 8.036249943046277e-06, + "loss": 0.5436, + "mean_token_accuracy": 0.8301041334867477, + "num_tokens": 38388556.0, + "step": 31970 + }, + { + "entropy": 1.854082614183426, + "epoch": 0.09913520190892612, + "grad_norm": 4.278061389923096, + "learning_rate": 8.034993356179019e-06, + "loss": 0.5468, + "mean_token_accuracy": 0.8298392608761788, + "num_tokens": 38400576.0, + "step": 31980 + }, + { + "entropy": 1.8935617864131928, + "epoch": 0.09916620103397582, + "grad_norm": 8.677916526794434, + "learning_rate": 8.03373735858546e-06, + "loss": 0.5457, + "mean_token_accuracy": 0.8281985089182854, + "num_tokens": 38412502.0, + "step": 31990 + }, + { + "entropy": 1.878693199157715, + "epoch": 0.09919720015902551, + "grad_norm": 9.149133682250977, + "learning_rate": 8.032481949805182e-06, + "loss": 0.5288, + "mean_token_accuracy": 0.8354146972298622, + "num_tokens": 38423360.0, + "step": 32000 + }, + { + "entropy": 1.9500204533338548, + "epoch": 0.09922819928407521, + "grad_norm": 10.257466316223145, + "learning_rate": 8.031227129378268e-06, + "loss": 0.6755, + "mean_token_accuracy": 0.8080421656370163, + "num_tokens": 38433951.0, + "step": 32010 + }, + { + "entropy": 1.9539231777191162, + "epoch": 0.09925919840912491, + "grad_norm": 9.122477531433105, + "learning_rate": 8.029972896845298e-06, + "loss": 0.5941, + "mean_token_accuracy": 0.8169040486216546, + "num_tokens": 38445236.0, + "step": 32020 + }, + { + "entropy": 1.846830153465271, + "epoch": 0.0992901975341746, + "grad_norm": 9.854817390441895, + "learning_rate": 8.028719251747369e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.8273584708571434, + "num_tokens": 38457779.0, + "step": 32030 + }, + { + "entropy": 1.9177356496453286, + "epoch": 0.09932119665922429, + "grad_norm": 8.14139461517334, + "learning_rate": 8.027466193626063e-06, + "loss": 0.563, + "mean_token_accuracy": 0.8339667573571206, + "num_tokens": 38469104.0, + "step": 32040 + }, + { + "entropy": 1.9175456002354623, + "epoch": 0.09935219578427398, + "grad_norm": 4.319037914276123, + "learning_rate": 8.026213722023473e-06, + "loss": 0.5978, + "mean_token_accuracy": 0.8246249735355378, + "num_tokens": 38480992.0, + "step": 32050 + }, + { + "entropy": 1.9631080061197281, + "epoch": 0.09938319490932368, + "grad_norm": 9.7236328125, + "learning_rate": 8.024961836482187e-06, + "loss": 0.6166, + "mean_token_accuracy": 0.8274016365408897, + "num_tokens": 38492002.0, + "step": 32060 + }, + { + "entropy": 1.943944238126278, + "epoch": 0.09941419403437338, + "grad_norm": 8.304064750671387, + "learning_rate": 8.023710536545295e-06, + "loss": 0.6108, + "mean_token_accuracy": 0.8115967348217964, + "num_tokens": 38502940.0, + "step": 32070 + }, + { + "entropy": 1.9384367182850837, + "epoch": 0.09944519315942307, + "grad_norm": 11.586662292480469, + "learning_rate": 8.022459821756386e-06, + "loss": 0.6286, + "mean_token_accuracy": 0.820226113498211, + "num_tokens": 38514238.0, + "step": 32080 + }, + { + "entropy": 1.9123076066374778, + "epoch": 0.09947619228447277, + "grad_norm": 9.793305397033691, + "learning_rate": 8.021209691659546e-06, + "loss": 0.5969, + "mean_token_accuracy": 0.8114640265703201, + "num_tokens": 38525572.0, + "step": 32090 + }, + { + "entropy": 1.8171996608376504, + "epoch": 0.09950719140952247, + "grad_norm": 4.4752020835876465, + "learning_rate": 8.019960145799353e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.8394015192985534, + "num_tokens": 38538612.0, + "step": 32100 + }, + { + "entropy": 1.8743107169866562, + "epoch": 0.09953819053457216, + "grad_norm": 9.059576034545898, + "learning_rate": 8.01871118372089e-06, + "loss": 0.5375, + "mean_token_accuracy": 0.8266194269061089, + "num_tokens": 38550013.0, + "step": 32110 + }, + { + "entropy": 1.8382051154971122, + "epoch": 0.09956918965962186, + "grad_norm": 2.9260752201080322, + "learning_rate": 8.017462804969733e-06, + "loss": 0.566, + "mean_token_accuracy": 0.837664969265461, + "num_tokens": 38562520.0, + "step": 32120 + }, + { + "entropy": 1.946621085703373, + "epoch": 0.09960018878467156, + "grad_norm": 8.969979286193848, + "learning_rate": 8.01621500909195e-06, + "loss": 0.5906, + "mean_token_accuracy": 0.8193635001778603, + "num_tokens": 38573702.0, + "step": 32130 + }, + { + "entropy": 1.9753321021795274, + "epoch": 0.09963118790972125, + "grad_norm": 8.497583389282227, + "learning_rate": 8.014967795634104e-06, + "loss": 0.6373, + "mean_token_accuracy": 0.8141985774040222, + "num_tokens": 38584725.0, + "step": 32140 + }, + { + "entropy": 1.9402645155787468, + "epoch": 0.09966218703477095, + "grad_norm": 10.377409934997559, + "learning_rate": 8.013721164143257e-06, + "loss": 0.5876, + "mean_token_accuracy": 0.8230377018451691, + "num_tokens": 38596403.0, + "step": 32150 + }, + { + "entropy": 1.7853017404675484, + "epoch": 0.09969318615982065, + "grad_norm": 8.521496772766113, + "learning_rate": 8.012475114166955e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8455995440483093, + "num_tokens": 38610309.0, + "step": 32160 + }, + { + "entropy": 1.846723584830761, + "epoch": 0.09972418528487033, + "grad_norm": 5.919735908508301, + "learning_rate": 8.011229645253245e-06, + "loss": 0.5358, + "mean_token_accuracy": 0.8269700452685356, + "num_tokens": 38623255.0, + "step": 32170 + }, + { + "entropy": 1.8817561469972133, + "epoch": 0.09975518440992003, + "grad_norm": 10.0753755569458, + "learning_rate": 8.009984756950662e-06, + "loss": 0.5353, + "mean_token_accuracy": 0.8250563368201256, + "num_tokens": 38636156.0, + "step": 32180 + }, + { + "entropy": 1.8628202617168426, + "epoch": 0.09978618353496972, + "grad_norm": 9.534749031066895, + "learning_rate": 8.008740448808228e-06, + "loss": 0.5378, + "mean_token_accuracy": 0.8297192022204399, + "num_tokens": 38648825.0, + "step": 32190 + }, + { + "entropy": 1.8859781965613365, + "epoch": 0.09981718266001942, + "grad_norm": 9.754186630249023, + "learning_rate": 8.007496720375465e-06, + "loss": 0.554, + "mean_token_accuracy": 0.8234538331627845, + "num_tokens": 38660622.0, + "step": 32200 + }, + { + "entropy": 1.8697243973612785, + "epoch": 0.09984818178506912, + "grad_norm": 5.172030448913574, + "learning_rate": 8.006253571202375e-06, + "loss": 0.6183, + "mean_token_accuracy": 0.8093932062387467, + "num_tokens": 38672976.0, + "step": 32210 + }, + { + "entropy": 1.7175804510712624, + "epoch": 0.09987918091011881, + "grad_norm": 8.345678329467773, + "learning_rate": 8.005011000839453e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8368314996361732, + "num_tokens": 38687008.0, + "step": 32220 + }, + { + "entropy": 1.8780830636620522, + "epoch": 0.09991018003516851, + "grad_norm": 4.0767951011657715, + "learning_rate": 8.003769008837679e-06, + "loss": 0.5691, + "mean_token_accuracy": 0.8209434077143669, + "num_tokens": 38698550.0, + "step": 32230 + }, + { + "entropy": 1.9147436633706092, + "epoch": 0.0999411791602182, + "grad_norm": 10.640585899353027, + "learning_rate": 8.00252759474853e-06, + "loss": 0.6051, + "mean_token_accuracy": 0.8163613364100456, + "num_tokens": 38709869.0, + "step": 32240 + }, + { + "entropy": 1.8193964034318924, + "epoch": 0.0999721782852679, + "grad_norm": 10.641173362731934, + "learning_rate": 8.001286758123959e-06, + "loss": 0.5251, + "mean_token_accuracy": 0.8270770683884621, + "num_tokens": 38721887.0, + "step": 32250 + }, + { + "entropy": 1.7959134474396705, + "epoch": 0.1000031774103176, + "grad_norm": 7.74287748336792, + "learning_rate": 8.000046498516408e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8405986934900284, + "num_tokens": 38733462.0, + "step": 32260 + }, + { + "entropy": 1.851724424213171, + "epoch": 0.1000341765353673, + "grad_norm": 10.603741645812988, + "learning_rate": 7.998806815478807e-06, + "loss": 0.6026, + "mean_token_accuracy": 0.8136375203728676, + "num_tokens": 38744694.0, + "step": 32270 + }, + { + "entropy": 1.88350567817688, + "epoch": 0.10006517566041699, + "grad_norm": 10.134400367736816, + "learning_rate": 7.99756770856457e-06, + "loss": 0.5918, + "mean_token_accuracy": 0.8215403065085412, + "num_tokens": 38756425.0, + "step": 32280 + }, + { + "entropy": 1.9489238500595092, + "epoch": 0.10009617478546667, + "grad_norm": 9.280869483947754, + "learning_rate": 7.996329177327595e-06, + "loss": 0.6208, + "mean_token_accuracy": 0.8188161373138427, + "num_tokens": 38767890.0, + "step": 32290 + }, + { + "entropy": 1.9058944791555406, + "epoch": 0.10012717391051637, + "grad_norm": 4.553225994110107, + "learning_rate": 7.995091221322265e-06, + "loss": 0.5334, + "mean_token_accuracy": 0.8262131318449975, + "num_tokens": 38779858.0, + "step": 32300 + }, + { + "entropy": 1.7887984693050385, + "epoch": 0.10015817303556607, + "grad_norm": 10.373005867004395, + "learning_rate": 7.993853840103436e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8316322803497315, + "num_tokens": 38792827.0, + "step": 32310 + }, + { + "entropy": 1.7936713725328446, + "epoch": 0.10018917216061576, + "grad_norm": 9.501617431640625, + "learning_rate": 7.992617033226463e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8299453228712081, + "num_tokens": 38805462.0, + "step": 32320 + }, + { + "entropy": 1.9167312130331993, + "epoch": 0.10022017128566546, + "grad_norm": 9.644965171813965, + "learning_rate": 7.991380800247169e-06, + "loss": 0.5801, + "mean_token_accuracy": 0.8267593100667, + "num_tokens": 38816933.0, + "step": 32330 + }, + { + "entropy": 1.8285753324627876, + "epoch": 0.10025117041071516, + "grad_norm": 8.455294609069824, + "learning_rate": 7.990145140721862e-06, + "loss": 0.5222, + "mean_token_accuracy": 0.8261840432882309, + "num_tokens": 38829859.0, + "step": 32340 + }, + { + "entropy": 1.89251828789711, + "epoch": 0.10028216953576485, + "grad_norm": 8.720006942749023, + "learning_rate": 7.98891005420733e-06, + "loss": 0.6323, + "mean_token_accuracy": 0.8221445247530937, + "num_tokens": 38841339.0, + "step": 32350 + }, + { + "entropy": 1.845874121785164, + "epoch": 0.10031316866081455, + "grad_norm": 8.771798133850098, + "learning_rate": 7.987675540260844e-06, + "loss": 0.5616, + "mean_token_accuracy": 0.8211918070912361, + "num_tokens": 38853479.0, + "step": 32360 + }, + { + "entropy": 1.8901546359062196, + "epoch": 0.10034416778586425, + "grad_norm": 6.715768337249756, + "learning_rate": 7.986441598440147e-06, + "loss": 0.5698, + "mean_token_accuracy": 0.8205464497208595, + "num_tokens": 38865059.0, + "step": 32370 + }, + { + "entropy": 1.879352556169033, + "epoch": 0.10037516691091394, + "grad_norm": 9.13968563079834, + "learning_rate": 7.985208228303463e-06, + "loss": 0.6066, + "mean_token_accuracy": 0.8105901688337326, + "num_tokens": 38876867.0, + "step": 32380 + }, + { + "entropy": 1.84751605540514, + "epoch": 0.10040616603596364, + "grad_norm": 8.604146003723145, + "learning_rate": 7.983975429409497e-06, + "loss": 0.5442, + "mean_token_accuracy": 0.8336275666952133, + "num_tokens": 38889307.0, + "step": 32390 + }, + { + "entropy": 1.7912178918719293, + "epoch": 0.10043716516101334, + "grad_norm": 9.056184768676758, + "learning_rate": 7.982743201317426e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8329012975096702, + "num_tokens": 38902802.0, + "step": 32400 + }, + { + "entropy": 1.8672286108136178, + "epoch": 0.10046816428606302, + "grad_norm": 8.324599266052246, + "learning_rate": 7.981511543586906e-06, + "loss": 0.5679, + "mean_token_accuracy": 0.8331062749028206, + "num_tokens": 38914048.0, + "step": 32410 + }, + { + "entropy": 1.8162163376808167, + "epoch": 0.10049916341111272, + "grad_norm": 2.5847465991973877, + "learning_rate": 7.980280455778062e-06, + "loss": 0.524, + "mean_token_accuracy": 0.832746496796608, + "num_tokens": 38926231.0, + "step": 32420 + }, + { + "entropy": 1.8801830425858497, + "epoch": 0.10053016253616241, + "grad_norm": 8.904119491577148, + "learning_rate": 7.979049937451507e-06, + "loss": 0.5467, + "mean_token_accuracy": 0.8272658884525299, + "num_tokens": 38938316.0, + "step": 32430 + }, + { + "entropy": 1.9005606561899184, + "epoch": 0.10056116166121211, + "grad_norm": 5.724231719970703, + "learning_rate": 7.977819988168313e-06, + "loss": 0.5275, + "mean_token_accuracy": 0.829011881351471, + "num_tokens": 38950624.0, + "step": 32440 + }, + { + "entropy": 1.9463760763406754, + "epoch": 0.1005921607862618, + "grad_norm": 8.63336181640625, + "learning_rate": 7.97659060749004e-06, + "loss": 0.5908, + "mean_token_accuracy": 0.82104711830616, + "num_tokens": 38962016.0, + "step": 32450 + }, + { + "entropy": 1.8716831341385842, + "epoch": 0.1006231599113115, + "grad_norm": 10.11276912689209, + "learning_rate": 7.975361794978705e-06, + "loss": 0.5483, + "mean_token_accuracy": 0.8303984850645065, + "num_tokens": 38973912.0, + "step": 32460 + }, + { + "entropy": 1.8566598653793336, + "epoch": 0.1006541590363612, + "grad_norm": 7.346640586853027, + "learning_rate": 7.974133550196811e-06, + "loss": 0.5429, + "mean_token_accuracy": 0.8243806138634682, + "num_tokens": 38985487.0, + "step": 32470 + }, + { + "entropy": 1.9269026920199395, + "epoch": 0.1006851581614109, + "grad_norm": 8.665224075317383, + "learning_rate": 7.972905872707326e-06, + "loss": 0.6133, + "mean_token_accuracy": 0.8166972935199738, + "num_tokens": 38997045.0, + "step": 32480 + }, + { + "entropy": 1.7712698966264724, + "epoch": 0.10071615728646059, + "grad_norm": 4.503955364227295, + "learning_rate": 7.97167876207369e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8519754007458686, + "num_tokens": 39010592.0, + "step": 32490 + }, + { + "entropy": 1.8833463504910468, + "epoch": 0.10074715641151029, + "grad_norm": 10.112072944641113, + "learning_rate": 7.970452217859811e-06, + "loss": 0.5322, + "mean_token_accuracy": 0.8291536673903466, + "num_tokens": 39021828.0, + "step": 32500 + }, + { + "entropy": 1.9463322639465332, + "epoch": 0.10077815553655999, + "grad_norm": 11.4695405960083, + "learning_rate": 7.96922623963007e-06, + "loss": 0.6672, + "mean_token_accuracy": 0.8071766123175621, + "num_tokens": 39032954.0, + "step": 32510 + }, + { + "entropy": 1.830717845261097, + "epoch": 0.10080915466160968, + "grad_norm": 5.045051574707031, + "learning_rate": 7.968000826949319e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8264394268393517, + "num_tokens": 39046531.0, + "step": 32520 + }, + { + "entropy": 1.8757388591766357, + "epoch": 0.10084015378665938, + "grad_norm": 5.511293888092041, + "learning_rate": 7.96677597938287e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8393940135836602, + "num_tokens": 39059751.0, + "step": 32530 + }, + { + "entropy": 1.9044818967580794, + "epoch": 0.10087115291170906, + "grad_norm": 9.242785453796387, + "learning_rate": 7.965551696496507e-06, + "loss": 0.5406, + "mean_token_accuracy": 0.8220992356538772, + "num_tokens": 39072505.0, + "step": 32540 + }, + { + "entropy": 1.9557638376951219, + "epoch": 0.10090215203675876, + "grad_norm": 9.517579078674316, + "learning_rate": 7.964327977856484e-06, + "loss": 0.6215, + "mean_token_accuracy": 0.8174139723181725, + "num_tokens": 39083179.0, + "step": 32550 + }, + { + "entropy": 2.0160479575395582, + "epoch": 0.10093315116180845, + "grad_norm": 9.149009704589844, + "learning_rate": 7.963104823029519e-06, + "loss": 0.644, + "mean_token_accuracy": 0.8098774880170823, + "num_tokens": 39093653.0, + "step": 32560 + }, + { + "entropy": 1.9425939425826073, + "epoch": 0.10096415028685815, + "grad_norm": 8.069141387939453, + "learning_rate": 7.961882231582794e-06, + "loss": 0.5416, + "mean_token_accuracy": 0.8205993890762329, + "num_tokens": 39105306.0, + "step": 32570 + }, + { + "entropy": 1.9585781380534173, + "epoch": 0.10099514941190785, + "grad_norm": 8.68270206451416, + "learning_rate": 7.960660203083954e-06, + "loss": 0.6058, + "mean_token_accuracy": 0.8138043344020843, + "num_tokens": 39117926.0, + "step": 32580 + }, + { + "entropy": 1.952934955060482, + "epoch": 0.10102614853695754, + "grad_norm": 9.11218547821045, + "learning_rate": 7.959438737101118e-06, + "loss": 0.5391, + "mean_token_accuracy": 0.8283255323767662, + "num_tokens": 39130134.0, + "step": 32590 + }, + { + "entropy": 1.9153422683477401, + "epoch": 0.10105714766200724, + "grad_norm": 5.233044147491455, + "learning_rate": 7.958217833202859e-06, + "loss": 0.5562, + "mean_token_accuracy": 0.8188350781798363, + "num_tokens": 39142127.0, + "step": 32600 + }, + { + "entropy": 1.8911504298448563, + "epoch": 0.10108814678705694, + "grad_norm": 4.27568244934082, + "learning_rate": 7.956997490958216e-06, + "loss": 0.533, + "mean_token_accuracy": 0.8261435091495514, + "num_tokens": 39154491.0, + "step": 32610 + }, + { + "entropy": 1.7859768718481064, + "epoch": 0.10111914591210663, + "grad_norm": 4.534674644470215, + "learning_rate": 7.955777709936692e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8465276822447777, + "num_tokens": 39168135.0, + "step": 32620 + }, + { + "entropy": 1.90359725356102, + "epoch": 0.10115014503715633, + "grad_norm": 7.7813496589660645, + "learning_rate": 7.95455848970825e-06, + "loss": 0.6129, + "mean_token_accuracy": 0.8201944395899773, + "num_tokens": 39180412.0, + "step": 32630 + }, + { + "entropy": 1.924836564064026, + "epoch": 0.10118114416220603, + "grad_norm": 9.442864418029785, + "learning_rate": 7.953339829843315e-06, + "loss": 0.5363, + "mean_token_accuracy": 0.8264301627874374, + "num_tokens": 39192257.0, + "step": 32640 + }, + { + "entropy": 1.9304316490888596, + "epoch": 0.10121214328725572, + "grad_norm": 3.983366012573242, + "learning_rate": 7.952121729912772e-06, + "loss": 0.612, + "mean_token_accuracy": 0.8225965097546577, + "num_tokens": 39203910.0, + "step": 32650 + }, + { + "entropy": 1.9028907686471939, + "epoch": 0.1012431424123054, + "grad_norm": 9.803033828735352, + "learning_rate": 7.95090418948797e-06, + "loss": 0.6047, + "mean_token_accuracy": 0.8184835597872734, + "num_tokens": 39216797.0, + "step": 32660 + }, + { + "entropy": 1.8274445042014122, + "epoch": 0.1012741415373551, + "grad_norm": 10.706036567687988, + "learning_rate": 7.949687208140709e-06, + "loss": 0.5313, + "mean_token_accuracy": 0.8392621099948883, + "num_tokens": 39229208.0, + "step": 32670 + }, + { + "entropy": 1.983510084450245, + "epoch": 0.1013051406624048, + "grad_norm": 9.402242660522461, + "learning_rate": 7.948470785443254e-06, + "loss": 0.6156, + "mean_token_accuracy": 0.8155052870512008, + "num_tokens": 39240609.0, + "step": 32680 + }, + { + "entropy": 1.8997413352131844, + "epoch": 0.1013361397874545, + "grad_norm": 11.164144515991211, + "learning_rate": 7.947254920968327e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.823790366947651, + "num_tokens": 39252794.0, + "step": 32690 + }, + { + "entropy": 1.843451727926731, + "epoch": 0.10136713891250419, + "grad_norm": 9.909012794494629, + "learning_rate": 7.946039614289105e-06, + "loss": 0.546, + "mean_token_accuracy": 0.8248791456222534, + "num_tokens": 39265204.0, + "step": 32700 + }, + { + "entropy": 1.9523500233888627, + "epoch": 0.10139813803755389, + "grad_norm": 8.847643852233887, + "learning_rate": 7.944824864979225e-06, + "loss": 0.6042, + "mean_token_accuracy": 0.8163961425423623, + "num_tokens": 39276926.0, + "step": 32710 + }, + { + "entropy": 1.8452000051736832, + "epoch": 0.10142913716260359, + "grad_norm": 8.985699653625488, + "learning_rate": 7.943610672612779e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8368108674883843, + "num_tokens": 39288593.0, + "step": 32720 + }, + { + "entropy": 1.8509555876255035, + "epoch": 0.10146013628765328, + "grad_norm": 8.876476287841797, + "learning_rate": 7.94239703676431e-06, + "loss": 0.5477, + "mean_token_accuracy": 0.8287522226572037, + "num_tokens": 39301355.0, + "step": 32730 + }, + { + "entropy": 1.982967707514763, + "epoch": 0.10149113541270298, + "grad_norm": 11.262484550476074, + "learning_rate": 7.941183957008825e-06, + "loss": 0.5909, + "mean_token_accuracy": 0.8227292239665985, + "num_tokens": 39312750.0, + "step": 32740 + }, + { + "entropy": 1.9494792476296425, + "epoch": 0.10152213453775268, + "grad_norm": 12.197062492370605, + "learning_rate": 7.939971432921778e-06, + "loss": 0.5947, + "mean_token_accuracy": 0.8156418994069099, + "num_tokens": 39323923.0, + "step": 32750 + }, + { + "entropy": 1.9586309775710107, + "epoch": 0.10155313366280237, + "grad_norm": 8.947646141052246, + "learning_rate": 7.93875946407908e-06, + "loss": 0.5582, + "mean_token_accuracy": 0.8243048340082169, + "num_tokens": 39335909.0, + "step": 32760 + }, + { + "entropy": 1.9357258334755898, + "epoch": 0.10158413278785207, + "grad_norm": 8.137892723083496, + "learning_rate": 7.937548050057092e-06, + "loss": 0.5899, + "mean_token_accuracy": 0.8267382949590683, + "num_tokens": 39347350.0, + "step": 32770 + }, + { + "entropy": 1.961161696910858, + "epoch": 0.10161513191290175, + "grad_norm": 9.436999320983887, + "learning_rate": 7.936337190432627e-06, + "loss": 0.5788, + "mean_token_accuracy": 0.8202362656593323, + "num_tokens": 39359043.0, + "step": 32780 + }, + { + "entropy": 1.9914384290575982, + "epoch": 0.10164613103795145, + "grad_norm": 8.315853118896484, + "learning_rate": 7.935126884782958e-06, + "loss": 0.5859, + "mean_token_accuracy": 0.8224820002913475, + "num_tokens": 39370438.0, + "step": 32790 + }, + { + "entropy": 1.9369804859161377, + "epoch": 0.10167713016300114, + "grad_norm": 9.922943115234375, + "learning_rate": 7.9339171326858e-06, + "loss": 0.5687, + "mean_token_accuracy": 0.8243139579892158, + "num_tokens": 39380976.0, + "step": 32800 + }, + { + "entropy": 1.9085180804133415, + "epoch": 0.10170812928805084, + "grad_norm": 9.743084907531738, + "learning_rate": 7.932707933719321e-06, + "loss": 0.614, + "mean_token_accuracy": 0.8235570520162583, + "num_tokens": 39392951.0, + "step": 32810 + }, + { + "entropy": 1.9496404856443406, + "epoch": 0.10173912841310054, + "grad_norm": 8.715404510498047, + "learning_rate": 7.931499287462138e-06, + "loss": 0.6393, + "mean_token_accuracy": 0.811781495809555, + "num_tokens": 39404058.0, + "step": 32820 + }, + { + "entropy": 1.9330283522605896, + "epoch": 0.10177012753815023, + "grad_norm": 9.045530319213867, + "learning_rate": 7.930291193493323e-06, + "loss": 0.5864, + "mean_token_accuracy": 0.8231693536043168, + "num_tokens": 39414589.0, + "step": 32830 + }, + { + "entropy": 2.0210474640130998, + "epoch": 0.10180112666319993, + "grad_norm": 7.992177486419678, + "learning_rate": 7.929083651392389e-06, + "loss": 0.6262, + "mean_token_accuracy": 0.8037309676408768, + "num_tokens": 39425755.0, + "step": 32840 + }, + { + "entropy": 1.9645818829536439, + "epoch": 0.10183212578824963, + "grad_norm": 9.055630683898926, + "learning_rate": 7.9278766607393e-06, + "loss": 0.5811, + "mean_token_accuracy": 0.8236428961157799, + "num_tokens": 39437178.0, + "step": 32850 + }, + { + "entropy": 1.9607673034071922, + "epoch": 0.10186312491329932, + "grad_norm": 10.347228050231934, + "learning_rate": 7.92667022111447e-06, + "loss": 0.5655, + "mean_token_accuracy": 0.8204041153192521, + "num_tokens": 39449415.0, + "step": 32860 + }, + { + "entropy": 1.913771566748619, + "epoch": 0.10189412403834902, + "grad_norm": 10.000711441040039, + "learning_rate": 7.92546433209876e-06, + "loss": 0.5313, + "mean_token_accuracy": 0.8246194154024125, + "num_tokens": 39461394.0, + "step": 32870 + }, + { + "entropy": 1.9231099665164948, + "epoch": 0.10192512316339872, + "grad_norm": 5.153701305389404, + "learning_rate": 7.92425899327347e-06, + "loss": 0.5586, + "mean_token_accuracy": 0.8274278253316879, + "num_tokens": 39473357.0, + "step": 32880 + }, + { + "entropy": 1.9601033926010132, + "epoch": 0.10195612228844841, + "grad_norm": 9.463569641113281, + "learning_rate": 7.923054204220351e-06, + "loss": 0.6141, + "mean_token_accuracy": 0.8186487257480621, + "num_tokens": 39484263.0, + "step": 32890 + }, + { + "entropy": 1.989539209008217, + "epoch": 0.10198712141349811, + "grad_norm": 11.15012264251709, + "learning_rate": 7.921849964521603e-06, + "loss": 0.5973, + "mean_token_accuracy": 0.8308931365609169, + "num_tokens": 39495754.0, + "step": 32900 + }, + { + "entropy": 1.8660317227244376, + "epoch": 0.1020181205385478, + "grad_norm": 9.449760437011719, + "learning_rate": 7.92064627375986e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.8349911078810692, + "num_tokens": 39508683.0, + "step": 32910 + }, + { + "entropy": 1.956910152733326, + "epoch": 0.10204911966359749, + "grad_norm": 8.791659355163574, + "learning_rate": 7.919443131518211e-06, + "loss": 0.546, + "mean_token_accuracy": 0.8249326780438423, + "num_tokens": 39520549.0, + "step": 32920 + }, + { + "entropy": 1.9195820301771165, + "epoch": 0.10208011878864719, + "grad_norm": 4.308915615081787, + "learning_rate": 7.91824053738018e-06, + "loss": 0.5599, + "mean_token_accuracy": 0.8237533152103425, + "num_tokens": 39533438.0, + "step": 32930 + }, + { + "entropy": 2.001399652659893, + "epoch": 0.10211111791369688, + "grad_norm": 11.334473609924316, + "learning_rate": 7.917038490929737e-06, + "loss": 0.5865, + "mean_token_accuracy": 0.8264929071068764, + "num_tokens": 39544650.0, + "step": 32940 + }, + { + "entropy": 1.9738114327192307, + "epoch": 0.10214211703874658, + "grad_norm": 4.6155595779418945, + "learning_rate": 7.915836991751293e-06, + "loss": 0.6039, + "mean_token_accuracy": 0.8131748422980308, + "num_tokens": 39556308.0, + "step": 32950 + }, + { + "entropy": 1.884914068877697, + "epoch": 0.10217311616379628, + "grad_norm": 9.995161056518555, + "learning_rate": 7.914636039429701e-06, + "loss": 0.5548, + "mean_token_accuracy": 0.8334923848509789, + "num_tokens": 39568702.0, + "step": 32960 + }, + { + "entropy": 1.7988762125372886, + "epoch": 0.10220411528884597, + "grad_norm": 9.979853630065918, + "learning_rate": 7.913435633550255e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.83943440169096, + "num_tokens": 39581966.0, + "step": 32970 + }, + { + "entropy": 1.8382903322577477, + "epoch": 0.10223511441389567, + "grad_norm": 8.669686317443848, + "learning_rate": 7.912235773698689e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.838770891726017, + "num_tokens": 39595357.0, + "step": 32980 + }, + { + "entropy": 1.9224609807133675, + "epoch": 0.10226611353894537, + "grad_norm": 8.906074523925781, + "learning_rate": 7.911036459461177e-06, + "loss": 0.5724, + "mean_token_accuracy": 0.8308119490742684, + "num_tokens": 39606657.0, + "step": 32990 + }, + { + "entropy": 1.8199217766523361, + "epoch": 0.10229711266399506, + "grad_norm": 9.60714340209961, + "learning_rate": 7.909837690424327e-06, + "loss": 0.5371, + "mean_token_accuracy": 0.8385667935013771, + "num_tokens": 39619596.0, + "step": 33000 + }, + { + "entropy": 1.9392257377505302, + "epoch": 0.10232811178904476, + "grad_norm": 9.89013671875, + "learning_rate": 7.908639466175193e-06, + "loss": 0.5837, + "mean_token_accuracy": 0.8208078041672706, + "num_tokens": 39631627.0, + "step": 33010 + }, + { + "entropy": 1.9637094184756279, + "epoch": 0.10235911091409446, + "grad_norm": 9.878365516662598, + "learning_rate": 7.907441786301261e-06, + "loss": 0.5518, + "mean_token_accuracy": 0.8377602383494377, + "num_tokens": 39642887.0, + "step": 33020 + }, + { + "entropy": 1.8965819612145425, + "epoch": 0.10239011003914414, + "grad_norm": 9.686004638671875, + "learning_rate": 7.906244650390462e-06, + "loss": 0.5211, + "mean_token_accuracy": 0.8283202305436135, + "num_tokens": 39654706.0, + "step": 33030 + }, + { + "entropy": 1.8306727975606918, + "epoch": 0.10242110916419384, + "grad_norm": 10.983489990234375, + "learning_rate": 7.905048058031153e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8358713150024414, + "num_tokens": 39668226.0, + "step": 33040 + }, + { + "entropy": 1.898146103322506, + "epoch": 0.10245210828924353, + "grad_norm": 8.633929252624512, + "learning_rate": 7.903852008812132e-06, + "loss": 0.5579, + "mean_token_accuracy": 0.8244705319404602, + "num_tokens": 39680301.0, + "step": 33050 + }, + { + "entropy": 1.8473190173506737, + "epoch": 0.10248310741429323, + "grad_norm": 4.211243629455566, + "learning_rate": 7.902656502322633e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.8376186162233352, + "num_tokens": 39693311.0, + "step": 33060 + }, + { + "entropy": 1.8591630086302757, + "epoch": 0.10251410653934293, + "grad_norm": 4.487063884735107, + "learning_rate": 7.901461538152326e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8340800628066063, + "num_tokens": 39706806.0, + "step": 33070 + }, + { + "entropy": 1.9278041929006577, + "epoch": 0.10254510566439262, + "grad_norm": 9.240703582763672, + "learning_rate": 7.90026711589131e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8337907418608665, + "num_tokens": 39719457.0, + "step": 33080 + }, + { + "entropy": 1.9978653281927108, + "epoch": 0.10257610478944232, + "grad_norm": 10.427108764648438, + "learning_rate": 7.899073235130122e-06, + "loss": 0.6744, + "mean_token_accuracy": 0.8095801830291748, + "num_tokens": 39730503.0, + "step": 33090 + }, + { + "entropy": 1.9866398319602012, + "epoch": 0.10260710391449201, + "grad_norm": 8.23255729675293, + "learning_rate": 7.897879895459734e-06, + "loss": 0.5904, + "mean_token_accuracy": 0.8214378640055656, + "num_tokens": 39741771.0, + "step": 33100 + }, + { + "entropy": 1.8491350293159485, + "epoch": 0.10263810303954171, + "grad_norm": 3.994368076324463, + "learning_rate": 7.896687096471543e-06, + "loss": 0.5219, + "mean_token_accuracy": 0.8272738143801689, + "num_tokens": 39754475.0, + "step": 33110 + }, + { + "entropy": 1.9193340376019479, + "epoch": 0.10266910216459141, + "grad_norm": 8.316622734069824, + "learning_rate": 7.895494837757387e-06, + "loss": 0.5405, + "mean_token_accuracy": 0.82731723934412, + "num_tokens": 39766516.0, + "step": 33120 + }, + { + "entropy": 1.775647784769535, + "epoch": 0.1027001012896411, + "grad_norm": 4.390643119812012, + "learning_rate": 7.894303118909526e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8442358478903771, + "num_tokens": 39780754.0, + "step": 33130 + }, + { + "entropy": 1.9259379595518111, + "epoch": 0.1027311004146908, + "grad_norm": 3.8750622272491455, + "learning_rate": 7.893111939520659e-06, + "loss": 0.5695, + "mean_token_accuracy": 0.8207632303237915, + "num_tokens": 39792005.0, + "step": 33140 + }, + { + "entropy": 2.0047280192375183, + "epoch": 0.10276209953974048, + "grad_norm": 8.025463104248047, + "learning_rate": 7.891921299183906e-06, + "loss": 0.5898, + "mean_token_accuracy": 0.8211274608969689, + "num_tokens": 39803614.0, + "step": 33150 + }, + { + "entropy": 2.0167581260204317, + "epoch": 0.10279309866479018, + "grad_norm": 10.989029884338379, + "learning_rate": 7.890731197492827e-06, + "loss": 0.6009, + "mean_token_accuracy": 0.8194682911038399, + "num_tokens": 39814772.0, + "step": 33160 + }, + { + "entropy": 2.003499576449394, + "epoch": 0.10282409778983988, + "grad_norm": 8.563013076782227, + "learning_rate": 7.889541634041405e-06, + "loss": 0.5893, + "mean_token_accuracy": 0.8160374283790588, + "num_tokens": 39826015.0, + "step": 33170 + }, + { + "entropy": 1.9005308762192725, + "epoch": 0.10285509691488957, + "grad_norm": 10.718070030212402, + "learning_rate": 7.888352608424046e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8393619701266288, + "num_tokens": 39838620.0, + "step": 33180 + }, + { + "entropy": 1.9308624148368836, + "epoch": 0.10288609603993927, + "grad_norm": 9.094322204589844, + "learning_rate": 7.887164120235598e-06, + "loss": 0.5783, + "mean_token_accuracy": 0.8276225015521049, + "num_tokens": 39850572.0, + "step": 33190 + }, + { + "entropy": 1.9245057985186578, + "epoch": 0.10291709516498897, + "grad_norm": 9.67980670928955, + "learning_rate": 7.885976169071323e-06, + "loss": 0.5649, + "mean_token_accuracy": 0.8204725295305252, + "num_tokens": 39863035.0, + "step": 33200 + }, + { + "entropy": 1.8938257083296777, + "epoch": 0.10294809429003866, + "grad_norm": 11.41298770904541, + "learning_rate": 7.884788754526915e-06, + "loss": 0.5521, + "mean_token_accuracy": 0.8219778835773468, + "num_tokens": 39875895.0, + "step": 33210 + }, + { + "entropy": 1.9652521327137946, + "epoch": 0.10297909341508836, + "grad_norm": 4.608140468597412, + "learning_rate": 7.883601876198497e-06, + "loss": 0.5639, + "mean_token_accuracy": 0.8251558512449264, + "num_tokens": 39888247.0, + "step": 33220 + }, + { + "entropy": 1.9580962508916855, + "epoch": 0.10301009254013806, + "grad_norm": 7.537057399749756, + "learning_rate": 7.882415533682607e-06, + "loss": 0.5439, + "mean_token_accuracy": 0.8344349786639214, + "num_tokens": 39900720.0, + "step": 33230 + }, + { + "entropy": 1.9044880703091622, + "epoch": 0.10304109166518775, + "grad_norm": 8.43292236328125, + "learning_rate": 7.88122972657622e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.8305587694048882, + "num_tokens": 39913681.0, + "step": 33240 + }, + { + "entropy": 1.8795413970947266, + "epoch": 0.10307209079023745, + "grad_norm": 3.7221243381500244, + "learning_rate": 7.88004445447673e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8384215503931045, + "num_tokens": 39926055.0, + "step": 33250 + }, + { + "entropy": 1.9621031075716018, + "epoch": 0.10310308991528715, + "grad_norm": 10.001662254333496, + "learning_rate": 7.878859716981954e-06, + "loss": 0.6182, + "mean_token_accuracy": 0.8217739418148995, + "num_tokens": 39936665.0, + "step": 33260 + }, + { + "entropy": 1.935880383849144, + "epoch": 0.10313408904033684, + "grad_norm": 9.233072280883789, + "learning_rate": 7.87767551369013e-06, + "loss": 0.5461, + "mean_token_accuracy": 0.8238107547163963, + "num_tokens": 39948808.0, + "step": 33270 + }, + { + "entropy": 1.9450037762522698, + "epoch": 0.10316508816538653, + "grad_norm": 4.693704128265381, + "learning_rate": 7.876491844199926e-06, + "loss": 0.6156, + "mean_token_accuracy": 0.8216011270880699, + "num_tokens": 39960695.0, + "step": 33280 + }, + { + "entropy": 1.910569779574871, + "epoch": 0.10319608729043622, + "grad_norm": 10.65746021270752, + "learning_rate": 7.875308708110426e-06, + "loss": 0.5713, + "mean_token_accuracy": 0.8182385757565498, + "num_tokens": 39973311.0, + "step": 33290 + }, + { + "entropy": 1.8974032238125802, + "epoch": 0.10322708641548592, + "grad_norm": 8.886612892150879, + "learning_rate": 7.874126105021134e-06, + "loss": 0.5677, + "mean_token_accuracy": 0.8244770526885986, + "num_tokens": 39985794.0, + "step": 33300 + }, + { + "entropy": 1.9022233217954636, + "epoch": 0.10325808554053562, + "grad_norm": 4.597454071044922, + "learning_rate": 7.872944034531982e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.8278315708041191, + "num_tokens": 39998753.0, + "step": 33310 + }, + { + "entropy": 1.8010682314634323, + "epoch": 0.10328908466558531, + "grad_norm": 2.244396924972534, + "learning_rate": 7.871762496243318e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.84285439401865, + "num_tokens": 40013133.0, + "step": 33320 + }, + { + "entropy": 1.868261407315731, + "epoch": 0.10332008379063501, + "grad_norm": 10.890486717224121, + "learning_rate": 7.870581489755905e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8372749105095864, + "num_tokens": 40025402.0, + "step": 33330 + }, + { + "entropy": 1.8567996740341186, + "epoch": 0.1033510829156847, + "grad_norm": 8.594077110290527, + "learning_rate": 7.869401014670937e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8362266525626183, + "num_tokens": 40038004.0, + "step": 33340 + }, + { + "entropy": 1.8234371304512025, + "epoch": 0.1033820820407344, + "grad_norm": 8.579446792602539, + "learning_rate": 7.868221070590013e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8359899684786797, + "num_tokens": 40051320.0, + "step": 33350 + }, + { + "entropy": 1.9455517828464508, + "epoch": 0.1034130811657841, + "grad_norm": 10.794142723083496, + "learning_rate": 7.86704165711516e-06, + "loss": 0.5869, + "mean_token_accuracy": 0.8300697863101959, + "num_tokens": 40063005.0, + "step": 33360 + }, + { + "entropy": 1.902688279747963, + "epoch": 0.1034440802908338, + "grad_norm": 2.90916109085083, + "learning_rate": 7.865862773848816e-06, + "loss": 0.5366, + "mean_token_accuracy": 0.8291781514883041, + "num_tokens": 40074960.0, + "step": 33370 + }, + { + "entropy": 1.8599769324064255, + "epoch": 0.10347507941588349, + "grad_norm": 4.288544178009033, + "learning_rate": 7.864684420393842e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8394434854388237, + "num_tokens": 40087590.0, + "step": 33380 + }, + { + "entropy": 1.8361037239432334, + "epoch": 0.10350607854093319, + "grad_norm": 4.094844818115234, + "learning_rate": 7.863506596353514e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8373026907444, + "num_tokens": 40101356.0, + "step": 33390 + }, + { + "entropy": 1.8626354187726974, + "epoch": 0.10353707766598287, + "grad_norm": 9.581649780273438, + "learning_rate": 7.862329301331517e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8337352395057678, + "num_tokens": 40113440.0, + "step": 33400 + }, + { + "entropy": 1.9167112335562706, + "epoch": 0.10356807679103257, + "grad_norm": 12.560930252075195, + "learning_rate": 7.86115253493196e-06, + "loss": 0.559, + "mean_token_accuracy": 0.8210726290941238, + "num_tokens": 40125063.0, + "step": 33410 + }, + { + "entropy": 1.917654138803482, + "epoch": 0.10359907591608226, + "grad_norm": 9.083182334899902, + "learning_rate": 7.859976296759359e-06, + "loss": 0.5644, + "mean_token_accuracy": 0.8320072874426842, + "num_tokens": 40136790.0, + "step": 33420 + }, + { + "entropy": 1.884972706437111, + "epoch": 0.10363007504113196, + "grad_norm": 9.505833625793457, + "learning_rate": 7.858800586418653e-06, + "loss": 0.5672, + "mean_token_accuracy": 0.8238067865371704, + "num_tokens": 40148423.0, + "step": 33430 + }, + { + "entropy": 1.8900686636567117, + "epoch": 0.10366107416618166, + "grad_norm": 9.053071022033691, + "learning_rate": 7.857625403515186e-06, + "loss": 0.6058, + "mean_token_accuracy": 0.8166046008467674, + "num_tokens": 40160812.0, + "step": 33440 + }, + { + "entropy": 1.8167245775461196, + "epoch": 0.10369207329123135, + "grad_norm": 10.170108795166016, + "learning_rate": 7.856450747654719e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8358997702598572, + "num_tokens": 40174474.0, + "step": 33450 + }, + { + "entropy": 1.8785471424460412, + "epoch": 0.10372307241628105, + "grad_norm": 10.019652366638184, + "learning_rate": 7.855276618443426e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8401072070002555, + "num_tokens": 40186694.0, + "step": 33460 + }, + { + "entropy": 1.8747892037034035, + "epoch": 0.10375407154133075, + "grad_norm": 9.287095069885254, + "learning_rate": 7.854103015487889e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.8236067086458206, + "num_tokens": 40199002.0, + "step": 33470 + }, + { + "entropy": 1.8964219346642495, + "epoch": 0.10378507066638044, + "grad_norm": 8.723387718200684, + "learning_rate": 7.852929938395108e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.8313620582222938, + "num_tokens": 40211446.0, + "step": 33480 + }, + { + "entropy": 1.8906508162617683, + "epoch": 0.10381606979143014, + "grad_norm": 9.373579025268555, + "learning_rate": 7.85175738677249e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.8374797239899635, + "num_tokens": 40222830.0, + "step": 33490 + }, + { + "entropy": 1.9234597817063332, + "epoch": 0.10384706891647984, + "grad_norm": 7.460415363311768, + "learning_rate": 7.85058536022785e-06, + "loss": 0.5451, + "mean_token_accuracy": 0.8325879499316216, + "num_tokens": 40234416.0, + "step": 33500 + }, + { + "entropy": 1.9918899089097977, + "epoch": 0.10387806804152953, + "grad_norm": 8.504613876342773, + "learning_rate": 7.849413858369415e-06, + "loss": 0.6361, + "mean_token_accuracy": 0.8134088665246964, + "num_tokens": 40245140.0, + "step": 33510 + }, + { + "entropy": 1.9241836652159692, + "epoch": 0.10390906716657922, + "grad_norm": 8.564291000366211, + "learning_rate": 7.848242880805818e-06, + "loss": 0.5701, + "mean_token_accuracy": 0.8313742846250534, + "num_tokens": 40257078.0, + "step": 33520 + }, + { + "entropy": 1.8901431530714035, + "epoch": 0.10394006629162891, + "grad_norm": 4.144465923309326, + "learning_rate": 7.847072427146111e-06, + "loss": 0.5654, + "mean_token_accuracy": 0.8276931583881378, + "num_tokens": 40269349.0, + "step": 33530 + }, + { + "entropy": 1.887324059009552, + "epoch": 0.10397106541667861, + "grad_norm": 14.356313705444336, + "learning_rate": 7.845902496999739e-06, + "loss": 0.5668, + "mean_token_accuracy": 0.8178930029273033, + "num_tokens": 40281901.0, + "step": 33540 + }, + { + "entropy": 1.9168416380882263, + "epoch": 0.1040020645417283, + "grad_norm": 11.27478313446045, + "learning_rate": 7.844733089976564e-06, + "loss": 0.5204, + "mean_token_accuracy": 0.8334863767027855, + "num_tokens": 40293978.0, + "step": 33550 + }, + { + "entropy": 1.813339551538229, + "epoch": 0.104033063666778, + "grad_norm": 8.250176429748535, + "learning_rate": 7.843564205686856e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.838591480255127, + "num_tokens": 40306930.0, + "step": 33560 + }, + { + "entropy": 1.9164869621396066, + "epoch": 0.1040640627918277, + "grad_norm": 8.932500839233398, + "learning_rate": 7.842395843741287e-06, + "loss": 0.5966, + "mean_token_accuracy": 0.8232863306999206, + "num_tokens": 40318760.0, + "step": 33570 + }, + { + "entropy": 1.9515147507190704, + "epoch": 0.1040950619168774, + "grad_norm": 8.837969779968262, + "learning_rate": 7.841228003750933e-06, + "loss": 0.6046, + "mean_token_accuracy": 0.8223684683442116, + "num_tokens": 40330749.0, + "step": 33580 + }, + { + "entropy": 1.9510893940925598, + "epoch": 0.10412606104192709, + "grad_norm": 10.218021392822266, + "learning_rate": 7.84006068532728e-06, + "loss": 0.5715, + "mean_token_accuracy": 0.8213855206966401, + "num_tokens": 40342128.0, + "step": 33590 + }, + { + "entropy": 1.9364799603819847, + "epoch": 0.10415706016697679, + "grad_norm": 9.228703498840332, + "learning_rate": 7.838893888082218e-06, + "loss": 0.5729, + "mean_token_accuracy": 0.8253035977482795, + "num_tokens": 40353315.0, + "step": 33600 + }, + { + "entropy": 1.9496245756745338, + "epoch": 0.10418805929202649, + "grad_norm": 9.735555648803711, + "learning_rate": 7.837727611628043e-06, + "loss": 0.5944, + "mean_token_accuracy": 0.8161328807473183, + "num_tokens": 40365395.0, + "step": 33610 + }, + { + "entropy": 1.952901628613472, + "epoch": 0.10421905841707618, + "grad_norm": 10.927205085754395, + "learning_rate": 7.836561855577443e-06, + "loss": 0.5448, + "mean_token_accuracy": 0.8294717028737069, + "num_tokens": 40376818.0, + "step": 33620 + }, + { + "entropy": 1.8274588227272033, + "epoch": 0.10425005754212588, + "grad_norm": 9.007339477539062, + "learning_rate": 7.835396619543528e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8340933352708817, + "num_tokens": 40390071.0, + "step": 33630 + }, + { + "entropy": 1.9603443384170531, + "epoch": 0.10428105666717558, + "grad_norm": 9.797590255737305, + "learning_rate": 7.834231903139795e-06, + "loss": 0.6205, + "mean_token_accuracy": 0.8226620614528656, + "num_tokens": 40401203.0, + "step": 33640 + }, + { + "entropy": 1.8578498139977455, + "epoch": 0.10431205579222526, + "grad_norm": 8.355208396911621, + "learning_rate": 7.833067705980151e-06, + "loss": 0.5405, + "mean_token_accuracy": 0.8306270629167557, + "num_tokens": 40413367.0, + "step": 33650 + }, + { + "entropy": 1.8901911661028863, + "epoch": 0.10434305491727495, + "grad_norm": 9.124183654785156, + "learning_rate": 7.831904027678902e-06, + "loss": 0.5805, + "mean_token_accuracy": 0.8174741074442864, + "num_tokens": 40425631.0, + "step": 33660 + }, + { + "entropy": 2.0042662411928176, + "epoch": 0.10437405404232465, + "grad_norm": 8.978107452392578, + "learning_rate": 7.830740867850753e-06, + "loss": 0.5935, + "mean_token_accuracy": 0.8254029154777527, + "num_tokens": 40435603.0, + "step": 33670 + }, + { + "entropy": 1.8774489134550094, + "epoch": 0.10440505316737435, + "grad_norm": 10.732882499694824, + "learning_rate": 7.829578226110816e-06, + "loss": 0.6055, + "mean_token_accuracy": 0.8271615326404571, + "num_tokens": 40447665.0, + "step": 33680 + }, + { + "entropy": 1.981630663573742, + "epoch": 0.10443605229242404, + "grad_norm": 11.340205192565918, + "learning_rate": 7.828416102074594e-06, + "loss": 0.5885, + "mean_token_accuracy": 0.8235415056347847, + "num_tokens": 40458773.0, + "step": 33690 + }, + { + "entropy": 1.8451252445578574, + "epoch": 0.10446705141747374, + "grad_norm": 11.530259132385254, + "learning_rate": 7.827254495357994e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8376246988773346, + "num_tokens": 40471667.0, + "step": 33700 + }, + { + "entropy": 1.950427158176899, + "epoch": 0.10449805054252344, + "grad_norm": 10.422981262207031, + "learning_rate": 7.826093405577326e-06, + "loss": 0.5776, + "mean_token_accuracy": 0.8280929684638977, + "num_tokens": 40483536.0, + "step": 33710 + }, + { + "entropy": 1.9090193212032318, + "epoch": 0.10452904966757313, + "grad_norm": 8.866836547851562, + "learning_rate": 7.824932832349292e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8405329346656799, + "num_tokens": 40494937.0, + "step": 33720 + }, + { + "entropy": 1.8627185627818108, + "epoch": 0.10456004879262283, + "grad_norm": 9.671921730041504, + "learning_rate": 7.823772775290993e-06, + "loss": 0.5157, + "mean_token_accuracy": 0.8325039520859718, + "num_tokens": 40507661.0, + "step": 33730 + }, + { + "entropy": 1.8910223886370658, + "epoch": 0.10459104791767253, + "grad_norm": 4.5752458572387695, + "learning_rate": 7.822613234019927e-06, + "loss": 0.5511, + "mean_token_accuracy": 0.8261340275406838, + "num_tokens": 40520066.0, + "step": 33740 + }, + { + "entropy": 1.9918366223573685, + "epoch": 0.10462204704272222, + "grad_norm": 9.358546257019043, + "learning_rate": 7.821454208153992e-06, + "loss": 0.6184, + "mean_token_accuracy": 0.8228957876563072, + "num_tokens": 40531109.0, + "step": 33750 + }, + { + "entropy": 1.9470916539430618, + "epoch": 0.10465304616777192, + "grad_norm": 10.189216613769531, + "learning_rate": 7.82029569731148e-06, + "loss": 0.6156, + "mean_token_accuracy": 0.8150023117661476, + "num_tokens": 40542537.0, + "step": 33760 + }, + { + "entropy": 1.9262451082468033, + "epoch": 0.1046840452928216, + "grad_norm": 8.381092071533203, + "learning_rate": 7.819137701111077e-06, + "loss": 0.5313, + "mean_token_accuracy": 0.8301904648542404, + "num_tokens": 40554442.0, + "step": 33770 + }, + { + "entropy": 1.910345396399498, + "epoch": 0.1047150444178713, + "grad_norm": 8.140871047973633, + "learning_rate": 7.817980219171866e-06, + "loss": 0.5684, + "mean_token_accuracy": 0.8225700587034226, + "num_tokens": 40566939.0, + "step": 33780 + }, + { + "entropy": 1.974452766776085, + "epoch": 0.104746043542921, + "grad_norm": 7.459367275238037, + "learning_rate": 7.816823251113325e-06, + "loss": 0.5918, + "mean_token_accuracy": 0.8310072511434555, + "num_tokens": 40578413.0, + "step": 33790 + }, + { + "entropy": 1.9643989235162735, + "epoch": 0.1047770426679707, + "grad_norm": 9.469351768493652, + "learning_rate": 7.815666796555324e-06, + "loss": 0.5912, + "mean_token_accuracy": 0.8164726868271828, + "num_tokens": 40589015.0, + "step": 33800 + }, + { + "entropy": 1.7843807175755502, + "epoch": 0.10480804179302039, + "grad_norm": 9.5103178024292, + "learning_rate": 7.814510855118131e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8469722703099251, + "num_tokens": 40602753.0, + "step": 33810 + }, + { + "entropy": 1.786408032476902, + "epoch": 0.10483904091807009, + "grad_norm": 8.267829895019531, + "learning_rate": 7.8133554264224e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8310681402683258, + "num_tokens": 40616558.0, + "step": 33820 + }, + { + "entropy": 1.9000122025609016, + "epoch": 0.10487004004311978, + "grad_norm": 8.900775909423828, + "learning_rate": 7.812200510089185e-06, + "loss": 0.5472, + "mean_token_accuracy": 0.8183383151888848, + "num_tokens": 40629539.0, + "step": 33830 + }, + { + "entropy": 1.86521013379097, + "epoch": 0.10490103916816948, + "grad_norm": 9.411357879638672, + "learning_rate": 7.811046105739927e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8357746317982674, + "num_tokens": 40641726.0, + "step": 33840 + }, + { + "entropy": 1.8550844490528107, + "epoch": 0.10493203829321918, + "grad_norm": 7.637185096740723, + "learning_rate": 7.809892212996458e-06, + "loss": 0.507, + "mean_token_accuracy": 0.836801141500473, + "num_tokens": 40654094.0, + "step": 33850 + }, + { + "entropy": 1.8382833272218704, + "epoch": 0.10496303741826887, + "grad_norm": 9.953614234924316, + "learning_rate": 7.808738831481007e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8312429085373878, + "num_tokens": 40667469.0, + "step": 33860 + }, + { + "entropy": 1.9534501269459725, + "epoch": 0.10499403654331857, + "grad_norm": 11.463193893432617, + "learning_rate": 7.807585960816184e-06, + "loss": 0.567, + "mean_token_accuracy": 0.8186778038740158, + "num_tokens": 40679414.0, + "step": 33870 + }, + { + "entropy": 1.8588482439517975, + "epoch": 0.10502503566836827, + "grad_norm": 8.321738243103027, + "learning_rate": 7.806433600624999e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8387341931462288, + "num_tokens": 40692355.0, + "step": 33880 + }, + { + "entropy": 1.949586683511734, + "epoch": 0.10505603479341795, + "grad_norm": 8.959894180297852, + "learning_rate": 7.805281750530844e-06, + "loss": 0.5933, + "mean_token_accuracy": 0.8129965797066688, + "num_tokens": 40703355.0, + "step": 33890 + }, + { + "entropy": 1.9277196362614633, + "epoch": 0.10508703391846765, + "grad_norm": 4.52024507522583, + "learning_rate": 7.804130410157503e-06, + "loss": 0.536, + "mean_token_accuracy": 0.8277270719408989, + "num_tokens": 40715053.0, + "step": 33900 + }, + { + "entropy": 1.901253941655159, + "epoch": 0.10511803304351734, + "grad_norm": 9.572426795959473, + "learning_rate": 7.802979579129147e-06, + "loss": 0.5616, + "mean_token_accuracy": 0.8186801239848137, + "num_tokens": 40726747.0, + "step": 33910 + }, + { + "entropy": 1.9022594541311264, + "epoch": 0.10514903216856704, + "grad_norm": 9.995318412780762, + "learning_rate": 7.801829257070337e-06, + "loss": 0.5805, + "mean_token_accuracy": 0.8272980287671089, + "num_tokens": 40738081.0, + "step": 33920 + }, + { + "entropy": 1.8792105168104172, + "epoch": 0.10518003129361674, + "grad_norm": 5.292135715484619, + "learning_rate": 7.800679443606019e-06, + "loss": 0.5246, + "mean_token_accuracy": 0.8302574872970581, + "num_tokens": 40750419.0, + "step": 33930 + }, + { + "entropy": 1.8759152442216873, + "epoch": 0.10521103041866643, + "grad_norm": 4.318228721618652, + "learning_rate": 7.799530138361527e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8474466159939766, + "num_tokens": 40762714.0, + "step": 33940 + }, + { + "entropy": 1.871192954480648, + "epoch": 0.10524202954371613, + "grad_norm": 4.8864336013793945, + "learning_rate": 7.79838134096258e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8384427651762962, + "num_tokens": 40775536.0, + "step": 33950 + }, + { + "entropy": 1.8584145426750183, + "epoch": 0.10527302866876583, + "grad_norm": 7.488039493560791, + "learning_rate": 7.797233051035284e-06, + "loss": 0.5697, + "mean_token_accuracy": 0.8298766538500786, + "num_tokens": 40789635.0, + "step": 33960 + }, + { + "entropy": 1.9506609112024307, + "epoch": 0.10530402779381552, + "grad_norm": 9.773181915283203, + "learning_rate": 7.796085268206132e-06, + "loss": 0.5754, + "mean_token_accuracy": 0.8298773288726806, + "num_tokens": 40800485.0, + "step": 33970 + }, + { + "entropy": 1.8901255756616593, + "epoch": 0.10533502691886522, + "grad_norm": 3.6833689212799072, + "learning_rate": 7.794937992102e-06, + "loss": 0.5683, + "mean_token_accuracy": 0.8308203518390656, + "num_tokens": 40812631.0, + "step": 33980 + }, + { + "entropy": 1.8272457644343376, + "epoch": 0.10536602604391491, + "grad_norm": 5.967985153198242, + "learning_rate": 7.793791222350145e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8280576094985008, + "num_tokens": 40825791.0, + "step": 33990 + }, + { + "entropy": 1.9373245880007743, + "epoch": 0.10539702516896461, + "grad_norm": 8.980064392089844, + "learning_rate": 7.792644958578212e-06, + "loss": 0.585, + "mean_token_accuracy": 0.8223650082945824, + "num_tokens": 40837496.0, + "step": 34000 + }, + { + "entropy": 1.9898645401000976, + "epoch": 0.10542802429401431, + "grad_norm": 9.691417694091797, + "learning_rate": 7.79149920041423e-06, + "loss": 0.5725, + "mean_token_accuracy": 0.8341428533196449, + "num_tokens": 40848939.0, + "step": 34010 + }, + { + "entropy": 1.9258566856384278, + "epoch": 0.10545902341906399, + "grad_norm": 9.314157485961914, + "learning_rate": 7.790353947486607e-06, + "loss": 0.5529, + "mean_token_accuracy": 0.83617302775383, + "num_tokens": 40861200.0, + "step": 34020 + }, + { + "entropy": 1.9404375448822975, + "epoch": 0.10549002254411369, + "grad_norm": 9.493596076965332, + "learning_rate": 7.789209199424134e-06, + "loss": 0.5846, + "mean_token_accuracy": 0.8195881083607673, + "num_tokens": 40872706.0, + "step": 34030 + }, + { + "entropy": 1.8327529534697533, + "epoch": 0.10552102166916338, + "grad_norm": 9.407135963439941, + "learning_rate": 7.788064955855987e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8332900017499923, + "num_tokens": 40885468.0, + "step": 34040 + }, + { + "entropy": 1.8383815258741378, + "epoch": 0.10555202079421308, + "grad_norm": 9.750924110412598, + "learning_rate": 7.78692121641172e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.830281549692154, + "num_tokens": 40898646.0, + "step": 34050 + }, + { + "entropy": 1.9488092795014382, + "epoch": 0.10558301991926278, + "grad_norm": 8.798650741577148, + "learning_rate": 7.785777980721267e-06, + "loss": 0.5789, + "mean_token_accuracy": 0.8310556679964065, + "num_tokens": 40909910.0, + "step": 34060 + }, + { + "entropy": 1.8724893182516098, + "epoch": 0.10561401904431247, + "grad_norm": 8.707452774047852, + "learning_rate": 7.784635248414945e-06, + "loss": 0.5413, + "mean_token_accuracy": 0.825086310505867, + "num_tokens": 40921391.0, + "step": 34070 + }, + { + "entropy": 1.8841823562979698, + "epoch": 0.10564501816936217, + "grad_norm": 9.368388175964355, + "learning_rate": 7.783493019123451e-06, + "loss": 0.5391, + "mean_token_accuracy": 0.8284827679395675, + "num_tokens": 40933106.0, + "step": 34080 + }, + { + "entropy": 1.893832103908062, + "epoch": 0.10567601729441187, + "grad_norm": 9.662880897521973, + "learning_rate": 7.78235129247786e-06, + "loss": 0.5353, + "mean_token_accuracy": 0.8264197260141373, + "num_tokens": 40945947.0, + "step": 34090 + }, + { + "entropy": 1.959150141477585, + "epoch": 0.10570701641946156, + "grad_norm": 11.217707633972168, + "learning_rate": 7.781210068109623e-06, + "loss": 0.6322, + "mean_token_accuracy": 0.8149213716387749, + "num_tokens": 40957886.0, + "step": 34100 + }, + { + "entropy": 1.8911303892731666, + "epoch": 0.10573801554451126, + "grad_norm": 9.337475776672363, + "learning_rate": 7.780069345650573e-06, + "loss": 0.5306, + "mean_token_accuracy": 0.8183432757854462, + "num_tokens": 40969448.0, + "step": 34110 + }, + { + "entropy": 1.8776145607233048, + "epoch": 0.10576901466956096, + "grad_norm": 9.5222749710083, + "learning_rate": 7.778929124732918e-06, + "loss": 0.5401, + "mean_token_accuracy": 0.8276842266321183, + "num_tokens": 40981711.0, + "step": 34120 + }, + { + "entropy": 1.94348586499691, + "epoch": 0.10580001379461065, + "grad_norm": 8.874763488769531, + "learning_rate": 7.777789404989248e-06, + "loss": 0.5776, + "mean_token_accuracy": 0.8309635683894158, + "num_tokens": 40992978.0, + "step": 34130 + }, + { + "entropy": 1.8115389600396157, + "epoch": 0.10583101291966034, + "grad_norm": 8.382539749145508, + "learning_rate": 7.776650186052521e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8349054649472236, + "num_tokens": 41005680.0, + "step": 34140 + }, + { + "entropy": 1.8448505356907845, + "epoch": 0.10586201204471003, + "grad_norm": 4.7137451171875, + "learning_rate": 7.77551146755608e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8491442829370499, + "num_tokens": 41017977.0, + "step": 34150 + }, + { + "entropy": 1.8678378522396089, + "epoch": 0.10589301116975973, + "grad_norm": 8.694931983947754, + "learning_rate": 7.774373249133641e-06, + "loss": 0.5733, + "mean_token_accuracy": 0.8227035477757454, + "num_tokens": 41029814.0, + "step": 34160 + }, + { + "entropy": 1.9137464344501496, + "epoch": 0.10592401029480943, + "grad_norm": 8.952034950256348, + "learning_rate": 7.773235530419292e-06, + "loss": 0.558, + "mean_token_accuracy": 0.8287974938750267, + "num_tokens": 41041332.0, + "step": 34170 + }, + { + "entropy": 1.872650384902954, + "epoch": 0.10595500941985912, + "grad_norm": 9.672776222229004, + "learning_rate": 7.7720983110475e-06, + "loss": 0.5314, + "mean_token_accuracy": 0.8209738954901695, + "num_tokens": 41054678.0, + "step": 34180 + }, + { + "entropy": 1.8565445743501185, + "epoch": 0.10598600854490882, + "grad_norm": 2.5345826148986816, + "learning_rate": 7.770961590653102e-06, + "loss": 0.5304, + "mean_token_accuracy": 0.8245491355657577, + "num_tokens": 41067439.0, + "step": 34190 + }, + { + "entropy": 1.997004970908165, + "epoch": 0.10601700766995852, + "grad_norm": 10.320369720458984, + "learning_rate": 7.769825368871312e-06, + "loss": 0.5853, + "mean_token_accuracy": 0.8295348644256592, + "num_tokens": 41077968.0, + "step": 34200 + }, + { + "entropy": 1.916537807881832, + "epoch": 0.10604800679500821, + "grad_norm": 9.567767143249512, + "learning_rate": 7.76868964533772e-06, + "loss": 0.559, + "mean_token_accuracy": 0.8223750725388527, + "num_tokens": 41089515.0, + "step": 34210 + }, + { + "entropy": 1.8700389847159387, + "epoch": 0.10607900592005791, + "grad_norm": 8.427227020263672, + "learning_rate": 7.767554419688279e-06, + "loss": 0.5372, + "mean_token_accuracy": 0.8274742797017097, + "num_tokens": 41103388.0, + "step": 34220 + }, + { + "entropy": 1.920013178884983, + "epoch": 0.1061100050451076, + "grad_norm": 9.945453643798828, + "learning_rate": 7.766419691559324e-06, + "loss": 0.5577, + "mean_token_accuracy": 0.8253174960613251, + "num_tokens": 41116163.0, + "step": 34230 + }, + { + "entropy": 1.8836813405156136, + "epoch": 0.1061410041701573, + "grad_norm": 7.964671611785889, + "learning_rate": 7.765285460587557e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8437875971198082, + "num_tokens": 41128197.0, + "step": 34240 + }, + { + "entropy": 1.8750670045614242, + "epoch": 0.106172003295207, + "grad_norm": 8.137727737426758, + "learning_rate": 7.764151726410055e-06, + "loss": 0.5366, + "mean_token_accuracy": 0.8270835474133491, + "num_tokens": 41140601.0, + "step": 34250 + }, + { + "entropy": 1.9073902159929275, + "epoch": 0.10620300242025668, + "grad_norm": 8.645315170288086, + "learning_rate": 7.76301848866426e-06, + "loss": 0.5461, + "mean_token_accuracy": 0.8317181885242462, + "num_tokens": 41152349.0, + "step": 34260 + }, + { + "entropy": 1.9043108850717545, + "epoch": 0.10623400154530638, + "grad_norm": 10.44301700592041, + "learning_rate": 7.761885746987988e-06, + "loss": 0.5454, + "mean_token_accuracy": 0.8260853886604309, + "num_tokens": 41164311.0, + "step": 34270 + }, + { + "entropy": 1.791078907251358, + "epoch": 0.10626500067035607, + "grad_norm": 8.425952911376953, + "learning_rate": 7.760753501019428e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8489831522107124, + "num_tokens": 41178238.0, + "step": 34280 + }, + { + "entropy": 1.8554448202252387, + "epoch": 0.10629599979540577, + "grad_norm": 8.66896915435791, + "learning_rate": 7.759621750397129e-06, + "loss": 0.5169, + "mean_token_accuracy": 0.8302797332406044, + "num_tokens": 41191021.0, + "step": 34290 + }, + { + "entropy": 1.9642982304096221, + "epoch": 0.10632699892045547, + "grad_norm": 9.48714828491211, + "learning_rate": 7.758490494760018e-06, + "loss": 0.5871, + "mean_token_accuracy": 0.8185561686754227, + "num_tokens": 41202431.0, + "step": 34300 + }, + { + "entropy": 1.8738651275634766, + "epoch": 0.10635799804550516, + "grad_norm": 4.17008113861084, + "learning_rate": 7.757359733747389e-06, + "loss": 0.5411, + "mean_token_accuracy": 0.830881142616272, + "num_tokens": 41215618.0, + "step": 34310 + }, + { + "entropy": 1.993169930577278, + "epoch": 0.10638899717055486, + "grad_norm": 8.395801544189453, + "learning_rate": 7.756229466998896e-06, + "loss": 0.6001, + "mean_token_accuracy": 0.8117834225296974, + "num_tokens": 41226702.0, + "step": 34320 + }, + { + "entropy": 1.8517363399267197, + "epoch": 0.10641999629560456, + "grad_norm": 8.656838417053223, + "learning_rate": 7.755099694154571e-06, + "loss": 0.5783, + "mean_token_accuracy": 0.8280891165137291, + "num_tokens": 41239449.0, + "step": 34330 + }, + { + "entropy": 1.9847760811448096, + "epoch": 0.10645099542065425, + "grad_norm": 8.594053268432617, + "learning_rate": 7.753970414854808e-06, + "loss": 0.6134, + "mean_token_accuracy": 0.806470163166523, + "num_tokens": 41250602.0, + "step": 34340 + }, + { + "entropy": 1.9564678460359572, + "epoch": 0.10648199454570395, + "grad_norm": 12.032402992248535, + "learning_rate": 7.752841628740366e-06, + "loss": 0.6145, + "mean_token_accuracy": 0.8159340843558311, + "num_tokens": 41262285.0, + "step": 34350 + }, + { + "entropy": 1.9261070042848587, + "epoch": 0.10651299367075365, + "grad_norm": 4.049191951751709, + "learning_rate": 7.751713335452372e-06, + "loss": 0.5818, + "mean_token_accuracy": 0.8276063561439514, + "num_tokens": 41273764.0, + "step": 34360 + }, + { + "entropy": 1.879560787975788, + "epoch": 0.10654399279580334, + "grad_norm": 8.463150024414062, + "learning_rate": 7.750585534632318e-06, + "loss": 0.525, + "mean_token_accuracy": 0.8304709896445275, + "num_tokens": 41285963.0, + "step": 34370 + }, + { + "entropy": 1.8946872353553772, + "epoch": 0.10657499192085304, + "grad_norm": 3.9842400550842285, + "learning_rate": 7.74945822592206e-06, + "loss": 0.5734, + "mean_token_accuracy": 0.8228972956538201, + "num_tokens": 41297942.0, + "step": 34380 + }, + { + "entropy": 1.9021208494901658, + "epoch": 0.10660599104590272, + "grad_norm": 8.44839096069336, + "learning_rate": 7.748331408963822e-06, + "loss": 0.5639, + "mean_token_accuracy": 0.8181784346699714, + "num_tokens": 41309011.0, + "step": 34390 + }, + { + "entropy": 1.9210452124476434, + "epoch": 0.10663699017095242, + "grad_norm": 7.22066593170166, + "learning_rate": 7.747205083400192e-06, + "loss": 0.5812, + "mean_token_accuracy": 0.8235123857855797, + "num_tokens": 41320533.0, + "step": 34400 + }, + { + "entropy": 1.8867201492190362, + "epoch": 0.10666798929600212, + "grad_norm": 5.259980201721191, + "learning_rate": 7.746079248874114e-06, + "loss": 0.577, + "mean_token_accuracy": 0.8186955004930496, + "num_tokens": 41333184.0, + "step": 34410 + }, + { + "entropy": 1.8663445100188256, + "epoch": 0.10669898842105181, + "grad_norm": 10.16250228881836, + "learning_rate": 7.7449539050289e-06, + "loss": 0.5772, + "mean_token_accuracy": 0.8328115671873093, + "num_tokens": 41344781.0, + "step": 34420 + }, + { + "entropy": 1.8742147445678712, + "epoch": 0.10672998754610151, + "grad_norm": 8.090653419494629, + "learning_rate": 7.743829051508229e-06, + "loss": 0.5541, + "mean_token_accuracy": 0.8289492219686508, + "num_tokens": 41356329.0, + "step": 34430 + }, + { + "entropy": 1.8659478917717933, + "epoch": 0.1067609866711512, + "grad_norm": 11.673904418945312, + "learning_rate": 7.742704687956137e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.8437393367290497, + "num_tokens": 41368494.0, + "step": 34440 + }, + { + "entropy": 1.884896233677864, + "epoch": 0.1067919857962009, + "grad_norm": 8.397565841674805, + "learning_rate": 7.741580814017023e-06, + "loss": 0.5282, + "mean_token_accuracy": 0.830049929022789, + "num_tokens": 41380843.0, + "step": 34450 + }, + { + "entropy": 1.8628281027078628, + "epoch": 0.1068229849212506, + "grad_norm": 9.932318687438965, + "learning_rate": 7.740457429335646e-06, + "loss": 0.5343, + "mean_token_accuracy": 0.8300712257623672, + "num_tokens": 41393082.0, + "step": 34460 + }, + { + "entropy": 1.8951723709702493, + "epoch": 0.1068539840463003, + "grad_norm": 9.382803916931152, + "learning_rate": 7.739334533557126e-06, + "loss": 0.5893, + "mean_token_accuracy": 0.8235040470957756, + "num_tokens": 41404878.0, + "step": 34470 + }, + { + "entropy": 1.8519971266388893, + "epoch": 0.10688498317134999, + "grad_norm": 9.738779067993164, + "learning_rate": 7.738212126326949e-06, + "loss": 0.5648, + "mean_token_accuracy": 0.8380891785025597, + "num_tokens": 41416588.0, + "step": 34480 + }, + { + "entropy": 1.9151021018624306, + "epoch": 0.10691598229639969, + "grad_norm": 9.413227081298828, + "learning_rate": 7.73709020729095e-06, + "loss": 0.655, + "mean_token_accuracy": 0.8125704079866409, + "num_tokens": 41428429.0, + "step": 34490 + }, + { + "entropy": 1.9181360185146332, + "epoch": 0.10694698142144939, + "grad_norm": 8.843558311462402, + "learning_rate": 7.735968776095331e-06, + "loss": 0.5688, + "mean_token_accuracy": 0.8319054901599884, + "num_tokens": 41440103.0, + "step": 34500 + }, + { + "entropy": 1.8450947090983392, + "epoch": 0.10697798054649907, + "grad_norm": 9.410344123840332, + "learning_rate": 7.734847832386653e-06, + "loss": 0.5725, + "mean_token_accuracy": 0.8249197214841842, + "num_tokens": 41452461.0, + "step": 34510 + }, + { + "entropy": 1.8387939289212227, + "epoch": 0.10700897967154877, + "grad_norm": 3.964416980743408, + "learning_rate": 7.73372737581183e-06, + "loss": 0.5444, + "mean_token_accuracy": 0.8225889384746552, + "num_tokens": 41464924.0, + "step": 34520 + }, + { + "entropy": 1.929051786661148, + "epoch": 0.10703997879659846, + "grad_norm": 12.12761402130127, + "learning_rate": 7.73260740601814e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.8324173107743263, + "num_tokens": 41475753.0, + "step": 34530 + }, + { + "entropy": 1.9498266860842706, + "epoch": 0.10707097792164816, + "grad_norm": 3.5817205905914307, + "learning_rate": 7.731487922653216e-06, + "loss": 0.6098, + "mean_token_accuracy": 0.8192368969321251, + "num_tokens": 41486719.0, + "step": 34540 + }, + { + "entropy": 1.7467003166675568, + "epoch": 0.10710197704669785, + "grad_norm": 4.695430278778076, + "learning_rate": 7.730368925365049e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8428031265735626, + "num_tokens": 41500593.0, + "step": 34550 + }, + { + "entropy": 1.9088023439049722, + "epoch": 0.10713297617174755, + "grad_norm": 9.617088317871094, + "learning_rate": 7.72925041380198e-06, + "loss": 0.5735, + "mean_token_accuracy": 0.8277084872126579, + "num_tokens": 41512514.0, + "step": 34560 + }, + { + "entropy": 1.9499268174171447, + "epoch": 0.10716397529679725, + "grad_norm": 9.283459663391113, + "learning_rate": 7.728132387612718e-06, + "loss": 0.6333, + "mean_token_accuracy": 0.8034616574645043, + "num_tokens": 41523613.0, + "step": 34570 + }, + { + "entropy": 1.857503816485405, + "epoch": 0.10719497442184694, + "grad_norm": 9.62498950958252, + "learning_rate": 7.727014846446315e-06, + "loss": 0.5637, + "mean_token_accuracy": 0.8330330818891525, + "num_tokens": 41536222.0, + "step": 34580 + }, + { + "entropy": 1.927737507224083, + "epoch": 0.10722597354689664, + "grad_norm": 7.827761173248291, + "learning_rate": 7.72589778995219e-06, + "loss": 0.6017, + "mean_token_accuracy": 0.8250738069415092, + "num_tokens": 41547116.0, + "step": 34590 + }, + { + "entropy": 1.894035741686821, + "epoch": 0.10725697267194634, + "grad_norm": 7.857428073883057, + "learning_rate": 7.724781217780106e-06, + "loss": 0.546, + "mean_token_accuracy": 0.8303033024072647, + "num_tokens": 41558322.0, + "step": 34600 + }, + { + "entropy": 1.8952913254499435, + "epoch": 0.10728797179699603, + "grad_norm": 7.348844051361084, + "learning_rate": 7.723665129580187e-06, + "loss": 0.6053, + "mean_token_accuracy": 0.8175556018948555, + "num_tokens": 41569188.0, + "step": 34610 + }, + { + "entropy": 1.8319330915808678, + "epoch": 0.10731897092204573, + "grad_norm": 4.2293901443481445, + "learning_rate": 7.72254952500291e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.8246565207839012, + "num_tokens": 41582070.0, + "step": 34620 + }, + { + "entropy": 1.9387992650270462, + "epoch": 0.10734997004709541, + "grad_norm": 9.970220565795898, + "learning_rate": 7.721434403699101e-06, + "loss": 0.6521, + "mean_token_accuracy": 0.8120075181126595, + "num_tokens": 41593014.0, + "step": 34630 + }, + { + "entropy": 1.781299701333046, + "epoch": 0.10738096917214511, + "grad_norm": 4.431691646575928, + "learning_rate": 7.720319765319946e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8438036203384399, + "num_tokens": 41606201.0, + "step": 34640 + }, + { + "entropy": 1.902137640118599, + "epoch": 0.10741196829719481, + "grad_norm": 8.883857727050781, + "learning_rate": 7.719205609516975e-06, + "loss": 0.6318, + "mean_token_accuracy": 0.8142830148339272, + "num_tokens": 41616952.0, + "step": 34650 + }, + { + "entropy": 1.7906480133533478, + "epoch": 0.1074429674222445, + "grad_norm": 8.499350547790527, + "learning_rate": 7.718091935942078e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.8440255209803581, + "num_tokens": 41629686.0, + "step": 34660 + }, + { + "entropy": 1.8703452154994011, + "epoch": 0.1074739665472942, + "grad_norm": 9.963065147399902, + "learning_rate": 7.71697874424749e-06, + "loss": 0.5693, + "mean_token_accuracy": 0.8165108144283295, + "num_tokens": 41641553.0, + "step": 34670 + }, + { + "entropy": 1.842675694823265, + "epoch": 0.1075049656723439, + "grad_norm": 7.852229118347168, + "learning_rate": 7.7158660340858e-06, + "loss": 0.5359, + "mean_token_accuracy": 0.8368835672736168, + "num_tokens": 41653222.0, + "step": 34680 + }, + { + "entropy": 1.8542825773358345, + "epoch": 0.1075359647973936, + "grad_norm": 8.768325805664062, + "learning_rate": 7.71475380510995e-06, + "loss": 0.5247, + "mean_token_accuracy": 0.8340888366103172, + "num_tokens": 41665255.0, + "step": 34690 + }, + { + "entropy": 1.796558152139187, + "epoch": 0.10756696392244329, + "grad_norm": 9.535799980163574, + "learning_rate": 7.713642056973227e-06, + "loss": 0.5226, + "mean_token_accuracy": 0.8312591359019279, + "num_tokens": 41677579.0, + "step": 34700 + }, + { + "entropy": 1.8598329350352287, + "epoch": 0.10759796304749299, + "grad_norm": 9.506328582763672, + "learning_rate": 7.71253078932927e-06, + "loss": 0.5596, + "mean_token_accuracy": 0.8224905788898468, + "num_tokens": 41688761.0, + "step": 34710 + }, + { + "entropy": 1.8570543482899666, + "epoch": 0.10762896217254268, + "grad_norm": 8.487010955810547, + "learning_rate": 7.711420001832066e-06, + "loss": 0.5343, + "mean_token_accuracy": 0.8308738321065903, + "num_tokens": 41700400.0, + "step": 34720 + }, + { + "entropy": 1.8241771847009658, + "epoch": 0.10765996129759238, + "grad_norm": 8.443219184875488, + "learning_rate": 7.710309694135956e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8372260481119156, + "num_tokens": 41712818.0, + "step": 34730 + }, + { + "entropy": 1.8751182019710542, + "epoch": 0.10769096042264208, + "grad_norm": 10.146086692810059, + "learning_rate": 7.709199865895622e-06, + "loss": 0.5334, + "mean_token_accuracy": 0.8366193175315857, + "num_tokens": 41724064.0, + "step": 34740 + }, + { + "entropy": 1.8233239591121673, + "epoch": 0.10772195954769177, + "grad_norm": 10.28363037109375, + "learning_rate": 7.708090516766096e-06, + "loss": 0.5517, + "mean_token_accuracy": 0.8335344001650811, + "num_tokens": 41736368.0, + "step": 34750 + }, + { + "entropy": 1.8685315743088722, + "epoch": 0.10775295867274146, + "grad_norm": 4.088825702667236, + "learning_rate": 7.706981646402762e-06, + "loss": 0.5963, + "mean_token_accuracy": 0.8252598002552987, + "num_tokens": 41747379.0, + "step": 34760 + }, + { + "entropy": 1.8391776710748673, + "epoch": 0.10778395779779115, + "grad_norm": 9.287007331848145, + "learning_rate": 7.705873254461345e-06, + "loss": 0.5292, + "mean_token_accuracy": 0.8283318698406219, + "num_tokens": 41759137.0, + "step": 34770 + }, + { + "entropy": 1.825559838116169, + "epoch": 0.10781495692284085, + "grad_norm": 8.258102416992188, + "learning_rate": 7.704765340597917e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8404336258769035, + "num_tokens": 41771923.0, + "step": 34780 + }, + { + "entropy": 1.876462672650814, + "epoch": 0.10784595604789055, + "grad_norm": 9.638961791992188, + "learning_rate": 7.703657904468902e-06, + "loss": 0.5253, + "mean_token_accuracy": 0.8340917885303497, + "num_tokens": 41784775.0, + "step": 34790 + }, + { + "entropy": 1.7784705072641374, + "epoch": 0.10787695517294024, + "grad_norm": 8.940130233764648, + "learning_rate": 7.702550945731066e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.831823606789112, + "num_tokens": 41797943.0, + "step": 34800 + }, + { + "entropy": 1.882892769575119, + "epoch": 0.10790795429798994, + "grad_norm": 10.979787826538086, + "learning_rate": 7.701444464041514e-06, + "loss": 0.5337, + "mean_token_accuracy": 0.827871498465538, + "num_tokens": 41810184.0, + "step": 34810 + }, + { + "entropy": 1.912169161438942, + "epoch": 0.10793895342303964, + "grad_norm": 8.458621978759766, + "learning_rate": 7.700338459057705e-06, + "loss": 0.5786, + "mean_token_accuracy": 0.8240523219108582, + "num_tokens": 41821501.0, + "step": 34820 + }, + { + "entropy": 1.8121132165193559, + "epoch": 0.10796995254808933, + "grad_norm": 4.544675350189209, + "learning_rate": 7.699232930437439e-06, + "loss": 0.5187, + "mean_token_accuracy": 0.8329327836632728, + "num_tokens": 41834757.0, + "step": 34830 + }, + { + "entropy": 1.811763161420822, + "epoch": 0.10800095167313903, + "grad_norm": 5.41200590133667, + "learning_rate": 7.698127877838858e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8376516059041024, + "num_tokens": 41847967.0, + "step": 34840 + }, + { + "entropy": 1.9670744597911836, + "epoch": 0.10803195079818873, + "grad_norm": 8.704374313354492, + "learning_rate": 7.69702330092045e-06, + "loss": 0.5992, + "mean_token_accuracy": 0.8268567532300949, + "num_tokens": 41859345.0, + "step": 34850 + }, + { + "entropy": 1.9194496154785157, + "epoch": 0.10806294992323842, + "grad_norm": 4.117202281951904, + "learning_rate": 7.695919199341043e-06, + "loss": 0.5799, + "mean_token_accuracy": 0.8342431426048279, + "num_tokens": 41870624.0, + "step": 34860 + }, + { + "entropy": 1.8487198695540428, + "epoch": 0.10809394904828812, + "grad_norm": 4.074057102203369, + "learning_rate": 7.694815572759812e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8297583907842636, + "num_tokens": 41882966.0, + "step": 34870 + }, + { + "entropy": 1.8381000190973282, + "epoch": 0.1081249481733378, + "grad_norm": 7.816097736358643, + "learning_rate": 7.693712420836265e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8306377053260803, + "num_tokens": 41895795.0, + "step": 34880 + }, + { + "entropy": 1.9027506560087204, + "epoch": 0.1081559472983875, + "grad_norm": 8.619202613830566, + "learning_rate": 7.692609743230265e-06, + "loss": 0.5376, + "mean_token_accuracy": 0.8351558074355125, + "num_tokens": 41907196.0, + "step": 34890 + }, + { + "entropy": 1.937445905804634, + "epoch": 0.1081869464234372, + "grad_norm": 8.807971000671387, + "learning_rate": 7.691507539602005e-06, + "loss": 0.6096, + "mean_token_accuracy": 0.8202361524105072, + "num_tokens": 41918599.0, + "step": 34900 + }, + { + "entropy": 1.8574762254953385, + "epoch": 0.10821794554848689, + "grad_norm": 8.557307243347168, + "learning_rate": 7.690405809612025e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8375605642795563, + "num_tokens": 41930689.0, + "step": 34910 + }, + { + "entropy": 1.9088428899645806, + "epoch": 0.10824894467353659, + "grad_norm": 8.38049602508545, + "learning_rate": 7.689304552921199e-06, + "loss": 0.5719, + "mean_token_accuracy": 0.8217715606093406, + "num_tokens": 41941970.0, + "step": 34920 + }, + { + "entropy": 1.8612014800310135, + "epoch": 0.10827994379858628, + "grad_norm": 10.078642845153809, + "learning_rate": 7.688203769190748e-06, + "loss": 0.5714, + "mean_token_accuracy": 0.8198879033327102, + "num_tokens": 41954804.0, + "step": 34930 + }, + { + "entropy": 1.8768370226025581, + "epoch": 0.10831094292363598, + "grad_norm": 10.8411865234375, + "learning_rate": 7.687103458082228e-06, + "loss": 0.534, + "mean_token_accuracy": 0.8370490476489068, + "num_tokens": 41965644.0, + "step": 34940 + }, + { + "entropy": 1.8967427670955659, + "epoch": 0.10834194204868568, + "grad_norm": 9.28530502319336, + "learning_rate": 7.686003619257535e-06, + "loss": 0.6235, + "mean_token_accuracy": 0.8072300642728806, + "num_tokens": 41977403.0, + "step": 34950 + }, + { + "entropy": 1.8946552872657776, + "epoch": 0.10837294117373537, + "grad_norm": 4.223282337188721, + "learning_rate": 7.684904252378904e-06, + "loss": 0.6063, + "mean_token_accuracy": 0.8308607801795006, + "num_tokens": 41989197.0, + "step": 34960 + }, + { + "entropy": 1.9117924958467483, + "epoch": 0.10840394029878507, + "grad_norm": 12.681775093078613, + "learning_rate": 7.683805357108907e-06, + "loss": 0.5557, + "mean_token_accuracy": 0.823358316719532, + "num_tokens": 42000840.0, + "step": 34970 + }, + { + "entropy": 1.8810236111283303, + "epoch": 0.10843493942383477, + "grad_norm": 8.788151741027832, + "learning_rate": 7.682706933110456e-06, + "loss": 0.5739, + "mean_token_accuracy": 0.8217835262417793, + "num_tokens": 42013194.0, + "step": 34980 + }, + { + "entropy": 1.8799246475100517, + "epoch": 0.10846593854888446, + "grad_norm": 8.282735824584961, + "learning_rate": 7.681608980046798e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.8285661846399307, + "num_tokens": 42025091.0, + "step": 34990 + }, + { + "entropy": 1.8529492557048797, + "epoch": 0.10849693767393416, + "grad_norm": 8.723884582519531, + "learning_rate": 7.680511497581516e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8336036711931228, + "num_tokens": 42038406.0, + "step": 35000 + }, + { + "entropy": 1.8494236335158347, + "epoch": 0.10852793679898384, + "grad_norm": 4.324466705322266, + "learning_rate": 7.67941448537853e-06, + "loss": 0.5534, + "mean_token_accuracy": 0.8200669184327125, + "num_tokens": 42051116.0, + "step": 35010 + }, + { + "entropy": 1.9139706775546075, + "epoch": 0.10855893592403354, + "grad_norm": 8.67935562133789, + "learning_rate": 7.6783179431021e-06, + "loss": 0.5685, + "mean_token_accuracy": 0.8277176558971405, + "num_tokens": 42063002.0, + "step": 35020 + }, + { + "entropy": 1.8269087105989457, + "epoch": 0.10858993504908324, + "grad_norm": 9.06470775604248, + "learning_rate": 7.677221870416817e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8408965677022934, + "num_tokens": 42075421.0, + "step": 35030 + }, + { + "entropy": 1.862933087348938, + "epoch": 0.10862093417413293, + "grad_norm": 9.062261581420898, + "learning_rate": 7.676126266987606e-06, + "loss": 0.5405, + "mean_token_accuracy": 0.8264860972762108, + "num_tokens": 42087233.0, + "step": 35040 + }, + { + "entropy": 1.874533024430275, + "epoch": 0.10865193329918263, + "grad_norm": 8.091785430908203, + "learning_rate": 7.67503113247973e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8257019862532615, + "num_tokens": 42099681.0, + "step": 35050 + }, + { + "entropy": 1.956665936112404, + "epoch": 0.10868293242423233, + "grad_norm": 8.70102310180664, + "learning_rate": 7.673936466558786e-06, + "loss": 0.6122, + "mean_token_accuracy": 0.8198647305369378, + "num_tokens": 42110874.0, + "step": 35060 + }, + { + "entropy": 1.912957863509655, + "epoch": 0.10871393154928202, + "grad_norm": 9.428136825561523, + "learning_rate": 7.672842268890703e-06, + "loss": 0.6027, + "mean_token_accuracy": 0.8237558603286743, + "num_tokens": 42122704.0, + "step": 35070 + }, + { + "entropy": 1.9081886559724808, + "epoch": 0.10874493067433172, + "grad_norm": 9.511113166809082, + "learning_rate": 7.671748539141744e-06, + "loss": 0.6123, + "mean_token_accuracy": 0.8153191924095153, + "num_tokens": 42135165.0, + "step": 35080 + }, + { + "entropy": 1.8714108362793922, + "epoch": 0.10877592979938142, + "grad_norm": 9.29901123046875, + "learning_rate": 7.670655276978506e-06, + "loss": 0.5503, + "mean_token_accuracy": 0.8311486825346946, + "num_tokens": 42147431.0, + "step": 35090 + }, + { + "entropy": 1.8901761874556542, + "epoch": 0.10880692892443111, + "grad_norm": 8.597390174865723, + "learning_rate": 7.669562482067915e-06, + "loss": 0.5282, + "mean_token_accuracy": 0.8393839746713638, + "num_tokens": 42159935.0, + "step": 35100 + }, + { + "entropy": 1.8835955768823625, + "epoch": 0.10883792804948081, + "grad_norm": 8.558197021484375, + "learning_rate": 7.668470154077237e-06, + "loss": 0.594, + "mean_token_accuracy": 0.8140904232859612, + "num_tokens": 42172790.0, + "step": 35110 + }, + { + "entropy": 1.9337810754776001, + "epoch": 0.1088689271745305, + "grad_norm": 8.871868133544922, + "learning_rate": 7.667378292674056e-06, + "loss": 0.5931, + "mean_token_accuracy": 0.8145381569862366, + "num_tokens": 42184215.0, + "step": 35120 + }, + { + "entropy": 1.8214157864451408, + "epoch": 0.10889992629958019, + "grad_norm": 4.693521499633789, + "learning_rate": 7.666286897526304e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8404714584350585, + "num_tokens": 42197411.0, + "step": 35130 + }, + { + "entropy": 1.8844437003135681, + "epoch": 0.10893092542462988, + "grad_norm": 9.958372116088867, + "learning_rate": 7.66519596830223e-06, + "loss": 0.5385, + "mean_token_accuracy": 0.822202742099762, + "num_tokens": 42209406.0, + "step": 35140 + }, + { + "entropy": 1.9353954821825028, + "epoch": 0.10896192454967958, + "grad_norm": 9.83383560180664, + "learning_rate": 7.664105504670418e-06, + "loss": 0.5952, + "mean_token_accuracy": 0.8054858520627022, + "num_tokens": 42221328.0, + "step": 35150 + }, + { + "entropy": 1.7801556140184402, + "epoch": 0.10899292367472928, + "grad_norm": 5.5207061767578125, + "learning_rate": 7.663015506299786e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.844315542280674, + "num_tokens": 42234769.0, + "step": 35160 + }, + { + "entropy": 1.862704548239708, + "epoch": 0.10902392279977897, + "grad_norm": 7.391566276550293, + "learning_rate": 7.661925972859575e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8368149012327194, + "num_tokens": 42247060.0, + "step": 35170 + }, + { + "entropy": 1.9155489355325699, + "epoch": 0.10905492192482867, + "grad_norm": 4.612364768981934, + "learning_rate": 7.66083690401936e-06, + "loss": 0.5633, + "mean_token_accuracy": 0.8262100204825401, + "num_tokens": 42258503.0, + "step": 35180 + }, + { + "entropy": 1.9643469721078872, + "epoch": 0.10908592104987837, + "grad_norm": 9.24181079864502, + "learning_rate": 7.659748299449044e-06, + "loss": 0.6368, + "mean_token_accuracy": 0.8126005738973617, + "num_tokens": 42269513.0, + "step": 35190 + }, + { + "entropy": 1.8432982012629509, + "epoch": 0.10911692017492806, + "grad_norm": 9.919602394104004, + "learning_rate": 7.658660158818853e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8412719219923019, + "num_tokens": 42281589.0, + "step": 35200 + }, + { + "entropy": 1.8541820272803307, + "epoch": 0.10914791929997776, + "grad_norm": 4.436375617980957, + "learning_rate": 7.657572481799348e-06, + "loss": 0.5486, + "mean_token_accuracy": 0.8266458585858345, + "num_tokens": 42293563.0, + "step": 35210 + }, + { + "entropy": 1.8942555904388427, + "epoch": 0.10917891842502746, + "grad_norm": 10.810229301452637, + "learning_rate": 7.656485268061414e-06, + "loss": 0.5745, + "mean_token_accuracy": 0.8252988263964653, + "num_tokens": 42305355.0, + "step": 35220 + }, + { + "entropy": 1.8540264323353768, + "epoch": 0.10920991755007715, + "grad_norm": 8.089149475097656, + "learning_rate": 7.655398517276262e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8406173124909401, + "num_tokens": 42317042.0, + "step": 35230 + }, + { + "entropy": 1.831780730187893, + "epoch": 0.10924091667512685, + "grad_norm": 10.136151313781738, + "learning_rate": 7.654312229115433e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8417727887630463, + "num_tokens": 42329730.0, + "step": 35240 + }, + { + "entropy": 1.8968460232019424, + "epoch": 0.10927191580017653, + "grad_norm": 10.715241432189941, + "learning_rate": 7.65322640325079e-06, + "loss": 0.6043, + "mean_token_accuracy": 0.8156336963176727, + "num_tokens": 42341271.0, + "step": 35250 + }, + { + "entropy": 1.8117874696850778, + "epoch": 0.10930291492522623, + "grad_norm": 8.665305137634277, + "learning_rate": 7.652141039354524e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8276491865515709, + "num_tokens": 42354850.0, + "step": 35260 + }, + { + "entropy": 1.85145965218544, + "epoch": 0.10933391405027593, + "grad_norm": 10.74636173248291, + "learning_rate": 7.651056137099154e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8369231000542641, + "num_tokens": 42367039.0, + "step": 35270 + }, + { + "entropy": 1.818227480351925, + "epoch": 0.10936491317532562, + "grad_norm": 10.717218399047852, + "learning_rate": 7.649971696157518e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8339720323681832, + "num_tokens": 42380069.0, + "step": 35280 + }, + { + "entropy": 1.8332565039396287, + "epoch": 0.10939591230037532, + "grad_norm": 8.914962768554688, + "learning_rate": 7.648887716202781e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8378045499324799, + "num_tokens": 42392532.0, + "step": 35290 + }, + { + "entropy": 1.8315134570002556, + "epoch": 0.10942691142542502, + "grad_norm": 8.588201522827148, + "learning_rate": 7.647804196908435e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8481560617685318, + "num_tokens": 42404545.0, + "step": 35300 + }, + { + "entropy": 1.9187032520771026, + "epoch": 0.10945791055047471, + "grad_norm": 7.530204772949219, + "learning_rate": 7.646721137948292e-06, + "loss": 0.6584, + "mean_token_accuracy": 0.8151246398687363, + "num_tokens": 42416517.0, + "step": 35310 + }, + { + "entropy": 1.8617155611515046, + "epoch": 0.10948890967552441, + "grad_norm": 9.000489234924316, + "learning_rate": 7.64563853899649e-06, + "loss": 0.5521, + "mean_token_accuracy": 0.837635052204132, + "num_tokens": 42429370.0, + "step": 35320 + }, + { + "entropy": 1.9121372044086455, + "epoch": 0.1095199088005741, + "grad_norm": 11.23980712890625, + "learning_rate": 7.644556399727486e-06, + "loss": 0.5493, + "mean_token_accuracy": 0.8292974248528481, + "num_tokens": 42440341.0, + "step": 35330 + }, + { + "entropy": 1.9182278335094451, + "epoch": 0.1095509079256238, + "grad_norm": 10.349918365478516, + "learning_rate": 7.643474719816064e-06, + "loss": 0.5964, + "mean_token_accuracy": 0.8224563241004944, + "num_tokens": 42451755.0, + "step": 35340 + }, + { + "entropy": 1.9242574140429496, + "epoch": 0.1095819070506735, + "grad_norm": 8.09676742553711, + "learning_rate": 7.642393498937326e-06, + "loss": 0.5653, + "mean_token_accuracy": 0.8311538890004158, + "num_tokens": 42463039.0, + "step": 35350 + }, + { + "entropy": 1.9706586122512817, + "epoch": 0.1096129061757232, + "grad_norm": 10.670557022094727, + "learning_rate": 7.6413127367667e-06, + "loss": 0.6166, + "mean_token_accuracy": 0.817748686671257, + "num_tokens": 42473647.0, + "step": 35360 + }, + { + "entropy": 1.8795671597123147, + "epoch": 0.10964390530077289, + "grad_norm": 4.104644775390625, + "learning_rate": 7.640232432979932e-06, + "loss": 0.5604, + "mean_token_accuracy": 0.8274060249328613, + "num_tokens": 42485582.0, + "step": 35370 + }, + { + "entropy": 1.9466561555862427, + "epoch": 0.10967490442582258, + "grad_norm": 11.086281776428223, + "learning_rate": 7.639152587253087e-06, + "loss": 0.6171, + "mean_token_accuracy": 0.8145519152283669, + "num_tokens": 42496181.0, + "step": 35380 + }, + { + "entropy": 1.8248122036457062, + "epoch": 0.10970590355087227, + "grad_norm": 5.816926956176758, + "learning_rate": 7.638073199262556e-06, + "loss": 0.5611, + "mean_token_accuracy": 0.8266668051481247, + "num_tokens": 42509157.0, + "step": 35390 + }, + { + "entropy": 1.8483053863048553, + "epoch": 0.10973690267592197, + "grad_norm": 9.291295051574707, + "learning_rate": 7.636994268685048e-06, + "loss": 0.5601, + "mean_token_accuracy": 0.8281493291258812, + "num_tokens": 42520936.0, + "step": 35400 + }, + { + "entropy": 1.906061513721943, + "epoch": 0.10976790180097167, + "grad_norm": 9.213872909545898, + "learning_rate": 7.635915795197586e-06, + "loss": 0.577, + "mean_token_accuracy": 0.8235763564705849, + "num_tokens": 42531678.0, + "step": 35410 + }, + { + "entropy": 1.9281126588582993, + "epoch": 0.10979890092602136, + "grad_norm": 9.515471458435059, + "learning_rate": 7.634837778477519e-06, + "loss": 0.5632, + "mean_token_accuracy": 0.8304232120513916, + "num_tokens": 42542312.0, + "step": 35420 + }, + { + "entropy": 1.8663022175431252, + "epoch": 0.10982990005107106, + "grad_norm": 4.878605842590332, + "learning_rate": 7.633760218202513e-06, + "loss": 0.5287, + "mean_token_accuracy": 0.8294124737381935, + "num_tokens": 42554279.0, + "step": 35430 + }, + { + "entropy": 1.774543423950672, + "epoch": 0.10986089917612075, + "grad_norm": 5.2295002937316895, + "learning_rate": 7.632683114050551e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8427405998110771, + "num_tokens": 42567273.0, + "step": 35440 + }, + { + "entropy": 1.8794532790780067, + "epoch": 0.10989189830117045, + "grad_norm": 9.379169464111328, + "learning_rate": 7.631606465699934e-06, + "loss": 0.5689, + "mean_token_accuracy": 0.8301954373717308, + "num_tokens": 42579668.0, + "step": 35450 + }, + { + "entropy": 1.878871650993824, + "epoch": 0.10992289742622015, + "grad_norm": 4.546285152435303, + "learning_rate": 7.630530272829285e-06, + "loss": 0.5423, + "mean_token_accuracy": 0.8306925192475318, + "num_tokens": 42591107.0, + "step": 35460 + }, + { + "entropy": 1.7232867300510406, + "epoch": 0.10995389655126984, + "grad_norm": 4.568423748016357, + "learning_rate": 7.629454535117535e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8541397914290428, + "num_tokens": 42604850.0, + "step": 35470 + }, + { + "entropy": 1.8509867563843727, + "epoch": 0.10998489567631954, + "grad_norm": 9.155996322631836, + "learning_rate": 7.6283792522439415e-06, + "loss": 0.5216, + "mean_token_accuracy": 0.8339457020163537, + "num_tokens": 42616914.0, + "step": 35480 + }, + { + "entropy": 1.9788624405860902, + "epoch": 0.11001589480136924, + "grad_norm": 9.914673805236816, + "learning_rate": 7.6273044238880745e-06, + "loss": 0.6481, + "mean_token_accuracy": 0.8120816603302956, + "num_tokens": 42628068.0, + "step": 35490 + }, + { + "entropy": 1.8431833282113075, + "epoch": 0.11004689392641892, + "grad_norm": 8.730440139770508, + "learning_rate": 7.626230049729815e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8376294538378716, + "num_tokens": 42640526.0, + "step": 35500 + }, + { + "entropy": 1.8701015904545784, + "epoch": 0.11007789305146862, + "grad_norm": 9.417694091796875, + "learning_rate": 7.625156129449368e-06, + "loss": 0.5799, + "mean_token_accuracy": 0.8195733115077019, + "num_tokens": 42652641.0, + "step": 35510 + }, + { + "entropy": 1.9821488350629806, + "epoch": 0.11010889217651831, + "grad_norm": 7.322350025177002, + "learning_rate": 7.624082662727249e-06, + "loss": 0.6099, + "mean_token_accuracy": 0.8140304252505303, + "num_tokens": 42664155.0, + "step": 35520 + }, + { + "entropy": 1.8935043781995773, + "epoch": 0.11013989130156801, + "grad_norm": 9.178773880004883, + "learning_rate": 7.623009649244287e-06, + "loss": 0.513, + "mean_token_accuracy": 0.8334458202123642, + "num_tokens": 42676188.0, + "step": 35530 + }, + { + "entropy": 1.891274669766426, + "epoch": 0.1101708904266177, + "grad_norm": 10.043092727661133, + "learning_rate": 7.62193708868163e-06, + "loss": 0.5871, + "mean_token_accuracy": 0.816723170876503, + "num_tokens": 42688395.0, + "step": 35540 + }, + { + "entropy": 1.8267503723502159, + "epoch": 0.1102018895516674, + "grad_norm": 4.575459957122803, + "learning_rate": 7.620864980720736e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8234075903892517, + "num_tokens": 42701643.0, + "step": 35550 + }, + { + "entropy": 1.956119680404663, + "epoch": 0.1102328886767171, + "grad_norm": 7.073038101196289, + "learning_rate": 7.619793325043378e-06, + "loss": 0.6057, + "mean_token_accuracy": 0.8232446178793907, + "num_tokens": 42713319.0, + "step": 35560 + }, + { + "entropy": 1.9920398950576783, + "epoch": 0.1102638878017668, + "grad_norm": 8.376012802124023, + "learning_rate": 7.618722121331642e-06, + "loss": 0.598, + "mean_token_accuracy": 0.8307427033782006, + "num_tokens": 42724006.0, + "step": 35570 + }, + { + "entropy": 1.906696754693985, + "epoch": 0.1102948869268165, + "grad_norm": 4.264830589294434, + "learning_rate": 7.617651369267926e-06, + "loss": 0.591, + "mean_token_accuracy": 0.8193379983305931, + "num_tokens": 42735888.0, + "step": 35580 + }, + { + "entropy": 1.8427229791879653, + "epoch": 0.11032588605186619, + "grad_norm": 9.698928833007812, + "learning_rate": 7.6165810685349415e-06, + "loss": 0.5671, + "mean_token_accuracy": 0.8317845165729523, + "num_tokens": 42747806.0, + "step": 35590 + }, + { + "entropy": 1.8134034514427184, + "epoch": 0.11035688517691589, + "grad_norm": 3.2644388675689697, + "learning_rate": 7.615511218815713e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8382085859775543, + "num_tokens": 42761313.0, + "step": 35600 + }, + { + "entropy": 1.8178164064884186, + "epoch": 0.11038788430196558, + "grad_norm": 8.92131233215332, + "learning_rate": 7.614441819793575e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8241424784064293, + "num_tokens": 42774872.0, + "step": 35610 + }, + { + "entropy": 1.953722095489502, + "epoch": 0.11041888342701527, + "grad_norm": 10.026144981384277, + "learning_rate": 7.61337287115217e-06, + "loss": 0.5721, + "mean_token_accuracy": 0.8248757779598236, + "num_tokens": 42787444.0, + "step": 35620 + }, + { + "entropy": 1.8925233513116837, + "epoch": 0.11044988255206496, + "grad_norm": 9.82945728302002, + "learning_rate": 7.612304372575457e-06, + "loss": 0.5397, + "mean_token_accuracy": 0.8335311338305473, + "num_tokens": 42800219.0, + "step": 35630 + }, + { + "entropy": 1.9320378288626672, + "epoch": 0.11048088167711466, + "grad_norm": 8.172740936279297, + "learning_rate": 7.611236323747706e-06, + "loss": 0.567, + "mean_token_accuracy": 0.8279871761798858, + "num_tokens": 42811737.0, + "step": 35640 + }, + { + "entropy": 1.8768427297472954, + "epoch": 0.11051188080216436, + "grad_norm": 4.6713972091674805, + "learning_rate": 7.610168724353488e-06, + "loss": 0.5321, + "mean_token_accuracy": 0.8223695576190948, + "num_tokens": 42823969.0, + "step": 35650 + }, + { + "entropy": 1.8219685062766076, + "epoch": 0.11054287992721405, + "grad_norm": 8.601825714111328, + "learning_rate": 7.6091015740776955e-06, + "loss": 0.515, + "mean_token_accuracy": 0.8326170966029167, + "num_tokens": 42835925.0, + "step": 35660 + }, + { + "entropy": 1.9021424800157547, + "epoch": 0.11057387905226375, + "grad_norm": 8.71493148803711, + "learning_rate": 7.608034872605521e-06, + "loss": 0.5521, + "mean_token_accuracy": 0.8241175979375839, + "num_tokens": 42847533.0, + "step": 35670 + }, + { + "entropy": 1.8227483302354812, + "epoch": 0.11060487817731345, + "grad_norm": 9.012811660766602, + "learning_rate": 7.606968619622469e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8344050481915474, + "num_tokens": 42860072.0, + "step": 35680 + }, + { + "entropy": 1.9262940838932991, + "epoch": 0.11063587730236314, + "grad_norm": 8.894078254699707, + "learning_rate": 7.605902814814354e-06, + "loss": 0.586, + "mean_token_accuracy": 0.8217165872454644, + "num_tokens": 42871958.0, + "step": 35690 + }, + { + "entropy": 1.9645220905542373, + "epoch": 0.11066687642741284, + "grad_norm": 6.617010116577148, + "learning_rate": 7.604837457867298e-06, + "loss": 0.5833, + "mean_token_accuracy": 0.8164415255188942, + "num_tokens": 42882974.0, + "step": 35700 + }, + { + "entropy": 1.8369232729077338, + "epoch": 0.11069787555246254, + "grad_norm": 4.062877178192139, + "learning_rate": 7.603772548467727e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8306289702653885, + "num_tokens": 42895928.0, + "step": 35710 + }, + { + "entropy": 1.9046246379613876, + "epoch": 0.11072887467751223, + "grad_norm": 9.858223915100098, + "learning_rate": 7.6027080863023806e-06, + "loss": 0.5456, + "mean_token_accuracy": 0.8280208811163903, + "num_tokens": 42908519.0, + "step": 35720 + }, + { + "entropy": 1.861171054840088, + "epoch": 0.11075987380256193, + "grad_norm": 4.55070686340332, + "learning_rate": 7.601644071058299e-06, + "loss": 0.5389, + "mean_token_accuracy": 0.8228657454252243, + "num_tokens": 42921353.0, + "step": 35730 + }, + { + "entropy": 1.8522720783948898, + "epoch": 0.11079087292761162, + "grad_norm": 9.031878471374512, + "learning_rate": 7.600580502422833e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8284451842308045, + "num_tokens": 42934541.0, + "step": 35740 + }, + { + "entropy": 1.965764506161213, + "epoch": 0.11082187205266131, + "grad_norm": 9.1340970993042, + "learning_rate": 7.59951738008364e-06, + "loss": 0.5677, + "mean_token_accuracy": 0.8256940588355064, + "num_tokens": 42946350.0, + "step": 35750 + }, + { + "entropy": 1.9360377460718154, + "epoch": 0.110852871177711, + "grad_norm": 9.9710693359375, + "learning_rate": 7.598454703728679e-06, + "loss": 0.5423, + "mean_token_accuracy": 0.8291813790798187, + "num_tokens": 42957478.0, + "step": 35760 + }, + { + "entropy": 1.898517556488514, + "epoch": 0.1108838703027607, + "grad_norm": 11.122152328491211, + "learning_rate": 7.597392473046215e-06, + "loss": 0.5485, + "mean_token_accuracy": 0.8223585531115531, + "num_tokens": 42969607.0, + "step": 35770 + }, + { + "entropy": 1.9774446800351142, + "epoch": 0.1109148694278104, + "grad_norm": 8.738960266113281, + "learning_rate": 7.596330687724825e-06, + "loss": 0.5825, + "mean_token_accuracy": 0.8237465664744377, + "num_tokens": 42981808.0, + "step": 35780 + }, + { + "entropy": 2.0006524354219435, + "epoch": 0.1109458685528601, + "grad_norm": 10.508487701416016, + "learning_rate": 7.595269347453383e-06, + "loss": 0.5879, + "mean_token_accuracy": 0.8199486985802651, + "num_tokens": 42993002.0, + "step": 35790 + }, + { + "entropy": 1.9162476733326912, + "epoch": 0.11097686767790979, + "grad_norm": 8.760506629943848, + "learning_rate": 7.5942084519210665e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8336918845772743, + "num_tokens": 43005023.0, + "step": 35800 + }, + { + "entropy": 1.9015693604946136, + "epoch": 0.11100786680295949, + "grad_norm": 3.9503958225250244, + "learning_rate": 7.5931480008173654e-06, + "loss": 0.5675, + "mean_token_accuracy": 0.8282639935612679, + "num_tokens": 43017177.0, + "step": 35810 + }, + { + "entropy": 1.8436180412769319, + "epoch": 0.11103886592800918, + "grad_norm": 5.058619022369385, + "learning_rate": 7.592087993832064e-06, + "loss": 0.556, + "mean_token_accuracy": 0.8285689920186996, + "num_tokens": 43029713.0, + "step": 35820 + }, + { + "entropy": 1.934212613105774, + "epoch": 0.11106986505305888, + "grad_norm": 9.86316967010498, + "learning_rate": 7.591028430655252e-06, + "loss": 0.587, + "mean_token_accuracy": 0.8228744760155677, + "num_tokens": 43041006.0, + "step": 35830 + }, + { + "entropy": 1.9125287413597107, + "epoch": 0.11110086417810858, + "grad_norm": 11.178047180175781, + "learning_rate": 7.589969310977325e-06, + "loss": 0.5717, + "mean_token_accuracy": 0.8275589749217034, + "num_tokens": 43053262.0, + "step": 35840 + }, + { + "entropy": 1.890585133433342, + "epoch": 0.11113186330315827, + "grad_norm": 8.734784126281738, + "learning_rate": 7.588910634488981e-06, + "loss": 0.5319, + "mean_token_accuracy": 0.8315401718020439, + "num_tokens": 43064804.0, + "step": 35850 + }, + { + "entropy": 1.9079217493534089, + "epoch": 0.11116286242820797, + "grad_norm": 8.155106544494629, + "learning_rate": 7.587852400881212e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8348406001925468, + "num_tokens": 43076576.0, + "step": 35860 + }, + { + "entropy": 1.9703772380948066, + "epoch": 0.11119386155325765, + "grad_norm": 9.1405611038208, + "learning_rate": 7.586794609845321e-06, + "loss": 0.5764, + "mean_token_accuracy": 0.8214395239949226, + "num_tokens": 43087904.0, + "step": 35870 + }, + { + "entropy": 2.0060620248317718, + "epoch": 0.11122486067830735, + "grad_norm": 8.357717514038086, + "learning_rate": 7.58573726107291e-06, + "loss": 0.6403, + "mean_token_accuracy": 0.8090036302804947, + "num_tokens": 43098746.0, + "step": 35880 + }, + { + "entropy": 1.9552163541316987, + "epoch": 0.11125585980335705, + "grad_norm": 12.635717391967773, + "learning_rate": 7.584680354255878e-06, + "loss": 0.6179, + "mean_token_accuracy": 0.809755727648735, + "num_tokens": 43109605.0, + "step": 35890 + }, + { + "entropy": 1.9294922351837158, + "epoch": 0.11128685892840674, + "grad_norm": 7.567071437835693, + "learning_rate": 7.583623889086426e-06, + "loss": 0.5897, + "mean_token_accuracy": 0.8201198145747185, + "num_tokens": 43121734.0, + "step": 35900 + }, + { + "entropy": 1.8198191657662393, + "epoch": 0.11131785805345644, + "grad_norm": 10.164730072021484, + "learning_rate": 7.5825678652570565e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8393372297286987, + "num_tokens": 43135018.0, + "step": 35910 + }, + { + "entropy": 1.9463165909051896, + "epoch": 0.11134885717850614, + "grad_norm": 10.353556632995605, + "learning_rate": 7.58151228246057e-06, + "loss": 0.5536, + "mean_token_accuracy": 0.8151269420981407, + "num_tokens": 43146922.0, + "step": 35920 + }, + { + "entropy": 1.8669220060110092, + "epoch": 0.11137985630355583, + "grad_norm": 7.747506618499756, + "learning_rate": 7.5804571403900685e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8381304860115051, + "num_tokens": 43159179.0, + "step": 35930 + }, + { + "entropy": 1.975856500864029, + "epoch": 0.11141085542860553, + "grad_norm": 8.58327579498291, + "learning_rate": 7.579402438738951e-06, + "loss": 0.6303, + "mean_token_accuracy": 0.8211771547794342, + "num_tokens": 43170565.0, + "step": 35940 + }, + { + "entropy": 1.8953873470425606, + "epoch": 0.11144185455365523, + "grad_norm": 3.883213520050049, + "learning_rate": 7.578348177200915e-06, + "loss": 0.548, + "mean_token_accuracy": 0.8355704694986343, + "num_tokens": 43182660.0, + "step": 35950 + }, + { + "entropy": 1.9930355623364449, + "epoch": 0.11147285367870492, + "grad_norm": 7.887125015258789, + "learning_rate": 7.577294355469956e-06, + "loss": 0.5559, + "mean_token_accuracy": 0.8245319411158561, + "num_tokens": 43194697.0, + "step": 35960 + }, + { + "entropy": 1.9415101364254952, + "epoch": 0.11150385280375462, + "grad_norm": 9.421870231628418, + "learning_rate": 7.576240973240371e-06, + "loss": 0.5932, + "mean_token_accuracy": 0.8234505504369736, + "num_tokens": 43206210.0, + "step": 35970 + }, + { + "entropy": 1.9392582058906556, + "epoch": 0.11153485192880432, + "grad_norm": 8.467351913452148, + "learning_rate": 7.575188030206747e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.8313940614461899, + "num_tokens": 43218118.0, + "step": 35980 + }, + { + "entropy": 1.9223637744784354, + "epoch": 0.111565851053854, + "grad_norm": 9.47497272491455, + "learning_rate": 7.574135526063976e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.840501819550991, + "num_tokens": 43229669.0, + "step": 35990 + }, + { + "entropy": 2.0074471473693847, + "epoch": 0.1115968501789037, + "grad_norm": 10.357573509216309, + "learning_rate": 7.5730834605072416e-06, + "loss": 0.6127, + "mean_token_accuracy": 0.8227631166577339, + "num_tokens": 43240593.0, + "step": 36000 + }, + { + "entropy": 1.859610728919506, + "epoch": 0.11162784930395339, + "grad_norm": 9.257423400878906, + "learning_rate": 7.5720318332320255e-06, + "loss": 0.5297, + "mean_token_accuracy": 0.8329069197177887, + "num_tokens": 43253611.0, + "step": 36010 + }, + { + "entropy": 1.9028906911611556, + "epoch": 0.11165884842900309, + "grad_norm": 8.12290096282959, + "learning_rate": 7.570980643934104e-06, + "loss": 0.5245, + "mean_token_accuracy": 0.8339066132903099, + "num_tokens": 43265093.0, + "step": 36020 + }, + { + "entropy": 1.8995389580726623, + "epoch": 0.11168984755405278, + "grad_norm": 9.205475807189941, + "learning_rate": 7.56992989230955e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8334958225488662, + "num_tokens": 43277585.0, + "step": 36030 + }, + { + "entropy": 1.9100613698363305, + "epoch": 0.11172084667910248, + "grad_norm": 7.445013046264648, + "learning_rate": 7.5688795780547335e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.842551352083683, + "num_tokens": 43289300.0, + "step": 36040 + }, + { + "entropy": 1.9307687520980834, + "epoch": 0.11175184580415218, + "grad_norm": 9.54047966003418, + "learning_rate": 7.5678297008663135e-06, + "loss": 0.5823, + "mean_token_accuracy": 0.8196006685495376, + "num_tokens": 43300995.0, + "step": 36050 + }, + { + "entropy": 1.9431654140353203, + "epoch": 0.11178284492920187, + "grad_norm": 8.615473747253418, + "learning_rate": 7.566780260441252e-06, + "loss": 0.6086, + "mean_token_accuracy": 0.8174616977572441, + "num_tokens": 43312218.0, + "step": 36060 + }, + { + "entropy": 1.8542766660451888, + "epoch": 0.11181384405425157, + "grad_norm": 8.529784202575684, + "learning_rate": 7.565731256476797e-06, + "loss": 0.5373, + "mean_token_accuracy": 0.8297783643007278, + "num_tokens": 43325617.0, + "step": 36070 + }, + { + "entropy": 1.9352862015366554, + "epoch": 0.11184484317930127, + "grad_norm": 8.217283248901367, + "learning_rate": 7.564682688670496e-06, + "loss": 0.5751, + "mean_token_accuracy": 0.8212688997387886, + "num_tokens": 43337245.0, + "step": 36080 + }, + { + "entropy": 1.972224122285843, + "epoch": 0.11187584230435096, + "grad_norm": 10.360859870910645, + "learning_rate": 7.563634556720185e-06, + "loss": 0.567, + "mean_token_accuracy": 0.8349591135978699, + "num_tokens": 43348686.0, + "step": 36090 + }, + { + "entropy": 1.9579823553562163, + "epoch": 0.11190684142940066, + "grad_norm": 9.387046813964844, + "learning_rate": 7.562586860323996e-06, + "loss": 0.5886, + "mean_token_accuracy": 0.8227823153138161, + "num_tokens": 43359689.0, + "step": 36100 + }, + { + "entropy": 1.9274040132761001, + "epoch": 0.11193784055445036, + "grad_norm": 4.451748371124268, + "learning_rate": 7.561539599180354e-06, + "loss": 0.5332, + "mean_token_accuracy": 0.8205605849623681, + "num_tokens": 43372242.0, + "step": 36110 + }, + { + "entropy": 1.8405972003936768, + "epoch": 0.11196883967950004, + "grad_norm": 4.114645481109619, + "learning_rate": 7.560492772987975e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8403662115335464, + "num_tokens": 43385797.0, + "step": 36120 + }, + { + "entropy": 1.9486818492412568, + "epoch": 0.11199983880454974, + "grad_norm": 4.341696739196777, + "learning_rate": 7.5594463814458676e-06, + "loss": 0.5247, + "mean_token_accuracy": 0.839286656677723, + "num_tokens": 43396906.0, + "step": 36130 + }, + { + "entropy": 1.9541378676891328, + "epoch": 0.11203083792959943, + "grad_norm": 9.982046127319336, + "learning_rate": 7.558400424253328e-06, + "loss": 0.5785, + "mean_token_accuracy": 0.8157087191939354, + "num_tokens": 43409073.0, + "step": 36140 + }, + { + "entropy": 1.9935852527618407, + "epoch": 0.11206183705464913, + "grad_norm": 8.649558067321777, + "learning_rate": 7.557354901109952e-06, + "loss": 0.5575, + "mean_token_accuracy": 0.8184923201799392, + "num_tokens": 43420300.0, + "step": 36150 + }, + { + "entropy": 1.9860820427536965, + "epoch": 0.11209283617969883, + "grad_norm": 9.3584566116333, + "learning_rate": 7.556309811715618e-06, + "loss": 0.5715, + "mean_token_accuracy": 0.8259601920843125, + "num_tokens": 43431655.0, + "step": 36160 + }, + { + "entropy": 1.9509647816419602, + "epoch": 0.11212383530474852, + "grad_norm": 10.400931358337402, + "learning_rate": 7.5552651557705e-06, + "loss": 0.598, + "mean_token_accuracy": 0.8289371073246002, + "num_tokens": 43443051.0, + "step": 36170 + }, + { + "entropy": 1.8806715980172157, + "epoch": 0.11215483442979822, + "grad_norm": 11.046425819396973, + "learning_rate": 7.55422093297506e-06, + "loss": 0.547, + "mean_token_accuracy": 0.8374142736196518, + "num_tokens": 43455604.0, + "step": 36180 + }, + { + "entropy": 1.9038266450166703, + "epoch": 0.11218583355484792, + "grad_norm": 4.295387268066406, + "learning_rate": 7.553177143030047e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.8337746739387513, + "num_tokens": 43468108.0, + "step": 36190 + }, + { + "entropy": 1.8910601362586021, + "epoch": 0.11221683267989761, + "grad_norm": 8.46925163269043, + "learning_rate": 7.5521337856365064e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8458377867937088, + "num_tokens": 43480308.0, + "step": 36200 + }, + { + "entropy": 1.9406689956784249, + "epoch": 0.11224783180494731, + "grad_norm": 8.312734603881836, + "learning_rate": 7.551090860495766e-06, + "loss": 0.5991, + "mean_token_accuracy": 0.8199841871857643, + "num_tokens": 43492821.0, + "step": 36210 + }, + { + "entropy": 1.9295011416077614, + "epoch": 0.112278830929997, + "grad_norm": 4.657339096069336, + "learning_rate": 7.550048367309445e-06, + "loss": 0.5671, + "mean_token_accuracy": 0.8267869040369987, + "num_tokens": 43504911.0, + "step": 36220 + }, + { + "entropy": 1.8883660644292832, + "epoch": 0.1123098300550467, + "grad_norm": 12.443599700927734, + "learning_rate": 7.54900630577945e-06, + "loss": 0.5342, + "mean_token_accuracy": 0.8197856619954109, + "num_tokens": 43517464.0, + "step": 36230 + }, + { + "entropy": 1.9811781361699103, + "epoch": 0.11234082918009639, + "grad_norm": 11.399888038635254, + "learning_rate": 7.547964675607977e-06, + "loss": 0.6267, + "mean_token_accuracy": 0.8182986229658127, + "num_tokens": 43528891.0, + "step": 36240 + }, + { + "entropy": 1.847057183086872, + "epoch": 0.11237182830514608, + "grad_norm": 7.622710704803467, + "learning_rate": 7.546923476497509e-06, + "loss": 0.5199, + "mean_token_accuracy": 0.8390681058168411, + "num_tokens": 43541805.0, + "step": 36250 + }, + { + "entropy": 1.9104607298970222, + "epoch": 0.11240282743019578, + "grad_norm": 9.49293041229248, + "learning_rate": 7.545882708150815e-06, + "loss": 0.5786, + "mean_token_accuracy": 0.8211572468280792, + "num_tokens": 43552709.0, + "step": 36260 + }, + { + "entropy": 1.7649886459112167, + "epoch": 0.11243382655524548, + "grad_norm": 4.7445807456970215, + "learning_rate": 7.544842370270952e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.841453991830349, + "num_tokens": 43566226.0, + "step": 36270 + }, + { + "entropy": 1.9194318518042564, + "epoch": 0.11246482568029517, + "grad_norm": 4.042590618133545, + "learning_rate": 7.543802462561263e-06, + "loss": 0.5773, + "mean_token_accuracy": 0.8197874516248703, + "num_tokens": 43578124.0, + "step": 36280 + }, + { + "entropy": 1.8747497126460075, + "epoch": 0.11249582480534487, + "grad_norm": 8.346511840820312, + "learning_rate": 7.5427629847253766e-06, + "loss": 0.52, + "mean_token_accuracy": 0.8315855219960213, + "num_tokens": 43590273.0, + "step": 36290 + }, + { + "entropy": 1.962376557290554, + "epoch": 0.11252682393039456, + "grad_norm": 7.562712669372559, + "learning_rate": 7.541723936467211e-06, + "loss": 0.541, + "mean_token_accuracy": 0.8308800607919693, + "num_tokens": 43601348.0, + "step": 36300 + }, + { + "entropy": 1.9383685559034347, + "epoch": 0.11255782305544426, + "grad_norm": 4.1252875328063965, + "learning_rate": 7.540685317490964e-06, + "loss": 0.6044, + "mean_token_accuracy": 0.8207226291298866, + "num_tokens": 43613086.0, + "step": 36310 + }, + { + "entropy": 1.9785435765981674, + "epoch": 0.11258882218049396, + "grad_norm": 9.106582641601562, + "learning_rate": 7.539647127501121e-06, + "loss": 0.6021, + "mean_token_accuracy": 0.8204389974474907, + "num_tokens": 43623726.0, + "step": 36320 + }, + { + "entropy": 1.9137952119112014, + "epoch": 0.11261982130554365, + "grad_norm": 8.635375022888184, + "learning_rate": 7.5386093662024515e-06, + "loss": 0.5612, + "mean_token_accuracy": 0.8232575729489326, + "num_tokens": 43635777.0, + "step": 36330 + }, + { + "entropy": 2.0330847591161727, + "epoch": 0.11265082043059335, + "grad_norm": 10.423239707946777, + "learning_rate": 7.537572033300013e-06, + "loss": 0.6766, + "mean_token_accuracy": 0.8112093076109886, + "num_tokens": 43646664.0, + "step": 36340 + }, + { + "entropy": 2.00158928334713, + "epoch": 0.11268181955564305, + "grad_norm": 8.458577156066895, + "learning_rate": 7.536535128499144e-06, + "loss": 0.5829, + "mean_token_accuracy": 0.8329016759991645, + "num_tokens": 43657222.0, + "step": 36350 + }, + { + "entropy": 1.9344168439507485, + "epoch": 0.11271281868069273, + "grad_norm": 5.806880950927734, + "learning_rate": 7.535498651505465e-06, + "loss": 0.5342, + "mean_token_accuracy": 0.8308494806289672, + "num_tokens": 43669133.0, + "step": 36360 + }, + { + "entropy": 1.9564279466867447, + "epoch": 0.11274381780574243, + "grad_norm": 7.9867329597473145, + "learning_rate": 7.5344626020248825e-06, + "loss": 0.5584, + "mean_token_accuracy": 0.8347235858440399, + "num_tokens": 43680521.0, + "step": 36370 + }, + { + "entropy": 1.9638302087783814, + "epoch": 0.11277481693079212, + "grad_norm": 9.185903549194336, + "learning_rate": 7.533426979763585e-06, + "loss": 0.6052, + "mean_token_accuracy": 0.8131253823637963, + "num_tokens": 43691642.0, + "step": 36380 + }, + { + "entropy": 1.910866443812847, + "epoch": 0.11280581605584182, + "grad_norm": 8.041609764099121, + "learning_rate": 7.532391784428045e-06, + "loss": 0.5606, + "mean_token_accuracy": 0.8212843969464302, + "num_tokens": 43703753.0, + "step": 36390 + }, + { + "entropy": 1.7988116875290872, + "epoch": 0.11283681518089152, + "grad_norm": 8.769583702087402, + "learning_rate": 7.531357015725014e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8505406931042672, + "num_tokens": 43716887.0, + "step": 36400 + }, + { + "entropy": 1.8762123107910156, + "epoch": 0.11286781430594121, + "grad_norm": 7.339711666107178, + "learning_rate": 7.53032267336153e-06, + "loss": 0.5858, + "mean_token_accuracy": 0.8283303081989288, + "num_tokens": 43728905.0, + "step": 36410 + }, + { + "entropy": 2.0162509113550184, + "epoch": 0.11289881343099091, + "grad_norm": 8.925753593444824, + "learning_rate": 7.529288757044908e-06, + "loss": 0.6202, + "mean_token_accuracy": 0.8204985901713371, + "num_tokens": 43739992.0, + "step": 36420 + }, + { + "entropy": 1.8500737398862839, + "epoch": 0.1129298125560406, + "grad_norm": 4.230498790740967, + "learning_rate": 7.528255266482748e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8447869628667831, + "num_tokens": 43752806.0, + "step": 36430 + }, + { + "entropy": 1.9196542382240296, + "epoch": 0.1129608116810903, + "grad_norm": 10.557394981384277, + "learning_rate": 7.527222201382927e-06, + "loss": 0.56, + "mean_token_accuracy": 0.8287433817982673, + "num_tokens": 43764530.0, + "step": 36440 + }, + { + "entropy": 1.937133614718914, + "epoch": 0.11299181080614, + "grad_norm": 9.187378883361816, + "learning_rate": 7.526189561453605e-06, + "loss": 0.5236, + "mean_token_accuracy": 0.8356496065855026, + "num_tokens": 43775958.0, + "step": 36450 + }, + { + "entropy": 1.8305316418409348, + "epoch": 0.1130228099311897, + "grad_norm": 10.253772735595703, + "learning_rate": 7.525157346403224e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.846756660938263, + "num_tokens": 43788156.0, + "step": 36460 + }, + { + "entropy": 1.8645455896854402, + "epoch": 0.1130538090562394, + "grad_norm": 2.7411298751831055, + "learning_rate": 7.5241255559405015e-06, + "loss": 0.5632, + "mean_token_accuracy": 0.8278805449604988, + "num_tokens": 43800701.0, + "step": 36470 + }, + { + "entropy": 1.974195511639118, + "epoch": 0.11308480818128909, + "grad_norm": 4.0369110107421875, + "learning_rate": 7.523094189774437e-06, + "loss": 0.5814, + "mean_token_accuracy": 0.8239115789532662, + "num_tokens": 43811490.0, + "step": 36480 + }, + { + "entropy": 1.8882897064089774, + "epoch": 0.11311580730633877, + "grad_norm": 9.004681587219238, + "learning_rate": 7.5220632476143095e-06, + "loss": 0.5609, + "mean_token_accuracy": 0.8223536923527718, + "num_tokens": 43823579.0, + "step": 36490 + }, + { + "entropy": 1.8579727187752724, + "epoch": 0.11314680643138847, + "grad_norm": 4.422814846038818, + "learning_rate": 7.521032729169676e-06, + "loss": 0.5213, + "mean_token_accuracy": 0.8264896929264068, + "num_tokens": 43835758.0, + "step": 36500 + }, + { + "entropy": 1.8324081644415855, + "epoch": 0.11317780555643817, + "grad_norm": 4.702271461486816, + "learning_rate": 7.520002634150373e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8371421471238136, + "num_tokens": 43848630.0, + "step": 36510 + }, + { + "entropy": 1.8566824480891229, + "epoch": 0.11320880468148786, + "grad_norm": 4.19157075881958, + "learning_rate": 7.518972962266511e-06, + "loss": 0.6098, + "mean_token_accuracy": 0.8109669283032417, + "num_tokens": 43861241.0, + "step": 36520 + }, + { + "entropy": 1.8918891534209252, + "epoch": 0.11323980380653756, + "grad_norm": 4.325606346130371, + "learning_rate": 7.517943713228485e-06, + "loss": 0.5829, + "mean_token_accuracy": 0.8150859504938126, + "num_tokens": 43872873.0, + "step": 36530 + }, + { + "entropy": 1.8818762391805648, + "epoch": 0.11327080293158726, + "grad_norm": 8.26830768585205, + "learning_rate": 7.516914886746964e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.8375229358673095, + "num_tokens": 43885092.0, + "step": 36540 + }, + { + "entropy": 1.8862257033586503, + "epoch": 0.11330180205663695, + "grad_norm": 4.080634117126465, + "learning_rate": 7.515886482532891e-06, + "loss": 0.5554, + "mean_token_accuracy": 0.8279654592275619, + "num_tokens": 43897930.0, + "step": 36550 + }, + { + "entropy": 1.9121008217334747, + "epoch": 0.11333280118168665, + "grad_norm": 7.579226970672607, + "learning_rate": 7.5148585002974895e-06, + "loss": 0.5392, + "mean_token_accuracy": 0.8319680899381637, + "num_tokens": 43910576.0, + "step": 36560 + }, + { + "entropy": 1.8942013755440712, + "epoch": 0.11336380030673635, + "grad_norm": 10.820945739746094, + "learning_rate": 7.513830939752261e-06, + "loss": 0.5398, + "mean_token_accuracy": 0.8288922995328903, + "num_tokens": 43922364.0, + "step": 36570 + }, + { + "entropy": 1.8880238860845566, + "epoch": 0.11339479943178604, + "grad_norm": 4.12148904800415, + "learning_rate": 7.512803800608977e-06, + "loss": 0.5554, + "mean_token_accuracy": 0.8186680763959885, + "num_tokens": 43934444.0, + "step": 36580 + }, + { + "entropy": 1.8842089757323266, + "epoch": 0.11342579855683574, + "grad_norm": 7.816810131072998, + "learning_rate": 7.511777082579692e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.8386390700936317, + "num_tokens": 43946333.0, + "step": 36590 + }, + { + "entropy": 1.905991567671299, + "epoch": 0.11345679768188544, + "grad_norm": 9.459228515625, + "learning_rate": 7.5107507853767304e-06, + "loss": 0.556, + "mean_token_accuracy": 0.8341964855790138, + "num_tokens": 43958369.0, + "step": 36600 + }, + { + "entropy": 1.9278974682092667, + "epoch": 0.11348779680693512, + "grad_norm": 9.090117454528809, + "learning_rate": 7.509724908712693e-06, + "loss": 0.531, + "mean_token_accuracy": 0.8365562587976456, + "num_tokens": 43970116.0, + "step": 36610 + }, + { + "entropy": 1.95464196652174, + "epoch": 0.11351879593198481, + "grad_norm": 8.458623886108398, + "learning_rate": 7.508699452300459e-06, + "loss": 0.5787, + "mean_token_accuracy": 0.830603589117527, + "num_tokens": 43981286.0, + "step": 36620 + }, + { + "entropy": 1.839498682320118, + "epoch": 0.11354979505703451, + "grad_norm": 8.516286849975586, + "learning_rate": 7.507674415853176e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8414850428700447, + "num_tokens": 43993909.0, + "step": 36630 + }, + { + "entropy": 1.936448486149311, + "epoch": 0.11358079418208421, + "grad_norm": 4.347235202789307, + "learning_rate": 7.506649799084268e-06, + "loss": 0.5356, + "mean_token_accuracy": 0.8339558869600296, + "num_tokens": 44005833.0, + "step": 36640 + }, + { + "entropy": 1.884600205719471, + "epoch": 0.1136117933071339, + "grad_norm": 7.8523712158203125, + "learning_rate": 7.505625601707435e-06, + "loss": 0.517, + "mean_token_accuracy": 0.8333307653665543, + "num_tokens": 44018334.0, + "step": 36650 + }, + { + "entropy": 1.8594166815280915, + "epoch": 0.1136427924321836, + "grad_norm": 8.999347686767578, + "learning_rate": 7.504601823436648e-06, + "loss": 0.5549, + "mean_token_accuracy": 0.8310374036431313, + "num_tokens": 44030326.0, + "step": 36660 + }, + { + "entropy": 1.9014796167612076, + "epoch": 0.1136737915572333, + "grad_norm": 7.825422763824463, + "learning_rate": 7.503578463986152e-06, + "loss": 0.579, + "mean_token_accuracy": 0.816353191435337, + "num_tokens": 44042199.0, + "step": 36670 + }, + { + "entropy": 1.7826032146811486, + "epoch": 0.113704790682283, + "grad_norm": 9.484929084777832, + "learning_rate": 7.502555523070463e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.851135803759098, + "num_tokens": 44055210.0, + "step": 36680 + }, + { + "entropy": 1.8730618610978127, + "epoch": 0.11373578980733269, + "grad_norm": 10.11883544921875, + "learning_rate": 7.5015330004043705e-06, + "loss": 0.534, + "mean_token_accuracy": 0.8271143138408661, + "num_tokens": 44066956.0, + "step": 36690 + }, + { + "entropy": 1.9547926247119904, + "epoch": 0.11376678893238239, + "grad_norm": 9.232172012329102, + "learning_rate": 7.500510895702939e-06, + "loss": 0.6011, + "mean_token_accuracy": 0.8220591425895691, + "num_tokens": 44078247.0, + "step": 36700 + }, + { + "entropy": 1.8781561613082887, + "epoch": 0.11379778805743208, + "grad_norm": 7.583634376525879, + "learning_rate": 7.499489208681497e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8380724370479584, + "num_tokens": 44089827.0, + "step": 36710 + }, + { + "entropy": 1.822493526339531, + "epoch": 0.11382878718248178, + "grad_norm": 4.266534328460693, + "learning_rate": 7.498467939055656e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8341230794787406, + "num_tokens": 44102605.0, + "step": 36720 + }, + { + "entropy": 1.8786309957504272, + "epoch": 0.11385978630753146, + "grad_norm": 9.360684394836426, + "learning_rate": 7.497447086541285e-06, + "loss": 0.5763, + "mean_token_accuracy": 0.8306545913219452, + "num_tokens": 44114636.0, + "step": 36730 + }, + { + "entropy": 1.904089206457138, + "epoch": 0.11389078543258116, + "grad_norm": 5.562136650085449, + "learning_rate": 7.496426650854535e-06, + "loss": 0.6014, + "mean_token_accuracy": 0.8137870237231255, + "num_tokens": 44127313.0, + "step": 36740 + }, + { + "entropy": 1.9380560591816902, + "epoch": 0.11392178455763086, + "grad_norm": 9.176344871520996, + "learning_rate": 7.4954066317118205e-06, + "loss": 0.5927, + "mean_token_accuracy": 0.8196625456213951, + "num_tokens": 44138997.0, + "step": 36750 + }, + { + "entropy": 1.881463260948658, + "epoch": 0.11395278368268055, + "grad_norm": 8.139053344726562, + "learning_rate": 7.494387028829828e-06, + "loss": 0.5145, + "mean_token_accuracy": 0.8382967382669448, + "num_tokens": 44151672.0, + "step": 36760 + }, + { + "entropy": 1.8621976956725121, + "epoch": 0.11398378280773025, + "grad_norm": 4.298003673553467, + "learning_rate": 7.493367841925514e-06, + "loss": 0.5409, + "mean_token_accuracy": 0.8231415241956711, + "num_tokens": 44164318.0, + "step": 36770 + }, + { + "entropy": 1.9737728282809257, + "epoch": 0.11401478193277995, + "grad_norm": 10.217061996459961, + "learning_rate": 7.492349070716108e-06, + "loss": 0.5793, + "mean_token_accuracy": 0.8322141289710998, + "num_tokens": 44175114.0, + "step": 36780 + }, + { + "entropy": 1.9328984022140503, + "epoch": 0.11404578105782964, + "grad_norm": 4.168199062347412, + "learning_rate": 7.4913307149191e-06, + "loss": 0.5295, + "mean_token_accuracy": 0.8284803554415703, + "num_tokens": 44187241.0, + "step": 36790 + }, + { + "entropy": 1.9318128556013108, + "epoch": 0.11407678018287934, + "grad_norm": 9.854540824890137, + "learning_rate": 7.490312774252257e-06, + "loss": 0.5912, + "mean_token_accuracy": 0.8278173923492431, + "num_tokens": 44197838.0, + "step": 36800 + }, + { + "entropy": 1.8671427622437478, + "epoch": 0.11410777930792904, + "grad_norm": 8.820489883422852, + "learning_rate": 7.489295248433609e-06, + "loss": 0.6118, + "mean_token_accuracy": 0.8205173119902611, + "num_tokens": 44209989.0, + "step": 36810 + }, + { + "entropy": 1.9128638491034509, + "epoch": 0.11413877843297873, + "grad_norm": 10.370189666748047, + "learning_rate": 7.488278137181456e-06, + "loss": 0.593, + "mean_token_accuracy": 0.8210515394806862, + "num_tokens": 44221247.0, + "step": 36820 + }, + { + "entropy": 1.8790946841239928, + "epoch": 0.11416977755802843, + "grad_norm": 8.685145378112793, + "learning_rate": 7.48726144021437e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8406370490789413, + "num_tokens": 44233155.0, + "step": 36830 + }, + { + "entropy": 1.973842205107212, + "epoch": 0.11420077668307813, + "grad_norm": 9.793169021606445, + "learning_rate": 7.48624515725118e-06, + "loss": 0.5913, + "mean_token_accuracy": 0.8233802810311317, + "num_tokens": 44243970.0, + "step": 36840 + }, + { + "entropy": 1.800303579866886, + "epoch": 0.11423177580812782, + "grad_norm": 9.683552742004395, + "learning_rate": 7.485229288010991e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8449695229530334, + "num_tokens": 44257962.0, + "step": 36850 + }, + { + "entropy": 1.8852539584040642, + "epoch": 0.1142627749331775, + "grad_norm": 8.617185592651367, + "learning_rate": 7.484213832213174e-06, + "loss": 0.531, + "mean_token_accuracy": 0.8190362945199012, + "num_tokens": 44270821.0, + "step": 36860 + }, + { + "entropy": 1.8611651435494423, + "epoch": 0.1142937740582272, + "grad_norm": 3.9128634929656982, + "learning_rate": 7.483198789577362e-06, + "loss": 0.5256, + "mean_token_accuracy": 0.825941427052021, + "num_tokens": 44283425.0, + "step": 36870 + }, + { + "entropy": 1.863780789077282, + "epoch": 0.1143247731832769, + "grad_norm": 9.122457504272461, + "learning_rate": 7.482184159823459e-06, + "loss": 0.5205, + "mean_token_accuracy": 0.8322040572762489, + "num_tokens": 44295820.0, + "step": 36880 + }, + { + "entropy": 1.8578864350914954, + "epoch": 0.1143557723083266, + "grad_norm": 4.551950931549072, + "learning_rate": 7.481169942671628e-06, + "loss": 0.5337, + "mean_token_accuracy": 0.8270832076668739, + "num_tokens": 44308102.0, + "step": 36890 + }, + { + "entropy": 1.9780358895659447, + "epoch": 0.11438677143337629, + "grad_norm": 9.209576606750488, + "learning_rate": 7.480156137842306e-06, + "loss": 0.5974, + "mean_token_accuracy": 0.8193370506167412, + "num_tokens": 44319023.0, + "step": 36900 + }, + { + "entropy": 1.9365268610417843, + "epoch": 0.11441777055842599, + "grad_norm": 9.44648551940918, + "learning_rate": 7.479142745056188e-06, + "loss": 0.5477, + "mean_token_accuracy": 0.8305159211158752, + "num_tokens": 44330798.0, + "step": 36910 + }, + { + "entropy": 1.9427412554621697, + "epoch": 0.11444876968347568, + "grad_norm": 9.822259902954102, + "learning_rate": 7.478129764034238e-06, + "loss": 0.5697, + "mean_token_accuracy": 0.8291716650128365, + "num_tokens": 44341817.0, + "step": 36920 + }, + { + "entropy": 1.884063209593296, + "epoch": 0.11447976880852538, + "grad_norm": 9.40356731414795, + "learning_rate": 7.477117194497685e-06, + "loss": 0.5212, + "mean_token_accuracy": 0.8296123817563057, + "num_tokens": 44354019.0, + "step": 36930 + }, + { + "entropy": 1.8861080020666123, + "epoch": 0.11451076793357508, + "grad_norm": 10.153528213500977, + "learning_rate": 7.476105036168018e-06, + "loss": 0.548, + "mean_token_accuracy": 0.8213342532515526, + "num_tokens": 44366926.0, + "step": 36940 + }, + { + "entropy": 1.8933350324630738, + "epoch": 0.11454176705862477, + "grad_norm": 9.459976196289062, + "learning_rate": 7.475093288766992e-06, + "loss": 0.5634, + "mean_token_accuracy": 0.828109310567379, + "num_tokens": 44378877.0, + "step": 36950 + }, + { + "entropy": 1.8734344974160195, + "epoch": 0.11457276618367447, + "grad_norm": 8.441810607910156, + "learning_rate": 7.474081952016626e-06, + "loss": 0.5369, + "mean_token_accuracy": 0.8315728649497032, + "num_tokens": 44392038.0, + "step": 36960 + }, + { + "entropy": 1.874902254343033, + "epoch": 0.11460376530872417, + "grad_norm": 4.568936347961426, + "learning_rate": 7.473071025639202e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8361835688352585, + "num_tokens": 44404425.0, + "step": 36970 + }, + { + "entropy": 1.9008426293730736, + "epoch": 0.11463476443377385, + "grad_norm": 9.730356216430664, + "learning_rate": 7.4720605093572664e-06, + "loss": 0.5532, + "mean_token_accuracy": 0.8256875947117805, + "num_tokens": 44416109.0, + "step": 36980 + }, + { + "entropy": 1.925182616710663, + "epoch": 0.11466576355882355, + "grad_norm": 3.8875186443328857, + "learning_rate": 7.471050402893625e-06, + "loss": 0.611, + "mean_token_accuracy": 0.8156601503491402, + "num_tokens": 44427762.0, + "step": 36990 + }, + { + "entropy": 1.8984755888581275, + "epoch": 0.11469676268387324, + "grad_norm": 8.992522239685059, + "learning_rate": 7.470040705971346e-06, + "loss": 0.5251, + "mean_token_accuracy": 0.8337834537029266, + "num_tokens": 44439999.0, + "step": 37000 + }, + { + "entropy": 1.9663574412465095, + "epoch": 0.11472776180892294, + "grad_norm": 8.80916690826416, + "learning_rate": 7.469031418313763e-06, + "loss": 0.571, + "mean_token_accuracy": 0.8257024556398391, + "num_tokens": 44451506.0, + "step": 37010 + }, + { + "entropy": 1.8491823315620421, + "epoch": 0.11475876093397264, + "grad_norm": 4.779745101928711, + "learning_rate": 7.46802253964447e-06, + "loss": 0.5263, + "mean_token_accuracy": 0.8266303971409797, + "num_tokens": 44464222.0, + "step": 37020 + }, + { + "entropy": 1.9856320530176164, + "epoch": 0.11478976005902233, + "grad_norm": 7.643355846405029, + "learning_rate": 7.46701406968732e-06, + "loss": 0.6223, + "mean_token_accuracy": 0.8207634121179581, + "num_tokens": 44475276.0, + "step": 37030 + }, + { + "entropy": 1.940791991353035, + "epoch": 0.11482075918407203, + "grad_norm": 5.0710954666137695, + "learning_rate": 7.466006008166425e-06, + "loss": 0.5636, + "mean_token_accuracy": 0.8188979029655457, + "num_tokens": 44487103.0, + "step": 37040 + }, + { + "entropy": 1.9016083896160125, + "epoch": 0.11485175830912173, + "grad_norm": 4.698379039764404, + "learning_rate": 7.464998354806166e-06, + "loss": 0.5691, + "mean_token_accuracy": 0.8199516862630845, + "num_tokens": 44498694.0, + "step": 37050 + }, + { + "entropy": 1.9881937965750693, + "epoch": 0.11488275743417142, + "grad_norm": 9.964612007141113, + "learning_rate": 7.463991109331177e-06, + "loss": 0.618, + "mean_token_accuracy": 0.8096556261181831, + "num_tokens": 44509990.0, + "step": 37060 + }, + { + "entropy": 1.884983916580677, + "epoch": 0.11491375655922112, + "grad_norm": 9.16238021850586, + "learning_rate": 7.462984271466356e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8483946248888969, + "num_tokens": 44521793.0, + "step": 37070 + }, + { + "entropy": 1.9347106859087944, + "epoch": 0.11494475568427082, + "grad_norm": 8.995933532714844, + "learning_rate": 7.461977840936856e-06, + "loss": 0.5537, + "mean_token_accuracy": 0.8224361136555671, + "num_tokens": 44533297.0, + "step": 37080 + }, + { + "entropy": 1.9060018703341484, + "epoch": 0.11497575480932051, + "grad_norm": 8.693288803100586, + "learning_rate": 7.460971817468093e-06, + "loss": 0.5297, + "mean_token_accuracy": 0.8281079009175301, + "num_tokens": 44545609.0, + "step": 37090 + }, + { + "entropy": 1.8859413847327233, + "epoch": 0.1150067539343702, + "grad_norm": 7.988814353942871, + "learning_rate": 7.459966200785744e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8470205634832382, + "num_tokens": 44557728.0, + "step": 37100 + }, + { + "entropy": 1.8287798702716827, + "epoch": 0.11503775305941989, + "grad_norm": 7.7615437507629395, + "learning_rate": 7.45896099061574e-06, + "loss": 0.5346, + "mean_token_accuracy": 0.8320494025945664, + "num_tokens": 44571340.0, + "step": 37110 + }, + { + "entropy": 1.91844123005867, + "epoch": 0.11506875218446959, + "grad_norm": 11.6615629196167, + "learning_rate": 7.457956186684274e-06, + "loss": 0.5549, + "mean_token_accuracy": 0.8266859829425812, + "num_tokens": 44582865.0, + "step": 37120 + }, + { + "entropy": 1.7767401605844497, + "epoch": 0.11509975130951929, + "grad_norm": 7.414669513702393, + "learning_rate": 7.4569517887177935e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8381605073809624, + "num_tokens": 44596679.0, + "step": 37130 + }, + { + "entropy": 1.9388007283210755, + "epoch": 0.11513075043456898, + "grad_norm": 11.721390724182129, + "learning_rate": 7.455947796443009e-06, + "loss": 0.6072, + "mean_token_accuracy": 0.8257848516106605, + "num_tokens": 44608325.0, + "step": 37140 + }, + { + "entropy": 1.9356591030955315, + "epoch": 0.11516174955961868, + "grad_norm": 7.786122798919678, + "learning_rate": 7.454944209586882e-06, + "loss": 0.5724, + "mean_token_accuracy": 0.8272521525621415, + "num_tokens": 44619842.0, + "step": 37150 + }, + { + "entropy": 1.8918299242854117, + "epoch": 0.11519274868466838, + "grad_norm": 10.668729782104492, + "learning_rate": 7.453941027876637e-06, + "loss": 0.5737, + "mean_token_accuracy": 0.8250093191862107, + "num_tokens": 44631902.0, + "step": 37160 + }, + { + "entropy": 1.7994364276528358, + "epoch": 0.11522374780971807, + "grad_norm": 6.118994235992432, + "learning_rate": 7.452938251039754e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8353979960083961, + "num_tokens": 44645016.0, + "step": 37170 + }, + { + "entropy": 1.9420592486858368, + "epoch": 0.11525474693476777, + "grad_norm": 9.136016845703125, + "learning_rate": 7.451935878803968e-06, + "loss": 0.5985, + "mean_token_accuracy": 0.8215920001268386, + "num_tokens": 44656784.0, + "step": 37180 + }, + { + "entropy": 1.9028725042939185, + "epoch": 0.11528574605981746, + "grad_norm": 7.933960914611816, + "learning_rate": 7.45093391089727e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.8299235314130783, + "num_tokens": 44669372.0, + "step": 37190 + }, + { + "entropy": 1.8886778056621552, + "epoch": 0.11531674518486716, + "grad_norm": 8.652473449707031, + "learning_rate": 7.4499323470479075e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.835668683052063, + "num_tokens": 44681290.0, + "step": 37200 + }, + { + "entropy": 1.902733063697815, + "epoch": 0.11534774430991686, + "grad_norm": 9.28559684753418, + "learning_rate": 7.448931186984387e-06, + "loss": 0.5584, + "mean_token_accuracy": 0.8212042555212975, + "num_tokens": 44692941.0, + "step": 37210 + }, + { + "entropy": 1.900286576151848, + "epoch": 0.11537874343496655, + "grad_norm": 8.697643280029297, + "learning_rate": 7.447930430435463e-06, + "loss": 0.5436, + "mean_token_accuracy": 0.8290207460522652, + "num_tokens": 44704646.0, + "step": 37220 + }, + { + "entropy": 1.8271199837327003, + "epoch": 0.11540974256001624, + "grad_norm": 10.892705917358398, + "learning_rate": 7.446930077130154e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8422973319888115, + "num_tokens": 44717316.0, + "step": 37230 + }, + { + "entropy": 1.875792995095253, + "epoch": 0.11544074168506593, + "grad_norm": 8.272143363952637, + "learning_rate": 7.445930126797723e-06, + "loss": 0.5986, + "mean_token_accuracy": 0.8243017882108689, + "num_tokens": 44729226.0, + "step": 37240 + }, + { + "entropy": 1.8794658452272415, + "epoch": 0.11547174081011563, + "grad_norm": 8.320490837097168, + "learning_rate": 7.444930579167699e-06, + "loss": 0.5515, + "mean_token_accuracy": 0.8193329513072968, + "num_tokens": 44740879.0, + "step": 37250 + }, + { + "entropy": 1.8461437433958054, + "epoch": 0.11550273993516533, + "grad_norm": 7.777599334716797, + "learning_rate": 7.443931433969854e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8361176624894142, + "num_tokens": 44753425.0, + "step": 37260 + }, + { + "entropy": 1.9898195713758469, + "epoch": 0.11553373906021502, + "grad_norm": 12.342960357666016, + "learning_rate": 7.442932690934222e-06, + "loss": 0.6227, + "mean_token_accuracy": 0.8236858904361725, + "num_tokens": 44764029.0, + "step": 37270 + }, + { + "entropy": 1.850753267109394, + "epoch": 0.11556473818526472, + "grad_norm": 10.538897514343262, + "learning_rate": 7.441934349791088e-06, + "loss": 0.533, + "mean_token_accuracy": 0.8256380185484886, + "num_tokens": 44775779.0, + "step": 37280 + }, + { + "entropy": 1.7188558861613275, + "epoch": 0.11559573731031442, + "grad_norm": 4.442847728729248, + "learning_rate": 7.440936410270987e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8477207824587822, + "num_tokens": 44789900.0, + "step": 37290 + }, + { + "entropy": 1.854294976592064, + "epoch": 0.11562673643536411, + "grad_norm": 9.337111473083496, + "learning_rate": 7.439938872104712e-06, + "loss": 0.5511, + "mean_token_accuracy": 0.8315454825758934, + "num_tokens": 44801588.0, + "step": 37300 + }, + { + "entropy": 1.7844763696193695, + "epoch": 0.11565773556041381, + "grad_norm": 8.090426445007324, + "learning_rate": 7.438941735023301e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8366570591926574, + "num_tokens": 44813896.0, + "step": 37310 + }, + { + "entropy": 1.808639107644558, + "epoch": 0.1156887346854635, + "grad_norm": 4.602088451385498, + "learning_rate": 7.437944998758055e-06, + "loss": 0.5219, + "mean_token_accuracy": 0.8374237030744552, + "num_tokens": 44826692.0, + "step": 37320 + }, + { + "entropy": 1.9111674383282662, + "epoch": 0.1157197338105132, + "grad_norm": 9.890421867370605, + "learning_rate": 7.436948663040519e-06, + "loss": 0.5756, + "mean_token_accuracy": 0.8278648778796196, + "num_tokens": 44838639.0, + "step": 37330 + }, + { + "entropy": 1.9008503317832948, + "epoch": 0.1157507329355629, + "grad_norm": 10.691512107849121, + "learning_rate": 7.435952727602491e-06, + "loss": 0.5903, + "mean_token_accuracy": 0.8256294175982475, + "num_tokens": 44850123.0, + "step": 37340 + }, + { + "entropy": 1.9054113239049912, + "epoch": 0.11578173206061258, + "grad_norm": 9.177732467651367, + "learning_rate": 7.434957192176021e-06, + "loss": 0.5876, + "mean_token_accuracy": 0.8166389659047126, + "num_tokens": 44862112.0, + "step": 37350 + }, + { + "entropy": 1.7511047348380089, + "epoch": 0.11581273118566228, + "grad_norm": 9.275190353393555, + "learning_rate": 7.4339620564934135e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8530907735228539, + "num_tokens": 44875516.0, + "step": 37360 + }, + { + "entropy": 1.7619803249835968, + "epoch": 0.11584373031071198, + "grad_norm": 5.685603618621826, + "learning_rate": 7.432967320287217e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.834882402420044, + "num_tokens": 44889047.0, + "step": 37370 + }, + { + "entropy": 1.8839653521776198, + "epoch": 0.11587472943576167, + "grad_norm": 9.889922142028809, + "learning_rate": 7.431972983290233e-06, + "loss": 0.5696, + "mean_token_accuracy": 0.820878466963768, + "num_tokens": 44901808.0, + "step": 37380 + }, + { + "entropy": 1.9028914406895638, + "epoch": 0.11590572856081137, + "grad_norm": 9.42833423614502, + "learning_rate": 7.430979045235518e-06, + "loss": 0.5525, + "mean_token_accuracy": 0.8282787010073662, + "num_tokens": 44912993.0, + "step": 37390 + }, + { + "entropy": 1.9422337591648102, + "epoch": 0.11593672768586107, + "grad_norm": 10.66053295135498, + "learning_rate": 7.429985505856372e-06, + "loss": 0.6107, + "mean_token_accuracy": 0.813813716173172, + "num_tokens": 44924256.0, + "step": 37400 + }, + { + "entropy": 1.8587273508310318, + "epoch": 0.11596772681091076, + "grad_norm": 7.039865493774414, + "learning_rate": 7.428992364886347e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8369274318218232, + "num_tokens": 44935852.0, + "step": 37410 + }, + { + "entropy": 1.8579163044691085, + "epoch": 0.11599872593596046, + "grad_norm": 3.244483470916748, + "learning_rate": 7.427999622059245e-06, + "loss": 0.5921, + "mean_token_accuracy": 0.8229958653450012, + "num_tokens": 44948437.0, + "step": 37420 + }, + { + "entropy": 1.8869281217455864, + "epoch": 0.11602972506101016, + "grad_norm": 8.914671897888184, + "learning_rate": 7.427007277109115e-06, + "loss": 0.5231, + "mean_token_accuracy": 0.8428304120898247, + "num_tokens": 44959542.0, + "step": 37430 + }, + { + "entropy": 1.88766710460186, + "epoch": 0.11606072418605985, + "grad_norm": 8.622590065002441, + "learning_rate": 7.426015329770258e-06, + "loss": 0.5914, + "mean_token_accuracy": 0.8276036903262138, + "num_tokens": 44970414.0, + "step": 37440 + }, + { + "entropy": 1.8154839545488357, + "epoch": 0.11609172331110955, + "grad_norm": 9.341155052185059, + "learning_rate": 7.425023779777217e-06, + "loss": 0.5234, + "mean_token_accuracy": 0.8210044398903846, + "num_tokens": 44983628.0, + "step": 37450 + }, + { + "entropy": 1.8794740170240403, + "epoch": 0.11612272243615925, + "grad_norm": 8.086356163024902, + "learning_rate": 7.424032626864791e-06, + "loss": 0.6234, + "mean_token_accuracy": 0.8183172300457955, + "num_tokens": 44994394.0, + "step": 37460 + }, + { + "entropy": 1.8632339894771577, + "epoch": 0.11615372156120893, + "grad_norm": 7.764867782592773, + "learning_rate": 7.423041870768022e-06, + "loss": 0.5312, + "mean_token_accuracy": 0.8289612337946892, + "num_tokens": 45005298.0, + "step": 37470 + }, + { + "entropy": 1.8341542795300483, + "epoch": 0.11618472068625862, + "grad_norm": 8.701674461364746, + "learning_rate": 7.422051511222199e-06, + "loss": 0.506, + "mean_token_accuracy": 0.8334753260016441, + "num_tokens": 45018299.0, + "step": 37480 + }, + { + "entropy": 1.915319959819317, + "epoch": 0.11621571981130832, + "grad_norm": 9.27292251586914, + "learning_rate": 7.4210615479628625e-06, + "loss": 0.5573, + "mean_token_accuracy": 0.8271935701370239, + "num_tokens": 45029787.0, + "step": 37490 + }, + { + "entropy": 1.8192658558487893, + "epoch": 0.11624671893635802, + "grad_norm": 5.670734882354736, + "learning_rate": 7.420071980725793e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8343614637851715, + "num_tokens": 45042647.0, + "step": 37500 + }, + { + "entropy": 1.9440273195505142, + "epoch": 0.11627771806140771, + "grad_norm": 10.800583839416504, + "learning_rate": 7.419082809247022e-06, + "loss": 0.6296, + "mean_token_accuracy": 0.8177393227815628, + "num_tokens": 45053626.0, + "step": 37510 + }, + { + "entropy": 1.8665451392531396, + "epoch": 0.11630871718645741, + "grad_norm": 11.033953666687012, + "learning_rate": 7.41809403326283e-06, + "loss": 0.5682, + "mean_token_accuracy": 0.822401012480259, + "num_tokens": 45064923.0, + "step": 37520 + }, + { + "entropy": 1.899769589304924, + "epoch": 0.11633971631150711, + "grad_norm": 8.622315406799316, + "learning_rate": 7.417105652509737e-06, + "loss": 0.5622, + "mean_token_accuracy": 0.8258567243814469, + "num_tokens": 45076477.0, + "step": 37530 + }, + { + "entropy": 1.951041680574417, + "epoch": 0.1163707154365568, + "grad_norm": 12.631427764892578, + "learning_rate": 7.4161176667245125e-06, + "loss": 0.6739, + "mean_token_accuracy": 0.8053397119045258, + "num_tokens": 45087117.0, + "step": 37540 + }, + { + "entropy": 1.740734612941742, + "epoch": 0.1164017145616065, + "grad_norm": 4.5630879402160645, + "learning_rate": 7.415130075644172e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8315686360001564, + "num_tokens": 45101656.0, + "step": 37550 + }, + { + "entropy": 1.8748736679553986, + "epoch": 0.1164327136866562, + "grad_norm": 8.166940689086914, + "learning_rate": 7.414142879005973e-06, + "loss": 0.5907, + "mean_token_accuracy": 0.8158686429262161, + "num_tokens": 45113754.0, + "step": 37560 + }, + { + "entropy": 1.8461486741900444, + "epoch": 0.1164637128117059, + "grad_norm": 10.343168258666992, + "learning_rate": 7.4131560765474195e-06, + "loss": 0.5501, + "mean_token_accuracy": 0.8233461812138557, + "num_tokens": 45126280.0, + "step": 37570 + }, + { + "entropy": 1.8095831125974655, + "epoch": 0.11649471193675559, + "grad_norm": 9.836362838745117, + "learning_rate": 7.4121696680062626e-06, + "loss": 0.531, + "mean_token_accuracy": 0.8389960706233979, + "num_tokens": 45138548.0, + "step": 37580 + }, + { + "entropy": 1.8525902017951013, + "epoch": 0.11652571106180529, + "grad_norm": 2.9441442489624023, + "learning_rate": 7.411183653120493e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.8308546274900437, + "num_tokens": 45150311.0, + "step": 37590 + }, + { + "entropy": 1.8115593805909156, + "epoch": 0.11655671018685497, + "grad_norm": 10.99833869934082, + "learning_rate": 7.410198031628346e-06, + "loss": 0.5517, + "mean_token_accuracy": 0.8222774997353554, + "num_tokens": 45161956.0, + "step": 37600 + }, + { + "entropy": 1.7148562341928482, + "epoch": 0.11658770931190467, + "grad_norm": 9.413897514343262, + "learning_rate": 7.409212803268305e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.8297837525606155, + "num_tokens": 45175439.0, + "step": 37610 + }, + { + "entropy": 1.8933440566062927, + "epoch": 0.11661870843695436, + "grad_norm": 9.255085945129395, + "learning_rate": 7.408227967779092e-06, + "loss": 0.6599, + "mean_token_accuracy": 0.8186682641506196, + "num_tokens": 45187297.0, + "step": 37620 + }, + { + "entropy": 1.8789986670017242, + "epoch": 0.11664970756200406, + "grad_norm": 7.9891180992126465, + "learning_rate": 7.407243524899674e-06, + "loss": 0.5768, + "mean_token_accuracy": 0.82134408056736, + "num_tokens": 45198638.0, + "step": 37630 + }, + { + "entropy": 1.9211965501308441, + "epoch": 0.11668070668705376, + "grad_norm": 7.830356597900391, + "learning_rate": 7.40625947436926e-06, + "loss": 0.6321, + "mean_token_accuracy": 0.813792322576046, + "num_tokens": 45209458.0, + "step": 37640 + }, + { + "entropy": 1.853388948738575, + "epoch": 0.11671170581210345, + "grad_norm": 9.149991989135742, + "learning_rate": 7.405275815927302e-06, + "loss": 0.5775, + "mean_token_accuracy": 0.8271188646554947, + "num_tokens": 45220490.0, + "step": 37650 + }, + { + "entropy": 1.8560687810182572, + "epoch": 0.11674270493715315, + "grad_norm": 10.992624282836914, + "learning_rate": 7.404292549313496e-06, + "loss": 0.5735, + "mean_token_accuracy": 0.8299055308103561, + "num_tokens": 45231538.0, + "step": 37660 + }, + { + "entropy": 1.8339884772896766, + "epoch": 0.11677370406220285, + "grad_norm": 9.377613067626953, + "learning_rate": 7.403309674267774e-06, + "loss": 0.5771, + "mean_token_accuracy": 0.8145007371902466, + "num_tokens": 45243273.0, + "step": 37670 + }, + { + "entropy": 1.8108773604035378, + "epoch": 0.11680470318725254, + "grad_norm": 8.911115646362305, + "learning_rate": 7.402327190530316e-06, + "loss": 0.5862, + "mean_token_accuracy": 0.8245181262493133, + "num_tokens": 45255185.0, + "step": 37680 + }, + { + "entropy": 1.7524414852261543, + "epoch": 0.11683570231230224, + "grad_norm": 8.754755973815918, + "learning_rate": 7.40134509784154e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8447420582175255, + "num_tokens": 45268461.0, + "step": 37690 + }, + { + "entropy": 1.853088989853859, + "epoch": 0.11686670143735194, + "grad_norm": 8.089972496032715, + "learning_rate": 7.400363395942107e-06, + "loss": 0.5872, + "mean_token_accuracy": 0.8264397710561753, + "num_tokens": 45279403.0, + "step": 37700 + }, + { + "entropy": 1.8071050971746445, + "epoch": 0.11689770056240163, + "grad_norm": 4.270401954650879, + "learning_rate": 7.399382084572917e-06, + "loss": 0.5712, + "mean_token_accuracy": 0.829101575911045, + "num_tokens": 45292068.0, + "step": 37710 + }, + { + "entropy": 1.878752313554287, + "epoch": 0.11692869968745132, + "grad_norm": 5.220165252685547, + "learning_rate": 7.39840116347511e-06, + "loss": 0.5356, + "mean_token_accuracy": 0.828040985763073, + "num_tokens": 45303427.0, + "step": 37720 + }, + { + "entropy": 1.860033529996872, + "epoch": 0.11695969881250101, + "grad_norm": 8.818233489990234, + "learning_rate": 7.397420632390068e-06, + "loss": 0.5614, + "mean_token_accuracy": 0.8269825220108032, + "num_tokens": 45314769.0, + "step": 37730 + }, + { + "entropy": 1.88105977922678, + "epoch": 0.11699069793755071, + "grad_norm": 8.774882316589355, + "learning_rate": 7.396440491059412e-06, + "loss": 0.5873, + "mean_token_accuracy": 0.82732093334198, + "num_tokens": 45326083.0, + "step": 37740 + }, + { + "entropy": 1.9004520326852798, + "epoch": 0.1170216970626004, + "grad_norm": 7.999516487121582, + "learning_rate": 7.395460739225003e-06, + "loss": 0.5686, + "mean_token_accuracy": 0.8362064957618713, + "num_tokens": 45337413.0, + "step": 37750 + }, + { + "entropy": 1.858451707661152, + "epoch": 0.1170526961876501, + "grad_norm": 10.978395462036133, + "learning_rate": 7.39448137662894e-06, + "loss": 0.583, + "mean_token_accuracy": 0.8314261004328728, + "num_tokens": 45349411.0, + "step": 37760 + }, + { + "entropy": 1.7877505451440812, + "epoch": 0.1170836953126998, + "grad_norm": 8.580302238464355, + "learning_rate": 7.393502403013563e-06, + "loss": 0.539, + "mean_token_accuracy": 0.8316345691680909, + "num_tokens": 45362113.0, + "step": 37770 + }, + { + "entropy": 1.7756628662347793, + "epoch": 0.1171146944377495, + "grad_norm": 2.5867841243743896, + "learning_rate": 7.3925238181214465e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8359642580151558, + "num_tokens": 45375493.0, + "step": 37780 + }, + { + "entropy": 1.9025995954871178, + "epoch": 0.11714569356279919, + "grad_norm": 8.61524486541748, + "learning_rate": 7.39154562169541e-06, + "loss": 0.6269, + "mean_token_accuracy": 0.8146417796611786, + "num_tokens": 45386898.0, + "step": 37790 + }, + { + "entropy": 1.8143214404582977, + "epoch": 0.11717669268784889, + "grad_norm": 4.412146091461182, + "learning_rate": 7.390567813478508e-06, + "loss": 0.5244, + "mean_token_accuracy": 0.8260921567678452, + "num_tokens": 45398636.0, + "step": 37800 + }, + { + "entropy": 1.8221836000680924, + "epoch": 0.11720769181289858, + "grad_norm": 9.057048797607422, + "learning_rate": 7.38959039321403e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8306093484163284, + "num_tokens": 45411214.0, + "step": 37810 + }, + { + "entropy": 1.7029403105378151, + "epoch": 0.11723869093794828, + "grad_norm": 3.986116886138916, + "learning_rate": 7.388613360645508e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8498943701386452, + "num_tokens": 45424321.0, + "step": 37820 + }, + { + "entropy": 1.8506017461419106, + "epoch": 0.11726969006299798, + "grad_norm": 11.799568176269531, + "learning_rate": 7.387636715516706e-06, + "loss": 0.5875, + "mean_token_accuracy": 0.8202611520886421, + "num_tokens": 45435300.0, + "step": 37830 + }, + { + "entropy": 1.7843811869621278, + "epoch": 0.11730068918804766, + "grad_norm": 9.326491355895996, + "learning_rate": 7.38666045757163e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.8335476636886596, + "num_tokens": 45448436.0, + "step": 37840 + }, + { + "entropy": 1.7802442833781242, + "epoch": 0.11733168831309736, + "grad_norm": 9.365975379943848, + "learning_rate": 7.38568458655452e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8346278235316277, + "num_tokens": 45461982.0, + "step": 37850 + }, + { + "entropy": 1.774220162630081, + "epoch": 0.11736268743814705, + "grad_norm": 5.482513427734375, + "learning_rate": 7.384709102209855e-06, + "loss": 0.5163, + "mean_token_accuracy": 0.8305883064866066, + "num_tokens": 45474506.0, + "step": 37860 + }, + { + "entropy": 1.7526094153523446, + "epoch": 0.11739368656319675, + "grad_norm": 4.42122220993042, + "learning_rate": 7.383734004282347e-06, + "loss": 0.5358, + "mean_token_accuracy": 0.8322111248970032, + "num_tokens": 45487584.0, + "step": 37870 + }, + { + "entropy": 1.7954672902822495, + "epoch": 0.11742468568824645, + "grad_norm": 9.287115097045898, + "learning_rate": 7.382759292516944e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8465468645095825, + "num_tokens": 45499890.0, + "step": 37880 + }, + { + "entropy": 1.8831429094076158, + "epoch": 0.11745568481329614, + "grad_norm": 8.810885429382324, + "learning_rate": 7.3817849666588325e-06, + "loss": 0.5603, + "mean_token_accuracy": 0.8357957974076271, + "num_tokens": 45511132.0, + "step": 37890 + }, + { + "entropy": 1.8124101281166076, + "epoch": 0.11748668393834584, + "grad_norm": 8.837002754211426, + "learning_rate": 7.3808110264534325e-06, + "loss": 0.5354, + "mean_token_accuracy": 0.8318711072206497, + "num_tokens": 45523253.0, + "step": 37900 + }, + { + "entropy": 1.8693040862679482, + "epoch": 0.11751768306339554, + "grad_norm": 7.558449745178223, + "learning_rate": 7.379837471646401e-06, + "loss": 0.6194, + "mean_token_accuracy": 0.817514568567276, + "num_tokens": 45534537.0, + "step": 37910 + }, + { + "entropy": 1.8211430594325067, + "epoch": 0.11754868218844523, + "grad_norm": 9.2406005859375, + "learning_rate": 7.378864301983624e-06, + "loss": 0.521, + "mean_token_accuracy": 0.8348601713776589, + "num_tokens": 45547222.0, + "step": 37920 + }, + { + "entropy": 1.788476151227951, + "epoch": 0.11757968131349493, + "grad_norm": 8.003228187561035, + "learning_rate": 7.3778915172112294e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8349843844771385, + "num_tokens": 45558966.0, + "step": 37930 + }, + { + "entropy": 1.8203987568616866, + "epoch": 0.11761068043854463, + "grad_norm": 8.81309700012207, + "learning_rate": 7.376919117075574e-06, + "loss": 0.5361, + "mean_token_accuracy": 0.8295062810182572, + "num_tokens": 45571715.0, + "step": 37940 + }, + { + "entropy": 1.8281734257936477, + "epoch": 0.11764167956359432, + "grad_norm": 9.268831253051758, + "learning_rate": 7.375947101323252e-06, + "loss": 0.5534, + "mean_token_accuracy": 0.8323431298136711, + "num_tokens": 45583585.0, + "step": 37950 + }, + { + "entropy": 1.8576543658971787, + "epoch": 0.11767267868864402, + "grad_norm": 9.705360412597656, + "learning_rate": 7.37497546970109e-06, + "loss": 0.5438, + "mean_token_accuracy": 0.8329675674438477, + "num_tokens": 45595651.0, + "step": 37960 + }, + { + "entropy": 1.7942541763186455, + "epoch": 0.1177036778136937, + "grad_norm": 8.475988388061523, + "learning_rate": 7.374004221956146e-06, + "loss": 0.5249, + "mean_token_accuracy": 0.8307255417108536, + "num_tokens": 45608005.0, + "step": 37970 + }, + { + "entropy": 1.7157593086361884, + "epoch": 0.1177346769387434, + "grad_norm": 10.300793647766113, + "learning_rate": 7.373033357835715e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8378100886940956, + "num_tokens": 45621783.0, + "step": 37980 + }, + { + "entropy": 1.8599382385611534, + "epoch": 0.1177656760637931, + "grad_norm": 9.142414093017578, + "learning_rate": 7.372062877087321e-06, + "loss": 0.5541, + "mean_token_accuracy": 0.8253407448530197, + "num_tokens": 45632804.0, + "step": 37990 + }, + { + "entropy": 1.8708630874752998, + "epoch": 0.11779667518884279, + "grad_norm": 9.671748161315918, + "learning_rate": 7.371092779458723e-06, + "loss": 0.5699, + "mean_token_accuracy": 0.8188276901841164, + "num_tokens": 45644415.0, + "step": 38000 + }, + { + "entropy": 1.8491891831159593, + "epoch": 0.11782767431389249, + "grad_norm": 8.546015739440918, + "learning_rate": 7.3701230646979126e-06, + "loss": 0.551, + "mean_token_accuracy": 0.8339832827448845, + "num_tokens": 45655755.0, + "step": 38010 + }, + { + "entropy": 1.8837746277451515, + "epoch": 0.11785867343894219, + "grad_norm": 10.270685195922852, + "learning_rate": 7.369153732553109e-06, + "loss": 0.575, + "mean_token_accuracy": 0.8240560159087181, + "num_tokens": 45666750.0, + "step": 38020 + }, + { + "entropy": 1.8285693258047104, + "epoch": 0.11788967256399188, + "grad_norm": 8.460458755493164, + "learning_rate": 7.368184782772773e-06, + "loss": 0.5454, + "mean_token_accuracy": 0.8289884492754936, + "num_tokens": 45678928.0, + "step": 38030 + }, + { + "entropy": 1.903639057278633, + "epoch": 0.11792067168904158, + "grad_norm": 9.557387351989746, + "learning_rate": 7.367216215105582e-06, + "loss": 0.6053, + "mean_token_accuracy": 0.8232555508613586, + "num_tokens": 45689650.0, + "step": 38040 + }, + { + "entropy": 1.840640588104725, + "epoch": 0.11795167081409128, + "grad_norm": 10.47343635559082, + "learning_rate": 7.366248029300462e-06, + "loss": 0.551, + "mean_token_accuracy": 0.8262113586068154, + "num_tokens": 45701186.0, + "step": 38050 + }, + { + "entropy": 1.6772912368178368, + "epoch": 0.11798266993914097, + "grad_norm": 5.621872425079346, + "learning_rate": 7.365280225106553e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8406716391444207, + "num_tokens": 45715025.0, + "step": 38060 + }, + { + "entropy": 1.674746471643448, + "epoch": 0.11801366906419067, + "grad_norm": 8.922220230102539, + "learning_rate": 7.364312802273238e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8508239656686782, + "num_tokens": 45729652.0, + "step": 38070 + }, + { + "entropy": 1.8886992782354355, + "epoch": 0.11804466818924036, + "grad_norm": 9.37268352508545, + "learning_rate": 7.3633457605501245e-06, + "loss": 0.5581, + "mean_token_accuracy": 0.8332405045628548, + "num_tokens": 45740799.0, + "step": 38080 + }, + { + "entropy": 1.7671217247843742, + "epoch": 0.11807566731429005, + "grad_norm": 3.940173387527466, + "learning_rate": 7.362379099687053e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8395871177315712, + "num_tokens": 45753706.0, + "step": 38090 + }, + { + "entropy": 1.83549507856369, + "epoch": 0.11810666643933974, + "grad_norm": 10.712289810180664, + "learning_rate": 7.3614128194340895e-06, + "loss": 0.5369, + "mean_token_accuracy": 0.8255318030714989, + "num_tokens": 45765958.0, + "step": 38100 + }, + { + "entropy": 1.775768581032753, + "epoch": 0.11813766556438944, + "grad_norm": 9.007753372192383, + "learning_rate": 7.360446919541537e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.8286249712109566, + "num_tokens": 45778973.0, + "step": 38110 + }, + { + "entropy": 1.8200758472085, + "epoch": 0.11816866468943914, + "grad_norm": 9.196110725402832, + "learning_rate": 7.359481399759919e-06, + "loss": 0.5358, + "mean_token_accuracy": 0.8254531383514404, + "num_tokens": 45791157.0, + "step": 38120 + }, + { + "entropy": 1.8400531589984894, + "epoch": 0.11819966381448883, + "grad_norm": 8.93841552734375, + "learning_rate": 7.358516259839993e-06, + "loss": 0.5626, + "mean_token_accuracy": 0.8274215057492256, + "num_tokens": 45802910.0, + "step": 38130 + }, + { + "entropy": 1.8769369035959245, + "epoch": 0.11823066293953853, + "grad_norm": 8.658340454101562, + "learning_rate": 7.3575514995327465e-06, + "loss": 0.59, + "mean_token_accuracy": 0.8198636502027512, + "num_tokens": 45814938.0, + "step": 38140 + }, + { + "entropy": 1.8544082716107368, + "epoch": 0.11826166206458823, + "grad_norm": 10.661956787109375, + "learning_rate": 7.3565871185893936e-06, + "loss": 0.5401, + "mean_token_accuracy": 0.833048839867115, + "num_tokens": 45826509.0, + "step": 38150 + }, + { + "entropy": 1.8468938201665879, + "epoch": 0.11829266118963792, + "grad_norm": 8.67770004272461, + "learning_rate": 7.3556231167613724e-06, + "loss": 0.5409, + "mean_token_accuracy": 0.829650467634201, + "num_tokens": 45838046.0, + "step": 38160 + }, + { + "entropy": 1.8621126562356949, + "epoch": 0.11832366031468762, + "grad_norm": 6.933115005493164, + "learning_rate": 7.354659493800356e-06, + "loss": 0.5648, + "mean_token_accuracy": 0.8264819413423539, + "num_tokens": 45850028.0, + "step": 38170 + }, + { + "entropy": 1.8032435089349748, + "epoch": 0.11835465943973732, + "grad_norm": 7.068675994873047, + "learning_rate": 7.353696249458242e-06, + "loss": 0.5598, + "mean_token_accuracy": 0.8219858318567276, + "num_tokens": 45862980.0, + "step": 38180 + }, + { + "entropy": 1.8914562717080117, + "epoch": 0.11838565856478701, + "grad_norm": 8.82200813293457, + "learning_rate": 7.352733383487156e-06, + "loss": 0.6047, + "mean_token_accuracy": 0.8125669702887535, + "num_tokens": 45875364.0, + "step": 38190 + }, + { + "entropy": 1.77557475566864, + "epoch": 0.11841665768983671, + "grad_norm": 4.266529083251953, + "learning_rate": 7.3517708956394485e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.843745194375515, + "num_tokens": 45887863.0, + "step": 38200 + }, + { + "entropy": 1.8870860502123832, + "epoch": 0.1184476568148864, + "grad_norm": 9.665167808532715, + "learning_rate": 7.3508087856677e-06, + "loss": 0.5614, + "mean_token_accuracy": 0.8245103910565377, + "num_tokens": 45899658.0, + "step": 38210 + }, + { + "entropy": 1.7931729927659035, + "epoch": 0.11847865593993609, + "grad_norm": 10.355236053466797, + "learning_rate": 7.3498470533247175e-06, + "loss": 0.5392, + "mean_token_accuracy": 0.8335598543286323, + "num_tokens": 45912392.0, + "step": 38220 + }, + { + "entropy": 1.739191673696041, + "epoch": 0.11850965506498579, + "grad_norm": 4.354625225067139, + "learning_rate": 7.34888569836353e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8427866518497467, + "num_tokens": 45925460.0, + "step": 38230 + }, + { + "entropy": 1.832131990790367, + "epoch": 0.11854065419003548, + "grad_norm": 11.056679725646973, + "learning_rate": 7.347924720537399e-06, + "loss": 0.5573, + "mean_token_accuracy": 0.8355686992406846, + "num_tokens": 45937998.0, + "step": 38240 + }, + { + "entropy": 1.8358628578484057, + "epoch": 0.11857165331508518, + "grad_norm": 2.7926924228668213, + "learning_rate": 7.346964119599805e-06, + "loss": 0.5361, + "mean_token_accuracy": 0.8267253056168556, + "num_tokens": 45950095.0, + "step": 38250 + }, + { + "entropy": 1.8598995104432106, + "epoch": 0.11860265244013488, + "grad_norm": 9.030755996704102, + "learning_rate": 7.346003895304459e-06, + "loss": 0.5634, + "mean_token_accuracy": 0.8271134316921234, + "num_tokens": 45962187.0, + "step": 38260 + }, + { + "entropy": 1.8761465281248093, + "epoch": 0.11863365156518457, + "grad_norm": 4.900582313537598, + "learning_rate": 7.345044047405296e-06, + "loss": 0.5422, + "mean_token_accuracy": 0.8255822539329529, + "num_tokens": 45974742.0, + "step": 38270 + }, + { + "entropy": 1.8882175594568253, + "epoch": 0.11866465069023427, + "grad_norm": 9.869363784790039, + "learning_rate": 7.344084575656477e-06, + "loss": 0.578, + "mean_token_accuracy": 0.818389254808426, + "num_tokens": 45986513.0, + "step": 38280 + }, + { + "entropy": 1.9118270069360732, + "epoch": 0.11869564981528397, + "grad_norm": 10.015856742858887, + "learning_rate": 7.343125479812384e-06, + "loss": 0.626, + "mean_token_accuracy": 0.8135067865252494, + "num_tokens": 45998695.0, + "step": 38290 + }, + { + "entropy": 1.7809939235448837, + "epoch": 0.11872664894033366, + "grad_norm": 2.5885379314422607, + "learning_rate": 7.342166759627627e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8359186798334122, + "num_tokens": 46011982.0, + "step": 38300 + }, + { + "entropy": 1.90637346804142, + "epoch": 0.11875764806538336, + "grad_norm": 9.184483528137207, + "learning_rate": 7.341208414857039e-06, + "loss": 0.5245, + "mean_token_accuracy": 0.8383510798215866, + "num_tokens": 46023222.0, + "step": 38310 + }, + { + "entropy": 1.9249227464199066, + "epoch": 0.11878864719043306, + "grad_norm": 8.180423736572266, + "learning_rate": 7.340250445255678e-06, + "loss": 0.591, + "mean_token_accuracy": 0.822523207962513, + "num_tokens": 46033818.0, + "step": 38320 + }, + { + "entropy": 1.8500513106584549, + "epoch": 0.11881964631548275, + "grad_norm": 8.05856990814209, + "learning_rate": 7.3392928505788245e-06, + "loss": 0.5562, + "mean_token_accuracy": 0.8258534908294678, + "num_tokens": 46045985.0, + "step": 38330 + }, + { + "entropy": 1.891994585096836, + "epoch": 0.11885064544053243, + "grad_norm": 10.122121810913086, + "learning_rate": 7.338335630581982e-06, + "loss": 0.5748, + "mean_token_accuracy": 0.8260172605514526, + "num_tokens": 46057552.0, + "step": 38340 + }, + { + "entropy": 1.9085305988788606, + "epoch": 0.11888164456558213, + "grad_norm": 8.372586250305176, + "learning_rate": 7.3373787850208775e-06, + "loss": 0.5706, + "mean_token_accuracy": 0.8344064444303513, + "num_tokens": 46068444.0, + "step": 38350 + }, + { + "entropy": 1.8459767132997513, + "epoch": 0.11891264369063183, + "grad_norm": 9.715620040893555, + "learning_rate": 7.336422313651464e-06, + "loss": 0.5169, + "mean_token_accuracy": 0.83453588783741, + "num_tokens": 46080267.0, + "step": 38360 + }, + { + "entropy": 1.966032502055168, + "epoch": 0.11894364281568152, + "grad_norm": 8.732338905334473, + "learning_rate": 7.33546621622991e-06, + "loss": 0.6502, + "mean_token_accuracy": 0.8121734812855721, + "num_tokens": 46091339.0, + "step": 38370 + }, + { + "entropy": 1.8556820943951606, + "epoch": 0.11897464194073122, + "grad_norm": 10.176673889160156, + "learning_rate": 7.334510492512614e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.837830537557602, + "num_tokens": 46102953.0, + "step": 38380 + }, + { + "entropy": 1.883063942193985, + "epoch": 0.11900564106578092, + "grad_norm": 7.522932052612305, + "learning_rate": 7.3335551422561916e-06, + "loss": 0.5412, + "mean_token_accuracy": 0.8345891013741493, + "num_tokens": 46114625.0, + "step": 38390 + }, + { + "entropy": 1.8097529754042625, + "epoch": 0.11903664019083061, + "grad_norm": 3.6872365474700928, + "learning_rate": 7.3326001652174846e-06, + "loss": 0.515, + "mean_token_accuracy": 0.8357144206762314, + "num_tokens": 46126642.0, + "step": 38400 + }, + { + "entropy": 1.8187772005796432, + "epoch": 0.11906763931588031, + "grad_norm": 11.781440734863281, + "learning_rate": 7.331645561153551e-06, + "loss": 0.5275, + "mean_token_accuracy": 0.8383165881037712, + "num_tokens": 46138142.0, + "step": 38410 + }, + { + "entropy": 1.8450339302420615, + "epoch": 0.11909863844093001, + "grad_norm": 12.367788314819336, + "learning_rate": 7.330691329821676e-06, + "loss": 0.5472, + "mean_token_accuracy": 0.8337818294763565, + "num_tokens": 46149774.0, + "step": 38420 + }, + { + "entropy": 1.8153291314840316, + "epoch": 0.1191296375659797, + "grad_norm": 8.434037208557129, + "learning_rate": 7.329737470979359e-06, + "loss": 0.5374, + "mean_token_accuracy": 0.8328280210494995, + "num_tokens": 46162425.0, + "step": 38430 + }, + { + "entropy": 1.8206452041864396, + "epoch": 0.1191606366910294, + "grad_norm": 8.897926330566406, + "learning_rate": 7.328783984384326e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8331076070666313, + "num_tokens": 46175025.0, + "step": 38440 + }, + { + "entropy": 1.8406914666295051, + "epoch": 0.1191916358160791, + "grad_norm": 9.005738258361816, + "learning_rate": 7.327830869794524e-06, + "loss": 0.5414, + "mean_token_accuracy": 0.8288857981562614, + "num_tokens": 46187094.0, + "step": 38450 + }, + { + "entropy": 1.8507864832878114, + "epoch": 0.11922263494112878, + "grad_norm": 5.744321823120117, + "learning_rate": 7.326878126968114e-06, + "loss": 0.5862, + "mean_token_accuracy": 0.815381346642971, + "num_tokens": 46199554.0, + "step": 38460 + }, + { + "entropy": 1.8325456768274306, + "epoch": 0.11925363406617848, + "grad_norm": 10.185477256774902, + "learning_rate": 7.325925755663483e-06, + "loss": 0.5335, + "mean_token_accuracy": 0.8304981887340546, + "num_tokens": 46211540.0, + "step": 38470 + }, + { + "entropy": 1.8854450479149818, + "epoch": 0.11928463319122817, + "grad_norm": 8.560661315917969, + "learning_rate": 7.324973755639235e-06, + "loss": 0.5288, + "mean_token_accuracy": 0.823890820145607, + "num_tokens": 46223017.0, + "step": 38480 + }, + { + "entropy": 1.892309932410717, + "epoch": 0.11931563231627787, + "grad_norm": 9.574992179870605, + "learning_rate": 7.324022126654195e-06, + "loss": 0.5751, + "mean_token_accuracy": 0.823742826282978, + "num_tokens": 46235073.0, + "step": 38490 + }, + { + "entropy": 1.814597336947918, + "epoch": 0.11934663144132757, + "grad_norm": 8.49984359741211, + "learning_rate": 7.323070868467407e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8360254690051079, + "num_tokens": 46247594.0, + "step": 38500 + }, + { + "entropy": 1.8429415673017502, + "epoch": 0.11937763056637726, + "grad_norm": 8.082172393798828, + "learning_rate": 7.322119980838131e-06, + "loss": 0.5642, + "mean_token_accuracy": 0.8176757171750069, + "num_tokens": 46259339.0, + "step": 38510 + }, + { + "entropy": 1.8391290426254272, + "epoch": 0.11940862969142696, + "grad_norm": 8.323386192321777, + "learning_rate": 7.32116946352585e-06, + "loss": 0.5536, + "mean_token_accuracy": 0.8202460646629334, + "num_tokens": 46271303.0, + "step": 38520 + }, + { + "entropy": 1.8703968793153762, + "epoch": 0.11943962881647666, + "grad_norm": 4.209524631500244, + "learning_rate": 7.3202193162902655e-06, + "loss": 0.5362, + "mean_token_accuracy": 0.8387380838394165, + "num_tokens": 46282368.0, + "step": 38530 + }, + { + "entropy": 1.8230261772871017, + "epoch": 0.11947062794152635, + "grad_norm": 9.72610855102539, + "learning_rate": 7.319269538891293e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.8248476162552834, + "num_tokens": 46294921.0, + "step": 38540 + }, + { + "entropy": 1.827505610883236, + "epoch": 0.11950162706657605, + "grad_norm": 8.594587326049805, + "learning_rate": 7.318320131089069e-06, + "loss": 0.5674, + "mean_token_accuracy": 0.826340651512146, + "num_tokens": 46307519.0, + "step": 38550 + }, + { + "entropy": 1.8293501362204552, + "epoch": 0.11953262619162575, + "grad_norm": 7.893844127655029, + "learning_rate": 7.31737109264395e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8477142170071602, + "num_tokens": 46319880.0, + "step": 38560 + }, + { + "entropy": 1.967301172018051, + "epoch": 0.11956362531667544, + "grad_norm": 10.387765884399414, + "learning_rate": 7.316422423316503e-06, + "loss": 0.6318, + "mean_token_accuracy": 0.8172219157218933, + "num_tokens": 46331125.0, + "step": 38570 + }, + { + "entropy": 1.9015961229801177, + "epoch": 0.11959462444172514, + "grad_norm": 8.525230407714844, + "learning_rate": 7.315474122867519e-06, + "loss": 0.5207, + "mean_token_accuracy": 0.8258678615093231, + "num_tokens": 46343547.0, + "step": 38580 + }, + { + "entropy": 1.877706202864647, + "epoch": 0.11962562356677482, + "grad_norm": 8.298994064331055, + "learning_rate": 7.314526191058002e-06, + "loss": 0.5629, + "mean_token_accuracy": 0.8354897379875184, + "num_tokens": 46354401.0, + "step": 38590 + }, + { + "entropy": 1.7901566669344902, + "epoch": 0.11965662269182452, + "grad_norm": 8.73811149597168, + "learning_rate": 7.313578627649177e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8303673461079597, + "num_tokens": 46367136.0, + "step": 38600 + }, + { + "entropy": 1.855716420710087, + "epoch": 0.11968762181687422, + "grad_norm": 8.627557754516602, + "learning_rate": 7.31263143240248e-06, + "loss": 0.552, + "mean_token_accuracy": 0.8318748503923417, + "num_tokens": 46378695.0, + "step": 38610 + }, + { + "entropy": 1.8308327570557594, + "epoch": 0.11971862094192391, + "grad_norm": 4.684299468994141, + "learning_rate": 7.3116846050795675e-06, + "loss": 0.5274, + "mean_token_accuracy": 0.8276481464505195, + "num_tokens": 46390539.0, + "step": 38620 + }, + { + "entropy": 1.7808781787753105, + "epoch": 0.11974962006697361, + "grad_norm": 9.359618186950684, + "learning_rate": 7.3107381454423095e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8304883912205696, + "num_tokens": 46403361.0, + "step": 38630 + }, + { + "entropy": 1.832536792755127, + "epoch": 0.1197806191920233, + "grad_norm": 9.308186531066895, + "learning_rate": 7.3097920532527925e-06, + "loss": 0.5477, + "mean_token_accuracy": 0.8254012614488602, + "num_tokens": 46415320.0, + "step": 38640 + }, + { + "entropy": 1.875816921889782, + "epoch": 0.119811618317073, + "grad_norm": 6.979348182678223, + "learning_rate": 7.3088463282733195e-06, + "loss": 0.6144, + "mean_token_accuracy": 0.8188940063118935, + "num_tokens": 46426785.0, + "step": 38650 + }, + { + "entropy": 1.896639221906662, + "epoch": 0.1198426174421227, + "grad_norm": 8.80232048034668, + "learning_rate": 7.307900970266406e-06, + "loss": 0.5687, + "mean_token_accuracy": 0.831016306579113, + "num_tokens": 46437545.0, + "step": 38660 + }, + { + "entropy": 1.8298520892858505, + "epoch": 0.1198736165671724, + "grad_norm": 6.179274559020996, + "learning_rate": 7.3069559789947875e-06, + "loss": 0.6193, + "mean_token_accuracy": 0.8189263239502906, + "num_tokens": 46449253.0, + "step": 38670 + }, + { + "entropy": 1.8082061618566514, + "epoch": 0.11990461569222209, + "grad_norm": 7.518819808959961, + "learning_rate": 7.306011354221407e-06, + "loss": 0.5346, + "mean_token_accuracy": 0.8381061047315598, + "num_tokens": 46461547.0, + "step": 38680 + }, + { + "entropy": 1.8741240479052066, + "epoch": 0.11993561481727179, + "grad_norm": 8.276564598083496, + "learning_rate": 7.305067095709427e-06, + "loss": 0.5714, + "mean_token_accuracy": 0.8233344674110412, + "num_tokens": 46473612.0, + "step": 38690 + }, + { + "entropy": 1.82129565179348, + "epoch": 0.11996661394232148, + "grad_norm": 3.840449571609497, + "learning_rate": 7.3041232032222255e-06, + "loss": 0.5298, + "mean_token_accuracy": 0.8374928280711174, + "num_tokens": 46485577.0, + "step": 38700 + }, + { + "entropy": 1.7987656205892564, + "epoch": 0.11999761306737117, + "grad_norm": 9.687265396118164, + "learning_rate": 7.303179676523391e-06, + "loss": 0.5269, + "mean_token_accuracy": 0.8323082059621811, + "num_tokens": 46497845.0, + "step": 38710 + }, + { + "entropy": 1.8387595310807228, + "epoch": 0.12002861219242086, + "grad_norm": 7.756543159484863, + "learning_rate": 7.302236515376725e-06, + "loss": 0.5594, + "mean_token_accuracy": 0.8210761576890946, + "num_tokens": 46510088.0, + "step": 38720 + }, + { + "entropy": 1.7564123407006265, + "epoch": 0.12005961131747056, + "grad_norm": 8.537627220153809, + "learning_rate": 7.301293719546245e-06, + "loss": 0.5524, + "mean_token_accuracy": 0.831873744726181, + "num_tokens": 46522634.0, + "step": 38730 + }, + { + "entropy": 1.8440282315015792, + "epoch": 0.12009061044252026, + "grad_norm": 7.22578763961792, + "learning_rate": 7.300351288796182e-06, + "loss": 0.575, + "mean_token_accuracy": 0.8209284529089927, + "num_tokens": 46534334.0, + "step": 38740 + }, + { + "entropy": 1.8955597847700119, + "epoch": 0.12012160956756995, + "grad_norm": 8.206884384155273, + "learning_rate": 7.299409222890979e-06, + "loss": 0.6, + "mean_token_accuracy": 0.8230985984206199, + "num_tokens": 46546015.0, + "step": 38750 + }, + { + "entropy": 1.831144355237484, + "epoch": 0.12015260869261965, + "grad_norm": 9.138228416442871, + "learning_rate": 7.298467521595293e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8403099969029426, + "num_tokens": 46558715.0, + "step": 38760 + }, + { + "entropy": 1.7825349926948548, + "epoch": 0.12018360781766935, + "grad_norm": 8.191915512084961, + "learning_rate": 7.297526184673988e-06, + "loss": 0.5195, + "mean_token_accuracy": 0.8280392870306968, + "num_tokens": 46572867.0, + "step": 38770 + }, + { + "entropy": 1.8217300280928612, + "epoch": 0.12021460694271904, + "grad_norm": 8.535613059997559, + "learning_rate": 7.296585211892146e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8408209979534149, + "num_tokens": 46585902.0, + "step": 38780 + }, + { + "entropy": 1.8220679610967636, + "epoch": 0.12024560606776874, + "grad_norm": 4.572232246398926, + "learning_rate": 7.295644603015063e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8344119265675545, + "num_tokens": 46599076.0, + "step": 38790 + }, + { + "entropy": 1.8718759939074516, + "epoch": 0.12027660519281844, + "grad_norm": 9.355148315429688, + "learning_rate": 7.294704357808237e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.8285117372870445, + "num_tokens": 46611131.0, + "step": 38800 + }, + { + "entropy": 1.901232734322548, + "epoch": 0.12030760431786813, + "grad_norm": 10.37480354309082, + "learning_rate": 7.2937644760373896e-06, + "loss": 0.5814, + "mean_token_accuracy": 0.8172040551900863, + "num_tokens": 46623342.0, + "step": 38810 + }, + { + "entropy": 1.8803485922515393, + "epoch": 0.12033860344291783, + "grad_norm": 9.32126235961914, + "learning_rate": 7.292824957468444e-06, + "loss": 0.5681, + "mean_token_accuracy": 0.8315271034836769, + "num_tokens": 46634924.0, + "step": 38820 + }, + { + "entropy": 1.9120069742202759, + "epoch": 0.12036960256796751, + "grad_norm": 10.770748138427734, + "learning_rate": 7.29188580186754e-06, + "loss": 0.5592, + "mean_token_accuracy": 0.8313893929123879, + "num_tokens": 46647007.0, + "step": 38830 + }, + { + "entropy": 1.7526630774140357, + "epoch": 0.12040060169301721, + "grad_norm": 4.222184181213379, + "learning_rate": 7.290947009001024e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8405937299132347, + "num_tokens": 46660402.0, + "step": 38840 + }, + { + "entropy": 1.8250176429748535, + "epoch": 0.1204316008180669, + "grad_norm": 8.69330883026123, + "learning_rate": 7.290008578635457e-06, + "loss": 0.5383, + "mean_token_accuracy": 0.82737468034029, + "num_tokens": 46673041.0, + "step": 38850 + }, + { + "entropy": 1.8118784487247468, + "epoch": 0.1204625999431166, + "grad_norm": 9.943561553955078, + "learning_rate": 7.289070510537608e-06, + "loss": 0.5215, + "mean_token_accuracy": 0.8418055370450019, + "num_tokens": 46686648.0, + "step": 38860 + }, + { + "entropy": 1.9078863650560378, + "epoch": 0.1204935990681663, + "grad_norm": 9.226347923278809, + "learning_rate": 7.288132804474457e-06, + "loss": 0.5599, + "mean_token_accuracy": 0.8267140224575996, + "num_tokens": 46698591.0, + "step": 38870 + }, + { + "entropy": 1.8343891605734826, + "epoch": 0.120524598193216, + "grad_norm": 5.343838691711426, + "learning_rate": 7.28719546021319e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.8361982107162476, + "num_tokens": 46711162.0, + "step": 38880 + }, + { + "entropy": 1.8952967941761016, + "epoch": 0.12055559731826569, + "grad_norm": 7.90724515914917, + "learning_rate": 7.286258477521211e-06, + "loss": 0.5974, + "mean_token_accuracy": 0.8240769699215889, + "num_tokens": 46723517.0, + "step": 38890 + }, + { + "entropy": 1.8919255122542382, + "epoch": 0.12058659644331539, + "grad_norm": 8.763474464416504, + "learning_rate": 7.285321856166125e-06, + "loss": 0.5488, + "mean_token_accuracy": 0.8284026741981506, + "num_tokens": 46734522.0, + "step": 38900 + }, + { + "entropy": 1.9270798504352569, + "epoch": 0.12061759556836509, + "grad_norm": 8.891312599182129, + "learning_rate": 7.284385595915748e-06, + "loss": 0.5376, + "mean_token_accuracy": 0.8333567604422569, + "num_tokens": 46745723.0, + "step": 38910 + }, + { + "entropy": 1.904827606678009, + "epoch": 0.12064859469341478, + "grad_norm": 9.994085311889648, + "learning_rate": 7.283449696538109e-06, + "loss": 0.5673, + "mean_token_accuracy": 0.8297219768166542, + "num_tokens": 46757162.0, + "step": 38920 + }, + { + "entropy": 1.9497202724218368, + "epoch": 0.12067959381846448, + "grad_norm": 10.245466232299805, + "learning_rate": 7.282514157801443e-06, + "loss": 0.6198, + "mean_token_accuracy": 0.8121256932616234, + "num_tokens": 46768342.0, + "step": 38930 + }, + { + "entropy": 1.9423091277480125, + "epoch": 0.12071059294351418, + "grad_norm": 8.347733497619629, + "learning_rate": 7.2815789794741885e-06, + "loss": 0.5971, + "mean_token_accuracy": 0.8201627746224404, + "num_tokens": 46779722.0, + "step": 38940 + }, + { + "entropy": 1.9126493602991104, + "epoch": 0.12074159206856387, + "grad_norm": 8.225603103637695, + "learning_rate": 7.280644161324999e-06, + "loss": 0.5515, + "mean_token_accuracy": 0.8218429237604141, + "num_tokens": 46791464.0, + "step": 38950 + }, + { + "entropy": 1.8790960252285003, + "epoch": 0.12077259119361355, + "grad_norm": 8.749955177307129, + "learning_rate": 7.2797097031227335e-06, + "loss": 0.5397, + "mean_token_accuracy": 0.8350474625825882, + "num_tokens": 46804115.0, + "step": 38960 + }, + { + "entropy": 1.8715863823890686, + "epoch": 0.12080359031866325, + "grad_norm": 11.107318878173828, + "learning_rate": 7.278775604636458e-06, + "loss": 0.5789, + "mean_token_accuracy": 0.8246800765395165, + "num_tokens": 46815723.0, + "step": 38970 + }, + { + "entropy": 1.8465174466371537, + "epoch": 0.12083458944371295, + "grad_norm": 6.929959774017334, + "learning_rate": 7.277841865635446e-06, + "loss": 0.5764, + "mean_token_accuracy": 0.8237011566758156, + "num_tokens": 46828326.0, + "step": 38980 + }, + { + "entropy": 1.9157025456428527, + "epoch": 0.12086558856876264, + "grad_norm": 7.555674076080322, + "learning_rate": 7.27690848588918e-06, + "loss": 0.5341, + "mean_token_accuracy": 0.8201544851064682, + "num_tokens": 46840929.0, + "step": 38990 + }, + { + "entropy": 1.855295367538929, + "epoch": 0.12089658769381234, + "grad_norm": 9.253195762634277, + "learning_rate": 7.275975465167346e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8400891706347465, + "num_tokens": 46853363.0, + "step": 39000 + }, + { + "entropy": 1.9134544879198074, + "epoch": 0.12092758681886204, + "grad_norm": 9.56458568572998, + "learning_rate": 7.27504280323984e-06, + "loss": 0.5498, + "mean_token_accuracy": 0.8285094171762466, + "num_tokens": 46865204.0, + "step": 39010 + }, + { + "entropy": 1.8941469311714172, + "epoch": 0.12095858594391173, + "grad_norm": 8.927152633666992, + "learning_rate": 7.274110499876761e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.8449266985058784, + "num_tokens": 46876368.0, + "step": 39020 + }, + { + "entropy": 1.8193779736757278, + "epoch": 0.12098958506896143, + "grad_norm": 3.3484020233154297, + "learning_rate": 7.273178554848418e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8299607783555984, + "num_tokens": 46889571.0, + "step": 39030 + }, + { + "entropy": 1.8515197828412056, + "epoch": 0.12102058419401113, + "grad_norm": 11.17811393737793, + "learning_rate": 7.272246967925323e-06, + "loss": 0.5419, + "mean_token_accuracy": 0.8318181455135345, + "num_tokens": 46901011.0, + "step": 39040 + }, + { + "entropy": 1.9295257300138473, + "epoch": 0.12105158331906082, + "grad_norm": 9.795385360717773, + "learning_rate": 7.271315738878194e-06, + "loss": 0.5301, + "mean_token_accuracy": 0.8381862804293633, + "num_tokens": 46911801.0, + "step": 39050 + }, + { + "entropy": 1.955726158618927, + "epoch": 0.12108258244411052, + "grad_norm": 11.714072227478027, + "learning_rate": 7.270384867477956e-06, + "loss": 0.5593, + "mean_token_accuracy": 0.8203748896718025, + "num_tokens": 46923433.0, + "step": 39060 + }, + { + "entropy": 1.9114267766475677, + "epoch": 0.12111358156916022, + "grad_norm": 9.219902992248535, + "learning_rate": 7.269454353495741e-06, + "loss": 0.5248, + "mean_token_accuracy": 0.8343056261539459, + "num_tokens": 46934777.0, + "step": 39070 + }, + { + "entropy": 1.9058862313628198, + "epoch": 0.1211445806942099, + "grad_norm": 9.145793914794922, + "learning_rate": 7.26852419670288e-06, + "loss": 0.5637, + "mean_token_accuracy": 0.8218263626098633, + "num_tokens": 46947035.0, + "step": 39080 + }, + { + "entropy": 1.8588959291577338, + "epoch": 0.1211755798192596, + "grad_norm": 9.202445983886719, + "learning_rate": 7.267594396870911e-06, + "loss": 0.5226, + "mean_token_accuracy": 0.8362350627779961, + "num_tokens": 46959557.0, + "step": 39090 + }, + { + "entropy": 1.928936019539833, + "epoch": 0.12120657894430929, + "grad_norm": 11.063957214355469, + "learning_rate": 7.2666649537715814e-06, + "loss": 0.5909, + "mean_token_accuracy": 0.8108921408653259, + "num_tokens": 46971699.0, + "step": 39100 + }, + { + "entropy": 1.9692992329597474, + "epoch": 0.12123757806935899, + "grad_norm": 11.699572563171387, + "learning_rate": 7.265735867176837e-06, + "loss": 0.609, + "mean_token_accuracy": 0.8138938039541245, + "num_tokens": 46983377.0, + "step": 39110 + }, + { + "entropy": 1.942272038757801, + "epoch": 0.12126857719440869, + "grad_norm": 8.434643745422363, + "learning_rate": 7.264807136858832e-06, + "loss": 0.6463, + "mean_token_accuracy": 0.8089325189590454, + "num_tokens": 46996376.0, + "step": 39120 + }, + { + "entropy": 1.8001896619796753, + "epoch": 0.12129957631945838, + "grad_norm": 3.925816059112549, + "learning_rate": 7.2638787625899185e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8355321779847145, + "num_tokens": 47010314.0, + "step": 39130 + }, + { + "entropy": 1.9362492710351944, + "epoch": 0.12133057544450808, + "grad_norm": 7.411516189575195, + "learning_rate": 7.26295074414266e-06, + "loss": 0.5909, + "mean_token_accuracy": 0.8266261234879494, + "num_tokens": 47022279.0, + "step": 39140 + }, + { + "entropy": 1.892134927213192, + "epoch": 0.12136157456955778, + "grad_norm": 7.244060039520264, + "learning_rate": 7.262023081289816e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8465871244668961, + "num_tokens": 47034156.0, + "step": 39150 + }, + { + "entropy": 1.9123440355062484, + "epoch": 0.12139257369460747, + "grad_norm": 8.467802047729492, + "learning_rate": 7.261095773804354e-06, + "loss": 0.5257, + "mean_token_accuracy": 0.8312965899705886, + "num_tokens": 47045539.0, + "step": 39160 + }, + { + "entropy": 1.909021759033203, + "epoch": 0.12142357281965717, + "grad_norm": 8.491313934326172, + "learning_rate": 7.260168821459445e-06, + "loss": 0.5967, + "mean_token_accuracy": 0.809444610774517, + "num_tokens": 47057991.0, + "step": 39170 + }, + { + "entropy": 1.8988661482930183, + "epoch": 0.12145457194470687, + "grad_norm": 8.673091888427734, + "learning_rate": 7.259242224028456e-06, + "loss": 0.5633, + "mean_token_accuracy": 0.824192288517952, + "num_tokens": 47069487.0, + "step": 39180 + }, + { + "entropy": 1.8936348468065263, + "epoch": 0.12148557106975656, + "grad_norm": 7.909213066101074, + "learning_rate": 7.258315981284962e-06, + "loss": 0.5449, + "mean_token_accuracy": 0.8279350861907006, + "num_tokens": 47081241.0, + "step": 39190 + }, + { + "entropy": 1.887278787791729, + "epoch": 0.12151657019480624, + "grad_norm": 9.825845718383789, + "learning_rate": 7.2573900930027416e-06, + "loss": 0.5607, + "mean_token_accuracy": 0.82880117893219, + "num_tokens": 47093264.0, + "step": 39200 + }, + { + "entropy": 1.8978556409478187, + "epoch": 0.12154756931985594, + "grad_norm": 9.518089294433594, + "learning_rate": 7.25646455895577e-06, + "loss": 0.5479, + "mean_token_accuracy": 0.8237947776913643, + "num_tokens": 47105376.0, + "step": 39210 + }, + { + "entropy": 1.8898191466927527, + "epoch": 0.12157856844490564, + "grad_norm": 3.807635545730591, + "learning_rate": 7.255539378918229e-06, + "loss": 0.5214, + "mean_token_accuracy": 0.8275874778628349, + "num_tokens": 47117645.0, + "step": 39220 + }, + { + "entropy": 1.8876211807131766, + "epoch": 0.12160956756995533, + "grad_norm": 5.313508033752441, + "learning_rate": 7.254614552664499e-06, + "loss": 0.5445, + "mean_token_accuracy": 0.815770597755909, + "num_tokens": 47129805.0, + "step": 39230 + }, + { + "entropy": 1.89053615629673, + "epoch": 0.12164056669500503, + "grad_norm": 9.875513076782227, + "learning_rate": 7.253690079969162e-06, + "loss": 0.5415, + "mean_token_accuracy": 0.8238019242882728, + "num_tokens": 47142055.0, + "step": 39240 + }, + { + "entropy": 1.900741669535637, + "epoch": 0.12167156582005473, + "grad_norm": 4.167596340179443, + "learning_rate": 7.252765960607002e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8360105812549591, + "num_tokens": 47154609.0, + "step": 39250 + }, + { + "entropy": 1.8273233488202094, + "epoch": 0.12170256494510442, + "grad_norm": 9.690666198730469, + "learning_rate": 7.251842194353004e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8299174129962921, + "num_tokens": 47167381.0, + "step": 39260 + }, + { + "entropy": 1.9333147034049034, + "epoch": 0.12173356407015412, + "grad_norm": 3.9693877696990967, + "learning_rate": 7.25091878098235e-06, + "loss": 0.5415, + "mean_token_accuracy": 0.8288700088858605, + "num_tokens": 47179138.0, + "step": 39270 + }, + { + "entropy": 1.8680218786001206, + "epoch": 0.12176456319520382, + "grad_norm": 3.812345504760742, + "learning_rate": 7.249995720270428e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8331614285707474, + "num_tokens": 47192026.0, + "step": 39280 + }, + { + "entropy": 1.8831014022231103, + "epoch": 0.12179556232025351, + "grad_norm": 9.065089225769043, + "learning_rate": 7.249073011992822e-06, + "loss": 0.5455, + "mean_token_accuracy": 0.8253825202584266, + "num_tokens": 47204601.0, + "step": 39290 + }, + { + "entropy": 1.878520594537258, + "epoch": 0.12182656144530321, + "grad_norm": 8.888299942016602, + "learning_rate": 7.248150655925318e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8410759374499321, + "num_tokens": 47217084.0, + "step": 39300 + }, + { + "entropy": 1.9088293090462685, + "epoch": 0.12185756057035291, + "grad_norm": 8.316096305847168, + "learning_rate": 7.247228651843902e-06, + "loss": 0.5312, + "mean_token_accuracy": 0.8348750114440918, + "num_tokens": 47229268.0, + "step": 39310 + }, + { + "entropy": 1.9051673859357834, + "epoch": 0.1218885596954026, + "grad_norm": 8.587472915649414, + "learning_rate": 7.246306999524752e-06, + "loss": 0.5794, + "mean_token_accuracy": 0.8288885727524757, + "num_tokens": 47241225.0, + "step": 39320 + }, + { + "entropy": 1.970414823293686, + "epoch": 0.12191955882045229, + "grad_norm": 8.662064552307129, + "learning_rate": 7.2453856987442604e-06, + "loss": 0.5843, + "mean_token_accuracy": 0.8233033016324043, + "num_tokens": 47251991.0, + "step": 39330 + }, + { + "entropy": 1.8904896154999733, + "epoch": 0.12195055794550198, + "grad_norm": 8.939061164855957, + "learning_rate": 7.244464749279004e-06, + "loss": 0.547, + "mean_token_accuracy": 0.8356327429413796, + "num_tokens": 47264187.0, + "step": 39340 + }, + { + "entropy": 2.0048918604850767, + "epoch": 0.12198155707055168, + "grad_norm": 7.971733570098877, + "learning_rate": 7.243544150905766e-06, + "loss": 0.6107, + "mean_token_accuracy": 0.8271526664495468, + "num_tokens": 47275694.0, + "step": 39350 + }, + { + "entropy": 2.0341754078865053, + "epoch": 0.12201255619560138, + "grad_norm": 8.931192398071289, + "learning_rate": 7.242623903401524e-06, + "loss": 0.5592, + "mean_token_accuracy": 0.8209575936198235, + "num_tokens": 47286270.0, + "step": 39360 + }, + { + "entropy": 1.960363219678402, + "epoch": 0.12204355532065107, + "grad_norm": 8.31447696685791, + "learning_rate": 7.241704006543459e-06, + "loss": 0.5269, + "mean_token_accuracy": 0.8278711885213852, + "num_tokens": 47297957.0, + "step": 39370 + }, + { + "entropy": 1.9565995991230012, + "epoch": 0.12207455444570077, + "grad_norm": 10.019770622253418, + "learning_rate": 7.240784460108944e-06, + "loss": 0.533, + "mean_token_accuracy": 0.830487422645092, + "num_tokens": 47310043.0, + "step": 39380 + }, + { + "entropy": 1.8893634766340255, + "epoch": 0.12210555357075047, + "grad_norm": 5.0957865715026855, + "learning_rate": 7.239865263875553e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8399478271603584, + "num_tokens": 47322846.0, + "step": 39390 + }, + { + "entropy": 1.9584024906158448, + "epoch": 0.12213655269580016, + "grad_norm": 9.545947074890137, + "learning_rate": 7.2389464176210585e-06, + "loss": 0.5554, + "mean_token_accuracy": 0.8273420795798302, + "num_tokens": 47333837.0, + "step": 39400 + }, + { + "entropy": 1.9519995272159576, + "epoch": 0.12216755182084986, + "grad_norm": 6.452544212341309, + "learning_rate": 7.2380279211234304e-06, + "loss": 0.5955, + "mean_token_accuracy": 0.8233728304505348, + "num_tokens": 47345921.0, + "step": 39410 + }, + { + "entropy": 1.9542613357305527, + "epoch": 0.12219855094589956, + "grad_norm": 9.183480262756348, + "learning_rate": 7.237109774160834e-06, + "loss": 0.5779, + "mean_token_accuracy": 0.8244384691119194, + "num_tokens": 47357133.0, + "step": 39420 + }, + { + "entropy": 2.000173199176788, + "epoch": 0.12222955007094925, + "grad_norm": 10.723604202270508, + "learning_rate": 7.236191976511631e-06, + "loss": 0.5979, + "mean_token_accuracy": 0.823377488553524, + "num_tokens": 47367446.0, + "step": 39430 + }, + { + "entropy": 2.0126789212226868, + "epoch": 0.12226054919599895, + "grad_norm": 9.821727752685547, + "learning_rate": 7.235274527954382e-06, + "loss": 0.636, + "mean_token_accuracy": 0.8074411496520042, + "num_tokens": 47378541.0, + "step": 39440 + }, + { + "entropy": 1.9093073785305024, + "epoch": 0.12229154832104863, + "grad_norm": 9.164787292480469, + "learning_rate": 7.234357428267842e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.8361682832241059, + "num_tokens": 47390114.0, + "step": 39450 + }, + { + "entropy": 1.8918178245425223, + "epoch": 0.12232254744609833, + "grad_norm": 9.560025215148926, + "learning_rate": 7.233440677230964e-06, + "loss": 0.5231, + "mean_token_accuracy": 0.8367931827902794, + "num_tokens": 47402972.0, + "step": 39460 + }, + { + "entropy": 1.9258590131998061, + "epoch": 0.12235354657114803, + "grad_norm": 8.87900161743164, + "learning_rate": 7.232524274622897e-06, + "loss": 0.5538, + "mean_token_accuracy": 0.8345812693238258, + "num_tokens": 47414317.0, + "step": 39470 + }, + { + "entropy": 1.8653698325157166, + "epoch": 0.12238454569619772, + "grad_norm": 9.456033706665039, + "learning_rate": 7.231608220222983e-06, + "loss": 0.5338, + "mean_token_accuracy": 0.8317572355270386, + "num_tokens": 47426321.0, + "step": 39480 + }, + { + "entropy": 1.9535607546567917, + "epoch": 0.12241554482124742, + "grad_norm": 4.313974857330322, + "learning_rate": 7.230692513810767e-06, + "loss": 0.5645, + "mean_token_accuracy": 0.8326726511120797, + "num_tokens": 47437625.0, + "step": 39490 + }, + { + "entropy": 1.8951896831393242, + "epoch": 0.12244654394629712, + "grad_norm": 8.987655639648438, + "learning_rate": 7.229777155165975e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8377861216664314, + "num_tokens": 47450365.0, + "step": 39500 + }, + { + "entropy": 1.9469685688614846, + "epoch": 0.12247754307134681, + "grad_norm": 8.353129386901855, + "learning_rate": 7.228862144068547e-06, + "loss": 0.5508, + "mean_token_accuracy": 0.828496278822422, + "num_tokens": 47462011.0, + "step": 39510 + }, + { + "entropy": 1.8816884592175485, + "epoch": 0.12250854219639651, + "grad_norm": 8.542236328125, + "learning_rate": 7.227947480298603e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8291819423437119, + "num_tokens": 47474823.0, + "step": 39520 + }, + { + "entropy": 1.8621670812368394, + "epoch": 0.1225395413214462, + "grad_norm": 8.68166446685791, + "learning_rate": 7.227033163636462e-06, + "loss": 0.5437, + "mean_token_accuracy": 0.8337508887052536, + "num_tokens": 47487106.0, + "step": 39530 + }, + { + "entropy": 1.9243188560009004, + "epoch": 0.1225705404464959, + "grad_norm": 9.343770027160645, + "learning_rate": 7.226119193862641e-06, + "loss": 0.5637, + "mean_token_accuracy": 0.8281685680150985, + "num_tokens": 47498743.0, + "step": 39540 + }, + { + "entropy": 1.9087723910808563, + "epoch": 0.1226015395715456, + "grad_norm": 10.595560073852539, + "learning_rate": 7.225205570757848e-06, + "loss": 0.5472, + "mean_token_accuracy": 0.8282433733344078, + "num_tokens": 47511087.0, + "step": 39550 + }, + { + "entropy": 1.9597302109003067, + "epoch": 0.1226325386965953, + "grad_norm": 9.186569213867188, + "learning_rate": 7.224292294102985e-06, + "loss": 0.6176, + "mean_token_accuracy": 0.8227703988552093, + "num_tokens": 47522853.0, + "step": 39560 + }, + { + "entropy": 1.8456875741481782, + "epoch": 0.12266353782164498, + "grad_norm": 4.475244998931885, + "learning_rate": 7.223379363679148e-06, + "loss": 0.5637, + "mean_token_accuracy": 0.8218344271183013, + "num_tokens": 47535932.0, + "step": 39570 + }, + { + "entropy": 1.8878474682569504, + "epoch": 0.12269453694669467, + "grad_norm": 8.224106788635254, + "learning_rate": 7.222466779267628e-06, + "loss": 0.6166, + "mean_token_accuracy": 0.8177327990531922, + "num_tokens": 47547596.0, + "step": 39580 + }, + { + "entropy": 1.9054535388946534, + "epoch": 0.12272553607174437, + "grad_norm": 9.366242408752441, + "learning_rate": 7.221554540649909e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.8298028498888016, + "num_tokens": 47560679.0, + "step": 39590 + }, + { + "entropy": 1.9364402756094932, + "epoch": 0.12275653519679407, + "grad_norm": 9.393874168395996, + "learning_rate": 7.220642647607665e-06, + "loss": 0.5802, + "mean_token_accuracy": 0.8262747645378112, + "num_tokens": 47572344.0, + "step": 39600 + }, + { + "entropy": 1.904061770439148, + "epoch": 0.12278753432184376, + "grad_norm": 11.156982421875, + "learning_rate": 7.219731099922768e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.8359477326273919, + "num_tokens": 47583272.0, + "step": 39610 + }, + { + "entropy": 1.8150178447365761, + "epoch": 0.12281853344689346, + "grad_norm": 8.800067901611328, + "learning_rate": 7.218819897377277e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.84474868029356, + "num_tokens": 47596939.0, + "step": 39620 + }, + { + "entropy": 1.7370637387037278, + "epoch": 0.12284953257194316, + "grad_norm": 8.489538192749023, + "learning_rate": 7.217909039753451e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8503855854272843, + "num_tokens": 47610478.0, + "step": 39630 + }, + { + "entropy": 1.8361937582492829, + "epoch": 0.12288053169699285, + "grad_norm": 10.117167472839355, + "learning_rate": 7.216998526833735e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.8379788339138031, + "num_tokens": 47623032.0, + "step": 39640 + }, + { + "entropy": 1.880786618590355, + "epoch": 0.12291153082204255, + "grad_norm": 8.611593246459961, + "learning_rate": 7.216088358400767e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.841580656170845, + "num_tokens": 47634664.0, + "step": 39650 + }, + { + "entropy": 1.8434013739228248, + "epoch": 0.12294252994709225, + "grad_norm": 11.259652137756348, + "learning_rate": 7.2151785342373795e-06, + "loss": 0.544, + "mean_token_accuracy": 0.8298772439360619, + "num_tokens": 47646858.0, + "step": 39660 + }, + { + "entropy": 1.882999302446842, + "epoch": 0.12297352907214194, + "grad_norm": 9.449716567993164, + "learning_rate": 7.214269054126593e-06, + "loss": 0.6034, + "mean_token_accuracy": 0.8209249958395958, + "num_tokens": 47658958.0, + "step": 39670 + }, + { + "entropy": 1.9060813054442405, + "epoch": 0.12300452819719164, + "grad_norm": 4.450017929077148, + "learning_rate": 7.2133599178516235e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.8366493061184883, + "num_tokens": 47670981.0, + "step": 39680 + }, + { + "entropy": 2.007936453819275, + "epoch": 0.12303552732224134, + "grad_norm": 10.02921199798584, + "learning_rate": 7.212451125195874e-06, + "loss": 0.6157, + "mean_token_accuracy": 0.8246709734201432, + "num_tokens": 47681702.0, + "step": 39690 + }, + { + "entropy": 2.0430832996964456, + "epoch": 0.12306652644729102, + "grad_norm": 9.025290489196777, + "learning_rate": 7.211542675942941e-06, + "loss": 0.6219, + "mean_token_accuracy": 0.816537082195282, + "num_tokens": 47692748.0, + "step": 39700 + }, + { + "entropy": 1.9663964182138443, + "epoch": 0.12309752557234072, + "grad_norm": 10.015572547912598, + "learning_rate": 7.2106345698766134e-06, + "loss": 0.5715, + "mean_token_accuracy": 0.814094303548336, + "num_tokens": 47703817.0, + "step": 39710 + }, + { + "entropy": 1.793771331012249, + "epoch": 0.12312852469739041, + "grad_norm": 3.0519356727600098, + "learning_rate": 7.209726806780866e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8429444998502731, + "num_tokens": 47717762.0, + "step": 39720 + }, + { + "entropy": 1.7859131276607514, + "epoch": 0.12315952382244011, + "grad_norm": 3.6266133785247803, + "learning_rate": 7.208819386439868e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8404285296797752, + "num_tokens": 47731173.0, + "step": 39730 + }, + { + "entropy": 1.8794568166136743, + "epoch": 0.1231905229474898, + "grad_norm": 4.375041484832764, + "learning_rate": 7.207912308637975e-06, + "loss": 0.5168, + "mean_token_accuracy": 0.8297057136893272, + "num_tokens": 47743900.0, + "step": 39740 + }, + { + "entropy": 1.9013297393918038, + "epoch": 0.1232215220725395, + "grad_norm": 9.238868713378906, + "learning_rate": 7.207005573159738e-06, + "loss": 0.5438, + "mean_token_accuracy": 0.8304916322231293, + "num_tokens": 47755784.0, + "step": 39750 + }, + { + "entropy": 1.9574954420328141, + "epoch": 0.1232525211975892, + "grad_norm": 10.237153053283691, + "learning_rate": 7.2060991797898904e-06, + "loss": 0.5814, + "mean_token_accuracy": 0.8244188159704209, + "num_tokens": 47766731.0, + "step": 39760 + }, + { + "entropy": 1.8875176712870598, + "epoch": 0.1232835203226389, + "grad_norm": 10.606158256530762, + "learning_rate": 7.205193128313362e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.8495481833815575, + "num_tokens": 47778152.0, + "step": 39770 + }, + { + "entropy": 1.9175768822431565, + "epoch": 0.12331451944768859, + "grad_norm": 8.000387191772461, + "learning_rate": 7.204287418515269e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8478017941117286, + "num_tokens": 47788937.0, + "step": 39780 + }, + { + "entropy": 1.8297215834259988, + "epoch": 0.12334551857273829, + "grad_norm": 10.035210609436035, + "learning_rate": 7.203382050180914e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8538377195596695, + "num_tokens": 47801439.0, + "step": 39790 + }, + { + "entropy": 1.9494767993688584, + "epoch": 0.12337651769778799, + "grad_norm": 9.916494369506836, + "learning_rate": 7.202477023095793e-06, + "loss": 0.5851, + "mean_token_accuracy": 0.8265805870294571, + "num_tokens": 47811893.0, + "step": 39800 + }, + { + "entropy": 1.8632332697510718, + "epoch": 0.12340751682283768, + "grad_norm": 8.924822807312012, + "learning_rate": 7.201572337045587e-06, + "loss": 0.5439, + "mean_token_accuracy": 0.835116508603096, + "num_tokens": 47823892.0, + "step": 39810 + }, + { + "entropy": 1.8363080993294716, + "epoch": 0.12343851594788736, + "grad_norm": 4.758782863616943, + "learning_rate": 7.200667991816167e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8292215019464493, + "num_tokens": 47835997.0, + "step": 39820 + }, + { + "entropy": 1.9205212816596031, + "epoch": 0.12346951507293706, + "grad_norm": 8.591593742370605, + "learning_rate": 7.199763987193592e-06, + "loss": 0.5566, + "mean_token_accuracy": 0.8230644062161445, + "num_tokens": 47847772.0, + "step": 39830 + }, + { + "entropy": 1.9247150912880897, + "epoch": 0.12350051419798676, + "grad_norm": 10.074462890625, + "learning_rate": 7.19886032296411e-06, + "loss": 0.57, + "mean_token_accuracy": 0.8206574454903602, + "num_tokens": 47859033.0, + "step": 39840 + }, + { + "entropy": 1.9125849097967147, + "epoch": 0.12353151332303645, + "grad_norm": 9.70457649230957, + "learning_rate": 7.197956998914155e-06, + "loss": 0.574, + "mean_token_accuracy": 0.82592853307724, + "num_tokens": 47870577.0, + "step": 39850 + }, + { + "entropy": 1.92913771122694, + "epoch": 0.12356251244808615, + "grad_norm": 8.795923233032227, + "learning_rate": 7.197054014830351e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.8308982938528061, + "num_tokens": 47883093.0, + "step": 39860 + }, + { + "entropy": 1.8687468573451043, + "epoch": 0.12359351157313585, + "grad_norm": 9.503414154052734, + "learning_rate": 7.196151370499505e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.8317637592554092, + "num_tokens": 47895551.0, + "step": 39870 + }, + { + "entropy": 1.969892618060112, + "epoch": 0.12362451069818554, + "grad_norm": 9.0645112991333, + "learning_rate": 7.195249065708615e-06, + "loss": 0.6059, + "mean_token_accuracy": 0.818208034336567, + "num_tokens": 47906405.0, + "step": 39880 + }, + { + "entropy": 1.8821232318878174, + "epoch": 0.12365550982323524, + "grad_norm": 8.767228126525879, + "learning_rate": 7.194347100244863e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8397012487053871, + "num_tokens": 47918131.0, + "step": 39890 + }, + { + "entropy": 1.8662964954972268, + "epoch": 0.12368650894828494, + "grad_norm": 9.380695343017578, + "learning_rate": 7.1934454738956235e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.8346205174922943, + "num_tokens": 47929900.0, + "step": 39900 + }, + { + "entropy": 1.9048209875822066, + "epoch": 0.12371750807333463, + "grad_norm": 7.6859354972839355, + "learning_rate": 7.19254418644845e-06, + "loss": 0.5487, + "mean_token_accuracy": 0.8317638859152794, + "num_tokens": 47941788.0, + "step": 39910 + }, + { + "entropy": 1.9060806199908256, + "epoch": 0.12374850719838433, + "grad_norm": 14.849568367004395, + "learning_rate": 7.1916432376910865e-06, + "loss": 0.5648, + "mean_token_accuracy": 0.8300574287772179, + "num_tokens": 47953554.0, + "step": 39920 + }, + { + "entropy": 1.8868515014648437, + "epoch": 0.12377950632343403, + "grad_norm": 8.418484687805176, + "learning_rate": 7.19074262741146e-06, + "loss": 0.5313, + "mean_token_accuracy": 0.8337209805846214, + "num_tokens": 47965620.0, + "step": 39930 + }, + { + "entropy": 1.9150093987584114, + "epoch": 0.12381050544848371, + "grad_norm": 8.23653793334961, + "learning_rate": 7.18984235539769e-06, + "loss": 0.5473, + "mean_token_accuracy": 0.8326784566044807, + "num_tokens": 47977186.0, + "step": 39940 + }, + { + "entropy": 1.8675342485308648, + "epoch": 0.1238415045735334, + "grad_norm": 8.780365943908691, + "learning_rate": 7.188942421438074e-06, + "loss": 0.5194, + "mean_token_accuracy": 0.8340345472097397, + "num_tokens": 47988962.0, + "step": 39950 + }, + { + "entropy": 1.8479683220386505, + "epoch": 0.1238725036985831, + "grad_norm": 9.722196578979492, + "learning_rate": 7.188042825321099e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8346738666296005, + "num_tokens": 48001542.0, + "step": 39960 + }, + { + "entropy": 1.9192697882652283, + "epoch": 0.1239035028236328, + "grad_norm": 9.703927993774414, + "learning_rate": 7.187143566835436e-06, + "loss": 0.5948, + "mean_token_accuracy": 0.8207048952579499, + "num_tokens": 48013716.0, + "step": 39970 + }, + { + "entropy": 1.9120296582579612, + "epoch": 0.1239345019486825, + "grad_norm": 8.731795310974121, + "learning_rate": 7.186244645769942e-06, + "loss": 0.5596, + "mean_token_accuracy": 0.8310200199484825, + "num_tokens": 48026045.0, + "step": 39980 + }, + { + "entropy": 1.9499492287635802, + "epoch": 0.12396550107373219, + "grad_norm": 7.662987232208252, + "learning_rate": 7.185346061913657e-06, + "loss": 0.5668, + "mean_token_accuracy": 0.8196876659989357, + "num_tokens": 48037456.0, + "step": 39990 + }, + { + "entropy": 1.832308356463909, + "epoch": 0.12399650019878189, + "grad_norm": 3.518549919128418, + "learning_rate": 7.18444781505581e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8477145195007324, + "num_tokens": 48049861.0, + "step": 40000 + }, + { + "entropy": 1.86460652500391, + "epoch": 0.12402749932383159, + "grad_norm": 8.867399215698242, + "learning_rate": 7.183549904985806e-06, + "loss": 0.5642, + "mean_token_accuracy": 0.82719586789608, + "num_tokens": 48062011.0, + "step": 40010 + }, + { + "entropy": 1.880849677324295, + "epoch": 0.12405849844888128, + "grad_norm": 4.325946807861328, + "learning_rate": 7.182652331493244e-06, + "loss": 0.5531, + "mean_token_accuracy": 0.8259584322571755, + "num_tokens": 48074386.0, + "step": 40020 + }, + { + "entropy": 1.9078103736042977, + "epoch": 0.12408949757393098, + "grad_norm": 10.76919937133789, + "learning_rate": 7.181755094367901e-06, + "loss": 0.507, + "mean_token_accuracy": 0.8391536310315132, + "num_tokens": 48085967.0, + "step": 40030 + }, + { + "entropy": 1.8335755363106727, + "epoch": 0.12412049669898068, + "grad_norm": 12.115399360656738, + "learning_rate": 7.1808581933997365e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.833656270802021, + "num_tokens": 48098343.0, + "step": 40040 + }, + { + "entropy": 1.8345079183578492, + "epoch": 0.12415149582403037, + "grad_norm": 9.696667671203613, + "learning_rate": 7.1799616283789e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8341936364769935, + "num_tokens": 48111160.0, + "step": 40050 + }, + { + "entropy": 1.7580282986164093, + "epoch": 0.12418249494908007, + "grad_norm": 8.78324031829834, + "learning_rate": 7.179065399095719e-06, + "loss": 0.5413, + "mean_token_accuracy": 0.8408230692148209, + "num_tokens": 48125547.0, + "step": 40060 + }, + { + "entropy": 1.8306955844163895, + "epoch": 0.12421349407412975, + "grad_norm": 9.833135604858398, + "learning_rate": 7.178169505340706e-06, + "loss": 0.5345, + "mean_token_accuracy": 0.8345167860388756, + "num_tokens": 48137633.0, + "step": 40070 + }, + { + "entropy": 1.819261023402214, + "epoch": 0.12424449319917945, + "grad_norm": 13.903244972229004, + "learning_rate": 7.177273946904556e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8336721554398536, + "num_tokens": 48151380.0, + "step": 40080 + }, + { + "entropy": 1.8060119107365609, + "epoch": 0.12427549232422914, + "grad_norm": 3.684872627258301, + "learning_rate": 7.176378723578145e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8292297214269638, + "num_tokens": 48164473.0, + "step": 40090 + }, + { + "entropy": 1.8856921419501305, + "epoch": 0.12430649144927884, + "grad_norm": 10.007135391235352, + "learning_rate": 7.175483835152539e-06, + "loss": 0.5491, + "mean_token_accuracy": 0.8323625057935715, + "num_tokens": 48176696.0, + "step": 40100 + }, + { + "entropy": 1.7466407373547554, + "epoch": 0.12433749057432854, + "grad_norm": 4.418228626251221, + "learning_rate": 7.174589281418974e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8500324800610543, + "num_tokens": 48190217.0, + "step": 40110 + }, + { + "entropy": 1.9157506242394446, + "epoch": 0.12436848969937823, + "grad_norm": 8.385701179504395, + "learning_rate": 7.17369506216888e-06, + "loss": 0.5272, + "mean_token_accuracy": 0.825266569852829, + "num_tokens": 48201921.0, + "step": 40120 + }, + { + "entropy": 1.858496817946434, + "epoch": 0.12439948882442793, + "grad_norm": 4.637474060058594, + "learning_rate": 7.172801177193862e-06, + "loss": 0.5327, + "mean_token_accuracy": 0.8193719312548637, + "num_tokens": 48214518.0, + "step": 40130 + }, + { + "entropy": 1.8998822063207625, + "epoch": 0.12443048794947763, + "grad_norm": 9.965042114257812, + "learning_rate": 7.171907626285708e-06, + "loss": 0.505, + "mean_token_accuracy": 0.838045471906662, + "num_tokens": 48225714.0, + "step": 40140 + }, + { + "entropy": 1.8577251955866814, + "epoch": 0.12446148707452732, + "grad_norm": 9.38553237915039, + "learning_rate": 7.171014409236389e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.8318695619702339, + "num_tokens": 48238441.0, + "step": 40150 + }, + { + "entropy": 1.9177394479513168, + "epoch": 0.12449248619957702, + "grad_norm": 8.650186538696289, + "learning_rate": 7.1701215258380555e-06, + "loss": 0.5711, + "mean_token_accuracy": 0.8154436364769936, + "num_tokens": 48250481.0, + "step": 40160 + }, + { + "entropy": 1.9266600027680396, + "epoch": 0.12452348532462672, + "grad_norm": 8.298869132995605, + "learning_rate": 7.169228975883042e-06, + "loss": 0.5794, + "mean_token_accuracy": 0.8180478677153588, + "num_tokens": 48262220.0, + "step": 40170 + }, + { + "entropy": 1.9378298938274383, + "epoch": 0.12455448444967641, + "grad_norm": 4.73800802230835, + "learning_rate": 7.16833675916386e-06, + "loss": 0.5535, + "mean_token_accuracy": 0.8223996505141258, + "num_tokens": 48274072.0, + "step": 40180 + }, + { + "entropy": 1.8640633895993233, + "epoch": 0.1245854835747261, + "grad_norm": 10.636198043823242, + "learning_rate": 7.167444875473203e-06, + "loss": 0.5658, + "mean_token_accuracy": 0.8277820602059365, + "num_tokens": 48286360.0, + "step": 40190 + }, + { + "entropy": 1.9393206879496574, + "epoch": 0.1246164826997758, + "grad_norm": 4.20045804977417, + "learning_rate": 7.166553324603949e-06, + "loss": 0.578, + "mean_token_accuracy": 0.8166181564331054, + "num_tokens": 48297250.0, + "step": 40200 + }, + { + "entropy": 1.8666782602667809, + "epoch": 0.12464748182482549, + "grad_norm": 9.612086296081543, + "learning_rate": 7.165662106349151e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.8358882114291191, + "num_tokens": 48309504.0, + "step": 40210 + }, + { + "entropy": 1.8223158940672874, + "epoch": 0.12467848094987519, + "grad_norm": 8.702556610107422, + "learning_rate": 7.164771220502042e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.8401806324720382, + "num_tokens": 48321758.0, + "step": 40220 + }, + { + "entropy": 1.8826921790838242, + "epoch": 0.12470948007492488, + "grad_norm": 8.89100456237793, + "learning_rate": 7.16388066685604e-06, + "loss": 0.5317, + "mean_token_accuracy": 0.823623314499855, + "num_tokens": 48334423.0, + "step": 40230 + }, + { + "entropy": 1.935545490682125, + "epoch": 0.12474047919997458, + "grad_norm": 8.564839363098145, + "learning_rate": 7.16299044520474e-06, + "loss": 0.552, + "mean_token_accuracy": 0.8268688634037972, + "num_tokens": 48345619.0, + "step": 40240 + }, + { + "entropy": 1.835379946231842, + "epoch": 0.12477147832502428, + "grad_norm": 8.765735626220703, + "learning_rate": 7.162100555341913e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8396931126713753, + "num_tokens": 48358114.0, + "step": 40250 + }, + { + "entropy": 1.8680355235934258, + "epoch": 0.12480247745007397, + "grad_norm": 10.267287254333496, + "learning_rate": 7.161210997061516e-06, + "loss": 0.6458, + "mean_token_accuracy": 0.8112869113683701, + "num_tokens": 48370421.0, + "step": 40260 + }, + { + "entropy": 1.8904762849211694, + "epoch": 0.12483347657512367, + "grad_norm": 10.460432052612305, + "learning_rate": 7.1603217701576784e-06, + "loss": 0.5603, + "mean_token_accuracy": 0.8190462201833725, + "num_tokens": 48382227.0, + "step": 40270 + }, + { + "entropy": 1.8971378430724144, + "epoch": 0.12486447570017337, + "grad_norm": 7.277894020080566, + "learning_rate": 7.159432874424715e-06, + "loss": 0.5434, + "mean_token_accuracy": 0.8331350639462471, + "num_tokens": 48393629.0, + "step": 40280 + }, + { + "entropy": 1.941260239481926, + "epoch": 0.12489547482522306, + "grad_norm": 9.197176933288574, + "learning_rate": 7.158544309657114e-06, + "loss": 0.6038, + "mean_token_accuracy": 0.8198348104953765, + "num_tokens": 48404836.0, + "step": 40290 + }, + { + "entropy": 1.834352783858776, + "epoch": 0.12492647395027276, + "grad_norm": 4.567273139953613, + "learning_rate": 7.157656075649543e-06, + "loss": 0.556, + "mean_token_accuracy": 0.8348585799336433, + "num_tokens": 48416706.0, + "step": 40300 + }, + { + "entropy": 1.7823708355426788, + "epoch": 0.12495747307532244, + "grad_norm": 4.625446796417236, + "learning_rate": 7.1567681721968504e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8396563857793808, + "num_tokens": 48430018.0, + "step": 40310 + }, + { + "entropy": 1.8015142977237701, + "epoch": 0.12498847220037214, + "grad_norm": 10.068672180175781, + "learning_rate": 7.155880599094063e-06, + "loss": 0.5809, + "mean_token_accuracy": 0.8157304137945175, + "num_tokens": 48442115.0, + "step": 40320 + }, + { + "entropy": 1.8101561158895492, + "epoch": 0.12501947132542185, + "grad_norm": 9.83664321899414, + "learning_rate": 7.154993356136379e-06, + "loss": 0.5194, + "mean_token_accuracy": 0.8348063215613365, + "num_tokens": 48454395.0, + "step": 40330 + }, + { + "entropy": 1.8592476963996887, + "epoch": 0.12505047045047155, + "grad_norm": 8.112977027893066, + "learning_rate": 7.154106443119184e-06, + "loss": 0.525, + "mean_token_accuracy": 0.8234350681304932, + "num_tokens": 48466396.0, + "step": 40340 + }, + { + "entropy": 1.8576691791415214, + "epoch": 0.12508146957552124, + "grad_norm": 4.848905086517334, + "learning_rate": 7.153219859838033e-06, + "loss": 0.5681, + "mean_token_accuracy": 0.8281613394618035, + "num_tokens": 48479521.0, + "step": 40350 + }, + { + "entropy": 1.831693847477436, + "epoch": 0.12511246870057094, + "grad_norm": 7.906863689422607, + "learning_rate": 7.152333606088664e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.8364334732294083, + "num_tokens": 48492267.0, + "step": 40360 + }, + { + "entropy": 1.805116631090641, + "epoch": 0.12514346782562064, + "grad_norm": 7.8519415855407715, + "learning_rate": 7.151447681666986e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8286394074559211, + "num_tokens": 48505894.0, + "step": 40370 + }, + { + "entropy": 1.8447122775018214, + "epoch": 0.1251744669506703, + "grad_norm": 7.955524444580078, + "learning_rate": 7.150562086369092e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8388951927423477, + "num_tokens": 48518153.0, + "step": 40380 + }, + { + "entropy": 1.8237828686833382, + "epoch": 0.12520546607572, + "grad_norm": 10.909595489501953, + "learning_rate": 7.149676819991247e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.835619455575943, + "num_tokens": 48530887.0, + "step": 40390 + }, + { + "entropy": 1.9085283294320106, + "epoch": 0.1252364652007697, + "grad_norm": 9.184427261352539, + "learning_rate": 7.148791882329893e-06, + "loss": 0.5645, + "mean_token_accuracy": 0.8193691223859787, + "num_tokens": 48542762.0, + "step": 40400 + }, + { + "entropy": 1.8631555259227752, + "epoch": 0.1252674643258194, + "grad_norm": 9.436893463134766, + "learning_rate": 7.147907273181649e-06, + "loss": 0.5495, + "mean_token_accuracy": 0.8169412419199944, + "num_tokens": 48555091.0, + "step": 40410 + }, + { + "entropy": 1.836050059646368, + "epoch": 0.1252984634508691, + "grad_norm": 10.38294792175293, + "learning_rate": 7.1470229923433125e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.8353980749845504, + "num_tokens": 48567717.0, + "step": 40420 + }, + { + "entropy": 1.9533065795898437, + "epoch": 0.1253294625759188, + "grad_norm": 3.965822696685791, + "learning_rate": 7.146139039611852e-06, + "loss": 0.5775, + "mean_token_accuracy": 0.8248067319393158, + "num_tokens": 48579059.0, + "step": 40430 + }, + { + "entropy": 1.8541604191064835, + "epoch": 0.12536046170096848, + "grad_norm": 8.920310020446777, + "learning_rate": 7.1452554147844155e-06, + "loss": 0.5358, + "mean_token_accuracy": 0.8245360970497131, + "num_tokens": 48591120.0, + "step": 40440 + }, + { + "entropy": 1.8779646888375283, + "epoch": 0.12539146082601818, + "grad_norm": 8.486520767211914, + "learning_rate": 7.144372117658325e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8279622659087181, + "num_tokens": 48603019.0, + "step": 40450 + }, + { + "entropy": 1.9320873647928238, + "epoch": 0.12542245995106788, + "grad_norm": 11.511739730834961, + "learning_rate": 7.143489148031079e-06, + "loss": 0.5977, + "mean_token_accuracy": 0.8194258600473404, + "num_tokens": 48615247.0, + "step": 40460 + }, + { + "entropy": 1.9659615710377694, + "epoch": 0.12545345907611757, + "grad_norm": 9.763450622558594, + "learning_rate": 7.142606505700348e-06, + "loss": 0.6165, + "mean_token_accuracy": 0.8184546142816543, + "num_tokens": 48626590.0, + "step": 40470 + }, + { + "entropy": 1.9306480765342713, + "epoch": 0.12548445820116727, + "grad_norm": 8.713022232055664, + "learning_rate": 7.141724190463982e-06, + "loss": 0.5698, + "mean_token_accuracy": 0.8337675094604492, + "num_tokens": 48637473.0, + "step": 40480 + }, + { + "entropy": 1.798674686253071, + "epoch": 0.12551545732621697, + "grad_norm": 11.227383613586426, + "learning_rate": 7.140842202120004e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8351314097642899, + "num_tokens": 48651671.0, + "step": 40490 + }, + { + "entropy": 1.9205691695213318, + "epoch": 0.12554645645126666, + "grad_norm": 8.593749046325684, + "learning_rate": 7.139960540466611e-06, + "loss": 0.5708, + "mean_token_accuracy": 0.8320924565196037, + "num_tokens": 48662844.0, + "step": 40500 + }, + { + "entropy": 1.9018804863095284, + "epoch": 0.12557745557631636, + "grad_norm": 9.196910858154297, + "learning_rate": 7.13907920530217e-06, + "loss": 0.5599, + "mean_token_accuracy": 0.8320263445377349, + "num_tokens": 48674513.0, + "step": 40510 + }, + { + "entropy": 1.8978378668427467, + "epoch": 0.12560845470136606, + "grad_norm": 7.797028064727783, + "learning_rate": 7.138198196425235e-06, + "loss": 0.541, + "mean_token_accuracy": 0.826276271045208, + "num_tokens": 48686033.0, + "step": 40520 + }, + { + "entropy": 1.8717727780342102, + "epoch": 0.12563945382641575, + "grad_norm": 8.808572769165039, + "learning_rate": 7.137317513634519e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.827865032851696, + "num_tokens": 48698508.0, + "step": 40530 + }, + { + "entropy": 1.8457770988345146, + "epoch": 0.12567045295146545, + "grad_norm": 8.519002914428711, + "learning_rate": 7.136437156728917e-06, + "loss": 0.5143, + "mean_token_accuracy": 0.8214613869786263, + "num_tokens": 48711134.0, + "step": 40540 + }, + { + "entropy": 1.935419850051403, + "epoch": 0.12570145207651515, + "grad_norm": 8.7819185256958, + "learning_rate": 7.135557125507497e-06, + "loss": 0.52, + "mean_token_accuracy": 0.8435436561703682, + "num_tokens": 48722163.0, + "step": 40550 + }, + { + "entropy": 1.8241950839757919, + "epoch": 0.12573245120156484, + "grad_norm": 3.9725289344787598, + "learning_rate": 7.134677419769499e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8425084307789803, + "num_tokens": 48735211.0, + "step": 40560 + }, + { + "entropy": 1.8317349657416344, + "epoch": 0.12576345032661454, + "grad_norm": 3.390087842941284, + "learning_rate": 7.133798039314337e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8422825932502747, + "num_tokens": 48748014.0, + "step": 40570 + }, + { + "entropy": 1.7657338783144951, + "epoch": 0.12579444945166424, + "grad_norm": 8.908466339111328, + "learning_rate": 7.1329189839415956e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8356280282139779, + "num_tokens": 48761505.0, + "step": 40580 + }, + { + "entropy": 1.8681147009134293, + "epoch": 0.12582544857671393, + "grad_norm": 9.203214645385742, + "learning_rate": 7.132040253451038e-06, + "loss": 0.5362, + "mean_token_accuracy": 0.8317851752042771, + "num_tokens": 48773868.0, + "step": 40590 + }, + { + "entropy": 1.8376054048538208, + "epoch": 0.12585644770176363, + "grad_norm": 4.301458835601807, + "learning_rate": 7.131161847642594e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8354909658432007, + "num_tokens": 48786722.0, + "step": 40600 + }, + { + "entropy": 1.968809324502945, + "epoch": 0.12588744682681333, + "grad_norm": 9.997700691223145, + "learning_rate": 7.130283766316368e-06, + "loss": 0.6149, + "mean_token_accuracy": 0.8223230019211769, + "num_tokens": 48797626.0, + "step": 40610 + }, + { + "entropy": 1.9517834931612015, + "epoch": 0.125918445951863, + "grad_norm": 8.660726547241211, + "learning_rate": 7.1294060092726395e-06, + "loss": 0.6263, + "mean_token_accuracy": 0.8167603313922882, + "num_tokens": 48809941.0, + "step": 40620 + }, + { + "entropy": 1.854266294836998, + "epoch": 0.1259494450769127, + "grad_norm": 8.519408226013184, + "learning_rate": 7.128528576311854e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8381375819444656, + "num_tokens": 48822363.0, + "step": 40630 + }, + { + "entropy": 1.8118067890405656, + "epoch": 0.1259804442019624, + "grad_norm": 10.48330020904541, + "learning_rate": 7.127651467234633e-06, + "loss": 0.5479, + "mean_token_accuracy": 0.8347623705863952, + "num_tokens": 48835835.0, + "step": 40640 + }, + { + "entropy": 1.8399090513586998, + "epoch": 0.12601144332701208, + "grad_norm": 9.834306716918945, + "learning_rate": 7.12677468184177e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8451863750815392, + "num_tokens": 48848639.0, + "step": 40650 + }, + { + "entropy": 1.8340865150094032, + "epoch": 0.12604244245206178, + "grad_norm": 10.710108757019043, + "learning_rate": 7.125898219934229e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8310162261128425, + "num_tokens": 48861498.0, + "step": 40660 + }, + { + "entropy": 1.9059862732887267, + "epoch": 0.12607344157711148, + "grad_norm": 8.74216365814209, + "learning_rate": 7.125022081313144e-06, + "loss": 0.5565, + "mean_token_accuracy": 0.8258930623531342, + "num_tokens": 48873676.0, + "step": 40670 + }, + { + "entropy": 1.8104983791708946, + "epoch": 0.12610444070216117, + "grad_norm": 3.50832462310791, + "learning_rate": 7.124146265779823e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.8388922065496445, + "num_tokens": 48887359.0, + "step": 40680 + }, + { + "entropy": 1.886466035246849, + "epoch": 0.12613543982721087, + "grad_norm": 9.473098754882812, + "learning_rate": 7.123270773135742e-06, + "loss": 0.5395, + "mean_token_accuracy": 0.8378918841481209, + "num_tokens": 48898642.0, + "step": 40690 + }, + { + "entropy": 1.9465019404888153, + "epoch": 0.12616643895226057, + "grad_norm": 8.107137680053711, + "learning_rate": 7.12239560318255e-06, + "loss": 0.5828, + "mean_token_accuracy": 0.8316583067178727, + "num_tokens": 48909355.0, + "step": 40700 + }, + { + "entropy": 1.864406055212021, + "epoch": 0.12619743807731026, + "grad_norm": 7.923045635223389, + "learning_rate": 7.121520755722065e-06, + "loss": 0.5385, + "mean_token_accuracy": 0.8234068945050239, + "num_tokens": 48921713.0, + "step": 40710 + }, + { + "entropy": 1.917890764772892, + "epoch": 0.12622843720235996, + "grad_norm": 8.965694427490234, + "learning_rate": 7.120646230556275e-06, + "loss": 0.5322, + "mean_token_accuracy": 0.8340031638741493, + "num_tokens": 48933362.0, + "step": 40720 + }, + { + "entropy": 1.9002999886870384, + "epoch": 0.12625943632740966, + "grad_norm": 9.010135650634766, + "learning_rate": 7.119772027487341e-06, + "loss": 0.5219, + "mean_token_accuracy": 0.8289919942617416, + "num_tokens": 48945208.0, + "step": 40730 + }, + { + "entropy": 1.862901757657528, + "epoch": 0.12629043545245935, + "grad_norm": 9.43112564086914, + "learning_rate": 7.118898146317591e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.8338583365082741, + "num_tokens": 48957105.0, + "step": 40740 + }, + { + "entropy": 1.8847228810191154, + "epoch": 0.12632143457750905, + "grad_norm": 4.666851043701172, + "learning_rate": 7.118024586849524e-06, + "loss": 0.5383, + "mean_token_accuracy": 0.8274060249328613, + "num_tokens": 48970207.0, + "step": 40750 + }, + { + "entropy": 1.8614213794469834, + "epoch": 0.12635243370255875, + "grad_norm": 8.402881622314453, + "learning_rate": 7.117151348885809e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.8313090309500695, + "num_tokens": 48983272.0, + "step": 40760 + }, + { + "entropy": 1.8629606261849403, + "epoch": 0.12638343282760844, + "grad_norm": 3.4327898025512695, + "learning_rate": 7.116278432229283e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8382740557193756, + "num_tokens": 48995280.0, + "step": 40770 + }, + { + "entropy": 1.866362100839615, + "epoch": 0.12641443195265814, + "grad_norm": 6.627174377441406, + "learning_rate": 7.1154058366829534e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8405808374285698, + "num_tokens": 49007496.0, + "step": 40780 + }, + { + "entropy": 1.9819986671209335, + "epoch": 0.12644543107770784, + "grad_norm": 9.322339057922363, + "learning_rate": 7.114533562049997e-06, + "loss": 0.6276, + "mean_token_accuracy": 0.8162664338946343, + "num_tokens": 49018388.0, + "step": 40790 + }, + { + "entropy": 1.849839760363102, + "epoch": 0.12647643020275753, + "grad_norm": 3.9357082843780518, + "learning_rate": 7.113661608133757e-06, + "loss": 0.5208, + "mean_token_accuracy": 0.8274443417787551, + "num_tokens": 49030285.0, + "step": 40800 + }, + { + "entropy": 1.8829088985919953, + "epoch": 0.12650742932780723, + "grad_norm": 7.758213996887207, + "learning_rate": 7.112789974737751e-06, + "loss": 0.5549, + "mean_token_accuracy": 0.831216461956501, + "num_tokens": 49042179.0, + "step": 40810 + }, + { + "entropy": 1.9304823964834212, + "epoch": 0.12653842845285693, + "grad_norm": 8.798954963684082, + "learning_rate": 7.1119186616656555e-06, + "loss": 0.5907, + "mean_token_accuracy": 0.8195964187383652, + "num_tokens": 49054008.0, + "step": 40820 + }, + { + "entropy": 1.9207462221384048, + "epoch": 0.12656942757790662, + "grad_norm": 9.050512313842773, + "learning_rate": 7.111047668721327e-06, + "loss": 0.575, + "mean_token_accuracy": 0.8204113721847535, + "num_tokens": 49066034.0, + "step": 40830 + }, + { + "entropy": 1.908941000699997, + "epoch": 0.12660042670295632, + "grad_norm": 4.945900917053223, + "learning_rate": 7.11017699570878e-06, + "loss": 0.559, + "mean_token_accuracy": 0.8230469167232514, + "num_tokens": 49077639.0, + "step": 40840 + }, + { + "entropy": 1.8663305729627608, + "epoch": 0.12663142582800602, + "grad_norm": 8.779427528381348, + "learning_rate": 7.109306642432202e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8439178988337517, + "num_tokens": 49090125.0, + "step": 40850 + }, + { + "entropy": 1.9162701606750487, + "epoch": 0.1266624249530557, + "grad_norm": 9.195842742919922, + "learning_rate": 7.108436608695949e-06, + "loss": 0.5807, + "mean_token_accuracy": 0.8183003440499306, + "num_tokens": 49101421.0, + "step": 40860 + }, + { + "entropy": 1.9350932016968727, + "epoch": 0.12669342407810538, + "grad_norm": 9.8731689453125, + "learning_rate": 7.10756689430454e-06, + "loss": 0.6017, + "mean_token_accuracy": 0.8179444178938866, + "num_tokens": 49113375.0, + "step": 40870 + }, + { + "entropy": 1.9218543514609336, + "epoch": 0.12672442320315508, + "grad_norm": 8.230029106140137, + "learning_rate": 7.106697499062666e-06, + "loss": 0.5615, + "mean_token_accuracy": 0.8303100064396858, + "num_tokens": 49124773.0, + "step": 40880 + }, + { + "entropy": 1.8881905749440193, + "epoch": 0.12675542232820478, + "grad_norm": 5.785170555114746, + "learning_rate": 7.105828422775184e-06, + "loss": 0.5658, + "mean_token_accuracy": 0.8263862758874894, + "num_tokens": 49137292.0, + "step": 40890 + }, + { + "entropy": 1.8283898428082466, + "epoch": 0.12678642145325447, + "grad_norm": 10.07806396484375, + "learning_rate": 7.1049596652471145e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8381018027663231, + "num_tokens": 49149805.0, + "step": 40900 + }, + { + "entropy": 1.8706416577100753, + "epoch": 0.12681742057830417, + "grad_norm": 8.058819770812988, + "learning_rate": 7.104091226283651e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.84037394374609, + "num_tokens": 49160775.0, + "step": 40910 + }, + { + "entropy": 1.8797523587942124, + "epoch": 0.12684841970335387, + "grad_norm": 8.583271026611328, + "learning_rate": 7.103223105690148e-06, + "loss": 0.5492, + "mean_token_accuracy": 0.8319562718272209, + "num_tokens": 49172562.0, + "step": 40920 + }, + { + "entropy": 1.81065763682127, + "epoch": 0.12687941882840356, + "grad_norm": 9.554974555969238, + "learning_rate": 7.1023553032721315e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.839790141582489, + "num_tokens": 49185408.0, + "step": 40930 + }, + { + "entropy": 1.9385173588991165, + "epoch": 0.12691041795345326, + "grad_norm": 9.629557609558105, + "learning_rate": 7.101487818835289e-06, + "loss": 0.5379, + "mean_token_accuracy": 0.8281932666897773, + "num_tokens": 49197278.0, + "step": 40940 + }, + { + "entropy": 1.8193309232592583, + "epoch": 0.12694141707850295, + "grad_norm": 8.458683013916016, + "learning_rate": 7.100620652185476e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8422279581427574, + "num_tokens": 49210376.0, + "step": 40950 + }, + { + "entropy": 1.8436155632138251, + "epoch": 0.12697241620355265, + "grad_norm": 10.37833023071289, + "learning_rate": 7.099753803128716e-06, + "loss": 0.5406, + "mean_token_accuracy": 0.8349693238735199, + "num_tokens": 49222925.0, + "step": 40960 + }, + { + "entropy": 1.836123764514923, + "epoch": 0.12700341532860235, + "grad_norm": 9.300790786743164, + "learning_rate": 7.0988872714711934e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8327738150954247, + "num_tokens": 49235480.0, + "step": 40970 + }, + { + "entropy": 1.8801463842391968, + "epoch": 0.12703441445365204, + "grad_norm": 5.394232749938965, + "learning_rate": 7.098021057019264e-06, + "loss": 0.541, + "mean_token_accuracy": 0.8303487420082092, + "num_tokens": 49247573.0, + "step": 40980 + }, + { + "entropy": 1.8794305130839348, + "epoch": 0.12706541357870174, + "grad_norm": 9.10645580291748, + "learning_rate": 7.097155159579446e-06, + "loss": 0.52, + "mean_token_accuracy": 0.8420027121901512, + "num_tokens": 49258928.0, + "step": 40990 + }, + { + "entropy": 1.9318907380104064, + "epoch": 0.12709641270375144, + "grad_norm": 9.83584976196289, + "learning_rate": 7.09628957895842e-06, + "loss": 0.5834, + "mean_token_accuracy": 0.8214644491672516, + "num_tokens": 49270094.0, + "step": 41000 + }, + { + "entropy": 1.8540407776832581, + "epoch": 0.12712741182880113, + "grad_norm": 9.077452659606934, + "learning_rate": 7.095424314963037e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.8308973863720894, + "num_tokens": 49282965.0, + "step": 41010 + }, + { + "entropy": 1.8202777698636055, + "epoch": 0.12715841095385083, + "grad_norm": 4.704195976257324, + "learning_rate": 7.094559367400309e-06, + "loss": 0.5531, + "mean_token_accuracy": 0.8255807936191559, + "num_tokens": 49296075.0, + "step": 41020 + }, + { + "entropy": 1.9383866339921951, + "epoch": 0.12718941007890053, + "grad_norm": 9.143877983093262, + "learning_rate": 7.093694736077415e-06, + "loss": 0.5269, + "mean_token_accuracy": 0.8385669678449631, + "num_tokens": 49306680.0, + "step": 41030 + }, + { + "entropy": 1.742047442495823, + "epoch": 0.12722040920395022, + "grad_norm": 8.927555084228516, + "learning_rate": 7.092830420801696e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8529659613966942, + "num_tokens": 49320860.0, + "step": 41040 + }, + { + "entropy": 1.8793509498238563, + "epoch": 0.12725140832899992, + "grad_norm": 8.80550765991211, + "learning_rate": 7.09196642138066e-06, + "loss": 0.5804, + "mean_token_accuracy": 0.819065073132515, + "num_tokens": 49332538.0, + "step": 41050 + }, + { + "entropy": 1.8639131098985673, + "epoch": 0.12728240745404962, + "grad_norm": 8.924798965454102, + "learning_rate": 7.091102737621975e-06, + "loss": 0.5412, + "mean_token_accuracy": 0.829890587925911, + "num_tokens": 49345210.0, + "step": 41060 + }, + { + "entropy": 1.8677813604474067, + "epoch": 0.12731340657909931, + "grad_norm": 8.423454284667969, + "learning_rate": 7.0902393693334806e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8379834398627282, + "num_tokens": 49357016.0, + "step": 41070 + }, + { + "entropy": 1.9531373485922814, + "epoch": 0.127344405704149, + "grad_norm": 10.435996055603027, + "learning_rate": 7.089376316323171e-06, + "loss": 0.5963, + "mean_token_accuracy": 0.8160718679428101, + "num_tokens": 49368452.0, + "step": 41080 + }, + { + "entropy": 1.9246175095438958, + "epoch": 0.1273754048291987, + "grad_norm": 8.71939468383789, + "learning_rate": 7.088513578399207e-06, + "loss": 0.562, + "mean_token_accuracy": 0.8220181420445443, + "num_tokens": 49380204.0, + "step": 41090 + }, + { + "entropy": 1.8995565429329873, + "epoch": 0.1274064039542484, + "grad_norm": 9.864094734191895, + "learning_rate": 7.08765115536992e-06, + "loss": 0.5459, + "mean_token_accuracy": 0.8301453128457069, + "num_tokens": 49392226.0, + "step": 41100 + }, + { + "entropy": 1.900149242579937, + "epoch": 0.1274374030792981, + "grad_norm": 8.183351516723633, + "learning_rate": 7.086789047043793e-06, + "loss": 0.5465, + "mean_token_accuracy": 0.8305352702736855, + "num_tokens": 49404110.0, + "step": 41110 + }, + { + "entropy": 1.9237231731414794, + "epoch": 0.12746840220434777, + "grad_norm": 8.36330509185791, + "learning_rate": 7.08592725322948e-06, + "loss": 0.5263, + "mean_token_accuracy": 0.8335365414619446, + "num_tokens": 49416490.0, + "step": 41120 + }, + { + "entropy": 1.9189850777387618, + "epoch": 0.12749940132939747, + "grad_norm": 9.289202690124512, + "learning_rate": 7.085065773735793e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8426647335290909, + "num_tokens": 49427534.0, + "step": 41130 + }, + { + "entropy": 1.8757270485162736, + "epoch": 0.12753040045444716, + "grad_norm": 9.52793025970459, + "learning_rate": 7.084204608371712e-06, + "loss": 0.5224, + "mean_token_accuracy": 0.8288914114236832, + "num_tokens": 49439317.0, + "step": 41140 + }, + { + "entropy": 1.8509258836507798, + "epoch": 0.12756139957949686, + "grad_norm": 5.029160976409912, + "learning_rate": 7.083343756946375e-06, + "loss": 0.5584, + "mean_token_accuracy": 0.8297637715935707, + "num_tokens": 49451446.0, + "step": 41150 + }, + { + "entropy": 1.9340157762169838, + "epoch": 0.12759239870454656, + "grad_norm": 11.50483226776123, + "learning_rate": 7.082483219269084e-06, + "loss": 0.5791, + "mean_token_accuracy": 0.8197537034749984, + "num_tokens": 49463109.0, + "step": 41160 + }, + { + "entropy": 1.8594611391425133, + "epoch": 0.12762339782959625, + "grad_norm": 7.6252899169921875, + "learning_rate": 7.081622995149303e-06, + "loss": 0.5265, + "mean_token_accuracy": 0.8369016021490097, + "num_tokens": 49474727.0, + "step": 41170 + }, + { + "entropy": 1.800793182849884, + "epoch": 0.12765439695464595, + "grad_norm": 1.9972714185714722, + "learning_rate": 7.080763084396659e-06, + "loss": 0.5289, + "mean_token_accuracy": 0.8380374610424042, + "num_tokens": 49488788.0, + "step": 41180 + }, + { + "entropy": 1.9101984828710556, + "epoch": 0.12768539607969565, + "grad_norm": 10.091278076171875, + "learning_rate": 7.0799034868209375e-06, + "loss": 0.5464, + "mean_token_accuracy": 0.8259019926190376, + "num_tokens": 49501586.0, + "step": 41190 + }, + { + "entropy": 1.8499779477715492, + "epoch": 0.12771639520474534, + "grad_norm": 2.6960954666137695, + "learning_rate": 7.079044202232089e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.833436419069767, + "num_tokens": 49514195.0, + "step": 41200 + }, + { + "entropy": 1.825632943212986, + "epoch": 0.12774739432979504, + "grad_norm": 9.283682823181152, + "learning_rate": 7.078185230440225e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8395808920264244, + "num_tokens": 49525989.0, + "step": 41210 + }, + { + "entropy": 1.9196676731109619, + "epoch": 0.12777839345484474, + "grad_norm": 8.81406021118164, + "learning_rate": 7.0773265712556175e-06, + "loss": 0.6053, + "mean_token_accuracy": 0.8251327946782112, + "num_tokens": 49536614.0, + "step": 41220 + }, + { + "entropy": 1.8985184118151666, + "epoch": 0.12780939257989443, + "grad_norm": 9.597168922424316, + "learning_rate": 7.076468224488697e-06, + "loss": 0.5692, + "mean_token_accuracy": 0.821132518351078, + "num_tokens": 49548013.0, + "step": 41230 + }, + { + "entropy": 1.8932278618216514, + "epoch": 0.12784039170494413, + "grad_norm": 8.59653091430664, + "learning_rate": 7.075610189950059e-06, + "loss": 0.5927, + "mean_token_accuracy": 0.817491953074932, + "num_tokens": 49559982.0, + "step": 41240 + }, + { + "entropy": 1.9004715710878373, + "epoch": 0.12787139082999383, + "grad_norm": 9.525247573852539, + "learning_rate": 7.074752467450462e-06, + "loss": 0.5614, + "mean_token_accuracy": 0.8346248850226402, + "num_tokens": 49571257.0, + "step": 41250 + }, + { + "entropy": 1.9199392378330231, + "epoch": 0.12790238995504352, + "grad_norm": 9.400187492370605, + "learning_rate": 7.073895056800815e-06, + "loss": 0.5788, + "mean_token_accuracy": 0.8198289066553116, + "num_tokens": 49582933.0, + "step": 41260 + }, + { + "entropy": 1.9200360536575318, + "epoch": 0.12793338908009322, + "grad_norm": 8.804792404174805, + "learning_rate": 7.0730379578121956e-06, + "loss": 0.5771, + "mean_token_accuracy": 0.8220379948616028, + "num_tokens": 49594138.0, + "step": 41270 + }, + { + "entropy": 1.9512885123491288, + "epoch": 0.12796438820514291, + "grad_norm": 10.201251983642578, + "learning_rate": 7.0721811702958406e-06, + "loss": 0.5598, + "mean_token_accuracy": 0.8327943772077561, + "num_tokens": 49604773.0, + "step": 41280 + }, + { + "entropy": 1.9171435490250588, + "epoch": 0.1279953873301926, + "grad_norm": 10.569693565368652, + "learning_rate": 7.071324694063147e-06, + "loss": 0.5814, + "mean_token_accuracy": 0.8239866450428963, + "num_tokens": 49615737.0, + "step": 41290 + }, + { + "entropy": 1.8590099811553955, + "epoch": 0.1280263864552423, + "grad_norm": 11.221964836120605, + "learning_rate": 7.070468528925668e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.8409821361303329, + "num_tokens": 49627714.0, + "step": 41300 + }, + { + "entropy": 1.842246988415718, + "epoch": 0.128057385580292, + "grad_norm": 3.050896406173706, + "learning_rate": 7.06961267469512e-06, + "loss": 0.5573, + "mean_token_accuracy": 0.8346633806824684, + "num_tokens": 49640153.0, + "step": 41310 + }, + { + "entropy": 1.8636904895305633, + "epoch": 0.1280883847053417, + "grad_norm": 8.947188377380371, + "learning_rate": 7.068757131183378e-06, + "loss": 0.5331, + "mean_token_accuracy": 0.8275300249457359, + "num_tokens": 49652387.0, + "step": 41320 + }, + { + "entropy": 1.8928270608186721, + "epoch": 0.1281193838303914, + "grad_norm": 9.10622501373291, + "learning_rate": 7.067901898202475e-06, + "loss": 0.5765, + "mean_token_accuracy": 0.8179589688777924, + "num_tokens": 49664065.0, + "step": 41330 + }, + { + "entropy": 1.9435312688350677, + "epoch": 0.1281503829554411, + "grad_norm": 8.725778579711914, + "learning_rate": 7.067046975564605e-06, + "loss": 0.6324, + "mean_token_accuracy": 0.8186397060751915, + "num_tokens": 49674905.0, + "step": 41340 + }, + { + "entropy": 1.9112671226263047, + "epoch": 0.1281813820804908, + "grad_norm": 8.562727928161621, + "learning_rate": 7.066192363082123e-06, + "loss": 0.5768, + "mean_token_accuracy": 0.8293170660734177, + "num_tokens": 49686743.0, + "step": 41350 + }, + { + "entropy": 1.8668039426207543, + "epoch": 0.12821238120554046, + "grad_norm": 8.329222679138184, + "learning_rate": 7.0653380605675344e-06, + "loss": 0.5288, + "mean_token_accuracy": 0.8435063764452935, + "num_tokens": 49697557.0, + "step": 41360 + }, + { + "entropy": 1.8395963311195374, + "epoch": 0.12824338033059016, + "grad_norm": 2.403559684753418, + "learning_rate": 7.064484067833515e-06, + "loss": 0.5249, + "mean_token_accuracy": 0.8410920441150666, + "num_tokens": 49710159.0, + "step": 41370 + }, + { + "entropy": 1.9411360412836074, + "epoch": 0.12827437945563985, + "grad_norm": 8.784658432006836, + "learning_rate": 7.063630384692888e-06, + "loss": 0.6002, + "mean_token_accuracy": 0.824271696805954, + "num_tokens": 49720451.0, + "step": 41380 + }, + { + "entropy": 1.8381685689091682, + "epoch": 0.12830537858068955, + "grad_norm": 7.8702497482299805, + "learning_rate": 7.062777010958642e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8503654897212982, + "num_tokens": 49732913.0, + "step": 41390 + }, + { + "entropy": 1.9066902339458465, + "epoch": 0.12833637770573925, + "grad_norm": 8.476966857910156, + "learning_rate": 7.061923946443923e-06, + "loss": 0.5498, + "mean_token_accuracy": 0.8252906069159508, + "num_tokens": 49744489.0, + "step": 41400 + }, + { + "entropy": 1.8539624221622943, + "epoch": 0.12836737683078894, + "grad_norm": 9.746798515319824, + "learning_rate": 7.061071190962031e-06, + "loss": 0.5353, + "mean_token_accuracy": 0.8306064233183861, + "num_tokens": 49757061.0, + "step": 41410 + }, + { + "entropy": 1.8506083533167839, + "epoch": 0.12839837595583864, + "grad_norm": 9.114572525024414, + "learning_rate": 7.060218744326428e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8507398188114166, + "num_tokens": 49769261.0, + "step": 41420 + }, + { + "entropy": 1.8710011199116707, + "epoch": 0.12842937508088834, + "grad_norm": 4.284439563751221, + "learning_rate": 7.059366606350731e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.8323528245091438, + "num_tokens": 49781442.0, + "step": 41430 + }, + { + "entropy": 1.9052658289670945, + "epoch": 0.12846037420593803, + "grad_norm": 8.909741401672363, + "learning_rate": 7.0585147768487165e-06, + "loss": 0.6048, + "mean_token_accuracy": 0.825349123775959, + "num_tokens": 49794445.0, + "step": 41440 + }, + { + "entropy": 1.911386439204216, + "epoch": 0.12849137333098773, + "grad_norm": 9.72156810760498, + "learning_rate": 7.057663255634316e-06, + "loss": 0.5451, + "mean_token_accuracy": 0.8314669832587243, + "num_tokens": 49806414.0, + "step": 41450 + }, + { + "entropy": 1.8788209095597268, + "epoch": 0.12852237245603743, + "grad_norm": 4.481322288513184, + "learning_rate": 7.056812042521619e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8367927253246308, + "num_tokens": 49818629.0, + "step": 41460 + }, + { + "entropy": 1.958992251753807, + "epoch": 0.12855337158108712, + "grad_norm": 9.640412330627441, + "learning_rate": 7.0559611373248725e-06, + "loss": 0.607, + "mean_token_accuracy": 0.8203074246644974, + "num_tokens": 49829898.0, + "step": 41470 + }, + { + "entropy": 1.7837151035666465, + "epoch": 0.12858437070613682, + "grad_norm": 3.7444610595703125, + "learning_rate": 7.05511053985848e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8547704428434372, + "num_tokens": 49844096.0, + "step": 41480 + }, + { + "entropy": 1.8662296697497367, + "epoch": 0.12861536983118652, + "grad_norm": 4.482416152954102, + "learning_rate": 7.054260249937003e-06, + "loss": 0.5229, + "mean_token_accuracy": 0.8283546611666679, + "num_tokens": 49856112.0, + "step": 41490 + }, + { + "entropy": 1.8327308684587478, + "epoch": 0.1286463689562362, + "grad_norm": 4.7443342208862305, + "learning_rate": 7.053410267375156e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8486336767673492, + "num_tokens": 49869424.0, + "step": 41500 + }, + { + "entropy": 1.7724727407097816, + "epoch": 0.1286773680812859, + "grad_norm": 9.06193733215332, + "learning_rate": 7.052560591987811e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8514471918344497, + "num_tokens": 49882790.0, + "step": 41510 + }, + { + "entropy": 1.8597662687301635, + "epoch": 0.1287083672063356, + "grad_norm": 10.311935424804688, + "learning_rate": 7.051711223589997e-06, + "loss": 0.5304, + "mean_token_accuracy": 0.83001659065485, + "num_tokens": 49894927.0, + "step": 41520 + }, + { + "entropy": 1.8753248527646065, + "epoch": 0.1287393663313853, + "grad_norm": 9.605613708496094, + "learning_rate": 7.050862161996901e-06, + "loss": 0.5419, + "mean_token_accuracy": 0.8333491206169128, + "num_tokens": 49906710.0, + "step": 41530 + }, + { + "entropy": 1.904360829293728, + "epoch": 0.128770365456435, + "grad_norm": 7.814454078674316, + "learning_rate": 7.050013407023859e-06, + "loss": 0.5386, + "mean_token_accuracy": 0.8288786053657532, + "num_tokens": 49918562.0, + "step": 41540 + }, + { + "entropy": 1.9048177361488343, + "epoch": 0.1288013645814847, + "grad_norm": 9.190512657165527, + "learning_rate": 7.049164958486372e-06, + "loss": 0.5529, + "mean_token_accuracy": 0.8335204407572746, + "num_tokens": 49929488.0, + "step": 41550 + }, + { + "entropy": 1.8486354887485503, + "epoch": 0.1288323637065344, + "grad_norm": 9.387413024902344, + "learning_rate": 7.048316816200086e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.8399450510740281, + "num_tokens": 49941628.0, + "step": 41560 + }, + { + "entropy": 1.8960034802556038, + "epoch": 0.1288633628315841, + "grad_norm": 4.520693302154541, + "learning_rate": 7.047468979980812e-06, + "loss": 0.5923, + "mean_token_accuracy": 0.8215907469391823, + "num_tokens": 49953693.0, + "step": 41570 + }, + { + "entropy": 1.838585540652275, + "epoch": 0.12889436195663379, + "grad_norm": 9.408111572265625, + "learning_rate": 7.046621449644507e-06, + "loss": 0.5268, + "mean_token_accuracy": 0.825011870265007, + "num_tokens": 49964922.0, + "step": 41580 + }, + { + "entropy": 1.8944636061787605, + "epoch": 0.12892536108168348, + "grad_norm": 8.773103713989258, + "learning_rate": 7.045774225007293e-06, + "loss": 0.5337, + "mean_token_accuracy": 0.8297211557626725, + "num_tokens": 49977343.0, + "step": 41590 + }, + { + "entropy": 1.8642171308398248, + "epoch": 0.12895636020673318, + "grad_norm": 9.104546546936035, + "learning_rate": 7.044927305885436e-06, + "loss": 0.5694, + "mean_token_accuracy": 0.824115814268589, + "num_tokens": 49990496.0, + "step": 41600 + }, + { + "entropy": 1.832181690633297, + "epoch": 0.12898735933178285, + "grad_norm": 4.260911464691162, + "learning_rate": 7.044080692095364e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8464431285858154, + "num_tokens": 50002745.0, + "step": 41610 + }, + { + "entropy": 1.878463228046894, + "epoch": 0.12901835845683254, + "grad_norm": 8.97909927368164, + "learning_rate": 7.043234383453658e-06, + "loss": 0.5373, + "mean_token_accuracy": 0.8333162799477577, + "num_tokens": 50014652.0, + "step": 41620 + }, + { + "entropy": 1.8109195098280906, + "epoch": 0.12904935758188224, + "grad_norm": 10.204434394836426, + "learning_rate": 7.04238837977705e-06, + "loss": 0.5412, + "mean_token_accuracy": 0.8232007443904876, + "num_tokens": 50026970.0, + "step": 41630 + }, + { + "entropy": 1.8154733762145043, + "epoch": 0.12908035670693194, + "grad_norm": 5.0115132331848145, + "learning_rate": 7.041542680882431e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8351962670683861, + "num_tokens": 50039078.0, + "step": 41640 + }, + { + "entropy": 1.854199130833149, + "epoch": 0.12911135583198163, + "grad_norm": 10.00779914855957, + "learning_rate": 7.04069728658684e-06, + "loss": 0.5641, + "mean_token_accuracy": 0.8258049175143242, + "num_tokens": 50050935.0, + "step": 41650 + }, + { + "entropy": 1.9253348022699357, + "epoch": 0.12914235495703133, + "grad_norm": 8.788981437683105, + "learning_rate": 7.039852196707477e-06, + "loss": 0.5403, + "mean_token_accuracy": 0.8358196586370468, + "num_tokens": 50062097.0, + "step": 41660 + }, + { + "entropy": 1.8359443858265876, + "epoch": 0.12917335408208103, + "grad_norm": 8.613965034484863, + "learning_rate": 7.039007411061688e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8417585626244545, + "num_tokens": 50074289.0, + "step": 41670 + }, + { + "entropy": 1.7834791973233224, + "epoch": 0.12920435320713072, + "grad_norm": 5.170468807220459, + "learning_rate": 7.038162929466977e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8395596250891686, + "num_tokens": 50087113.0, + "step": 41680 + }, + { + "entropy": 1.8834406480193138, + "epoch": 0.12923535233218042, + "grad_norm": 9.960625648498535, + "learning_rate": 7.037318751741002e-06, + "loss": 0.5586, + "mean_token_accuracy": 0.8236372783780098, + "num_tokens": 50098957.0, + "step": 41690 + }, + { + "entropy": 1.8330654799938202, + "epoch": 0.12926635145723012, + "grad_norm": 8.917972564697266, + "learning_rate": 7.036474877701568e-06, + "loss": 0.5263, + "mean_token_accuracy": 0.8239554926753044, + "num_tokens": 50111327.0, + "step": 41700 + }, + { + "entropy": 1.9040823325514793, + "epoch": 0.1292973505822798, + "grad_norm": 10.261785507202148, + "learning_rate": 7.03563130716664e-06, + "loss": 0.5855, + "mean_token_accuracy": 0.8225290685892105, + "num_tokens": 50123086.0, + "step": 41710 + }, + { + "entropy": 1.91173397898674, + "epoch": 0.1293283497073295, + "grad_norm": 9.091620445251465, + "learning_rate": 7.0347880399543345e-06, + "loss": 0.5809, + "mean_token_accuracy": 0.8207633405923843, + "num_tokens": 50134801.0, + "step": 41720 + }, + { + "entropy": 1.8690212473273278, + "epoch": 0.1293593488323792, + "grad_norm": 8.664191246032715, + "learning_rate": 7.0339450758829165e-06, + "loss": 0.5377, + "mean_token_accuracy": 0.8263664215803146, + "num_tokens": 50146507.0, + "step": 41730 + }, + { + "entropy": 1.8848260268568993, + "epoch": 0.1293903479574289, + "grad_norm": 10.184765815734863, + "learning_rate": 7.033102414770806e-06, + "loss": 0.5267, + "mean_token_accuracy": 0.8280350670218468, + "num_tokens": 50158176.0, + "step": 41740 + }, + { + "entropy": 1.78608690649271, + "epoch": 0.1294213470824786, + "grad_norm": 9.575325012207031, + "learning_rate": 7.032260056436574e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8456524163484573, + "num_tokens": 50171381.0, + "step": 41750 + }, + { + "entropy": 1.9386123955249785, + "epoch": 0.1294523462075283, + "grad_norm": 10.201217651367188, + "learning_rate": 7.031418000698947e-06, + "loss": 0.5914, + "mean_token_accuracy": 0.8301969483494759, + "num_tokens": 50182781.0, + "step": 41760 + }, + { + "entropy": 1.847770369052887, + "epoch": 0.129483345332578, + "grad_norm": 4.556051254272461, + "learning_rate": 7.0305762473768e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.8307551980018616, + "num_tokens": 50195460.0, + "step": 41770 + }, + { + "entropy": 1.816901859641075, + "epoch": 0.1295143444576277, + "grad_norm": 9.289469718933105, + "learning_rate": 7.0297347962891595e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.8283630818128586, + "num_tokens": 50208808.0, + "step": 41780 + }, + { + "entropy": 1.929781760275364, + "epoch": 0.12954534358267739, + "grad_norm": 8.88871955871582, + "learning_rate": 7.028893647255209e-06, + "loss": 0.5515, + "mean_token_accuracy": 0.8377395987510681, + "num_tokens": 50220192.0, + "step": 41790 + }, + { + "entropy": 1.8346890568733216, + "epoch": 0.12957634270772708, + "grad_norm": 11.108233451843262, + "learning_rate": 7.028052800094273e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.8360419407486915, + "num_tokens": 50232642.0, + "step": 41800 + }, + { + "entropy": 1.762622408568859, + "epoch": 0.12960734183277678, + "grad_norm": 3.646545886993408, + "learning_rate": 7.027212254625838e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8363216891884804, + "num_tokens": 50245870.0, + "step": 41810 + }, + { + "entropy": 1.7347578413784503, + "epoch": 0.12963834095782648, + "grad_norm": 10.43207836151123, + "learning_rate": 7.026372010669536e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8484481438994408, + "num_tokens": 50259093.0, + "step": 41820 + }, + { + "entropy": 1.8771490514278413, + "epoch": 0.12966934008287617, + "grad_norm": 10.095263481140137, + "learning_rate": 7.025532068045149e-06, + "loss": 0.5608, + "mean_token_accuracy": 0.8258682683110237, + "num_tokens": 50270396.0, + "step": 41830 + }, + { + "entropy": 1.8018823832273483, + "epoch": 0.12970033920792587, + "grad_norm": 10.791528701782227, + "learning_rate": 7.024692426572615e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8406801581382751, + "num_tokens": 50283135.0, + "step": 41840 + }, + { + "entropy": 1.8664998412132263, + "epoch": 0.12973133833297557, + "grad_norm": 11.595536231994629, + "learning_rate": 7.023853086072019e-06, + "loss": 0.554, + "mean_token_accuracy": 0.820669724047184, + "num_tokens": 50294875.0, + "step": 41850 + }, + { + "entropy": 1.8681924358010291, + "epoch": 0.12976233745802523, + "grad_norm": 9.37474536895752, + "learning_rate": 7.0230140463635955e-06, + "loss": 0.5866, + "mean_token_accuracy": 0.8231520146131516, + "num_tokens": 50307197.0, + "step": 41860 + }, + { + "entropy": 1.8410033360123634, + "epoch": 0.12979333658307493, + "grad_norm": 4.718626022338867, + "learning_rate": 7.022175307267729e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8391537711024284, + "num_tokens": 50319844.0, + "step": 41870 + }, + { + "entropy": 1.9163996964693069, + "epoch": 0.12982433570812463, + "grad_norm": 13.432290077209473, + "learning_rate": 7.021336868604959e-06, + "loss": 0.6464, + "mean_token_accuracy": 0.8215458050370217, + "num_tokens": 50331051.0, + "step": 41880 + }, + { + "entropy": 1.8775553047657012, + "epoch": 0.12985533483317432, + "grad_norm": 9.11666488647461, + "learning_rate": 7.0204987301959715e-06, + "loss": 0.5546, + "mean_token_accuracy": 0.8308037981390953, + "num_tokens": 50342313.0, + "step": 41890 + }, + { + "entropy": 1.7834023706614972, + "epoch": 0.12988633395822402, + "grad_norm": 5.124344825744629, + "learning_rate": 7.019660891861601e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8429507181048393, + "num_tokens": 50356360.0, + "step": 41900 + }, + { + "entropy": 1.9084928244352342, + "epoch": 0.12991733308327372, + "grad_norm": 8.772656440734863, + "learning_rate": 7.018823353422832e-06, + "loss": 0.5841, + "mean_token_accuracy": 0.8257664754986763, + "num_tokens": 50367277.0, + "step": 41910 + }, + { + "entropy": 1.8719779431819916, + "epoch": 0.1299483322083234, + "grad_norm": 8.429474830627441, + "learning_rate": 7.017986114700802e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8469484955072403, + "num_tokens": 50378849.0, + "step": 41920 + }, + { + "entropy": 1.8775894209742545, + "epoch": 0.1299793313333731, + "grad_norm": 8.218339920043945, + "learning_rate": 7.0171491755167954e-06, + "loss": 0.5791, + "mean_token_accuracy": 0.813252392411232, + "num_tokens": 50390771.0, + "step": 41930 + }, + { + "entropy": 1.8514499336481094, + "epoch": 0.1300103304584228, + "grad_norm": 8.702057838439941, + "learning_rate": 7.016312535692245e-06, + "loss": 0.5392, + "mean_token_accuracy": 0.837669375538826, + "num_tokens": 50402598.0, + "step": 41940 + }, + { + "entropy": 1.7951854154467584, + "epoch": 0.1300413295834725, + "grad_norm": 9.224007606506348, + "learning_rate": 7.0154761950487325e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8424011334776879, + "num_tokens": 50415365.0, + "step": 41950 + }, + { + "entropy": 1.941623505949974, + "epoch": 0.1300723287085222, + "grad_norm": 10.595728874206543, + "learning_rate": 7.01464015340799e-06, + "loss": 0.6463, + "mean_token_accuracy": 0.8008893147110939, + "num_tokens": 50426866.0, + "step": 41960 + }, + { + "entropy": 1.8391387566924096, + "epoch": 0.1301033278335719, + "grad_norm": 9.028034210205078, + "learning_rate": 7.0138044105918975e-06, + "loss": 0.5394, + "mean_token_accuracy": 0.8306760504841805, + "num_tokens": 50440268.0, + "step": 41970 + }, + { + "entropy": 1.7658006258308887, + "epoch": 0.1301343269586216, + "grad_norm": 2.6869993209838867, + "learning_rate": 7.0129689664224855e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8479226931929589, + "num_tokens": 50455088.0, + "step": 41980 + }, + { + "entropy": 1.8967247605323792, + "epoch": 0.1301653260836713, + "grad_norm": 7.826026916503906, + "learning_rate": 7.012133820721929e-06, + "loss": 0.5326, + "mean_token_accuracy": 0.8311007678508758, + "num_tokens": 50466652.0, + "step": 41990 + }, + { + "entropy": 1.958921417593956, + "epoch": 0.130196325208721, + "grad_norm": 8.257652282714844, + "learning_rate": 7.011298973312554e-06, + "loss": 0.5903, + "mean_token_accuracy": 0.8199125394225121, + "num_tokens": 50477973.0, + "step": 42000 + }, + { + "entropy": 1.8253989905118941, + "epoch": 0.13022732433377068, + "grad_norm": 3.0014355182647705, + "learning_rate": 7.0104644240168294e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.835718783736229, + "num_tokens": 50490889.0, + "step": 42010 + }, + { + "entropy": 1.9271552190184593, + "epoch": 0.13025832345882038, + "grad_norm": 10.399243354797363, + "learning_rate": 7.0096301726573835e-06, + "loss": 0.6215, + "mean_token_accuracy": 0.8131484746932983, + "num_tokens": 50502026.0, + "step": 42020 + }, + { + "entropy": 1.9419305875897408, + "epoch": 0.13028932258387008, + "grad_norm": 7.492198467254639, + "learning_rate": 7.008796219056981e-06, + "loss": 0.6331, + "mean_token_accuracy": 0.82049780189991, + "num_tokens": 50513287.0, + "step": 42030 + }, + { + "entropy": 1.9706764385104178, + "epoch": 0.13032032170891977, + "grad_norm": 10.456650733947754, + "learning_rate": 7.00796256303854e-06, + "loss": 0.5711, + "mean_token_accuracy": 0.8240112096071244, + "num_tokens": 50524654.0, + "step": 42040 + }, + { + "entropy": 1.8961515158414841, + "epoch": 0.13035132083396947, + "grad_norm": 10.818836212158203, + "learning_rate": 7.0071292044251215e-06, + "loss": 0.5216, + "mean_token_accuracy": 0.8393558323383331, + "num_tokens": 50537697.0, + "step": 42050 + }, + { + "entropy": 1.9397668451070786, + "epoch": 0.13038231995901917, + "grad_norm": 7.472477912902832, + "learning_rate": 7.006296143039939e-06, + "loss": 0.5613, + "mean_token_accuracy": 0.8343762636184693, + "num_tokens": 50549295.0, + "step": 42060 + }, + { + "entropy": 1.8642112627625465, + "epoch": 0.13041331908406886, + "grad_norm": 2.947805643081665, + "learning_rate": 7.00546337870635e-06, + "loss": 0.5143, + "mean_token_accuracy": 0.8320413127541542, + "num_tokens": 50561911.0, + "step": 42070 + }, + { + "entropy": 1.8363241106271744, + "epoch": 0.13044431820911856, + "grad_norm": 4.179972171783447, + "learning_rate": 7.0046309112478594e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8355718359351159, + "num_tokens": 50574636.0, + "step": 42080 + }, + { + "entropy": 1.8695838272571563, + "epoch": 0.13047531733416826, + "grad_norm": 10.452752113342285, + "learning_rate": 7.003798740488118e-06, + "loss": 0.5269, + "mean_token_accuracy": 0.8281556889414787, + "num_tokens": 50586479.0, + "step": 42090 + }, + { + "entropy": 1.8435761332511902, + "epoch": 0.13050631645921792, + "grad_norm": 9.886906623840332, + "learning_rate": 7.0029668662509255e-06, + "loss": 0.5702, + "mean_token_accuracy": 0.8295658230781555, + "num_tokens": 50599307.0, + "step": 42100 + }, + { + "entropy": 1.8621219590306282, + "epoch": 0.13053731558426762, + "grad_norm": 7.598883628845215, + "learning_rate": 7.002135288360228e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8365325689315796, + "num_tokens": 50611631.0, + "step": 42110 + }, + { + "entropy": 1.926572097837925, + "epoch": 0.13056831470931732, + "grad_norm": 7.883036136627197, + "learning_rate": 7.0013040066401135e-06, + "loss": 0.5725, + "mean_token_accuracy": 0.8214625731110573, + "num_tokens": 50622981.0, + "step": 42120 + }, + { + "entropy": 1.9384382754564284, + "epoch": 0.13059931383436701, + "grad_norm": 8.586524963378906, + "learning_rate": 7.000473020914823e-06, + "loss": 0.585, + "mean_token_accuracy": 0.8287076443433762, + "num_tokens": 50633779.0, + "step": 42130 + }, + { + "entropy": 1.9370570868253707, + "epoch": 0.1306303129594167, + "grad_norm": 10.588152885437012, + "learning_rate": 6.999642331008736e-06, + "loss": 0.5732, + "mean_token_accuracy": 0.8158860892057419, + "num_tokens": 50645735.0, + "step": 42140 + }, + { + "entropy": 1.8363999262452126, + "epoch": 0.1306613120844664, + "grad_norm": 4.617247581481934, + "learning_rate": 6.998811936746385e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8244207426905632, + "num_tokens": 50659623.0, + "step": 42150 + }, + { + "entropy": 1.9262761980295182, + "epoch": 0.1306923112095161, + "grad_norm": 11.821525573730469, + "learning_rate": 6.9979818379524435e-06, + "loss": 0.5402, + "mean_token_accuracy": 0.8174602270126343, + "num_tokens": 50672505.0, + "step": 42160 + }, + { + "entropy": 1.8478658609092236, + "epoch": 0.1307233103345658, + "grad_norm": 5.417337894439697, + "learning_rate": 6.997152034451732e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8429075211286545, + "num_tokens": 50686049.0, + "step": 42170 + }, + { + "entropy": 1.897514969110489, + "epoch": 0.1307543094596155, + "grad_norm": 7.976996421813965, + "learning_rate": 6.9963225260692145e-06, + "loss": 0.5191, + "mean_token_accuracy": 0.832906337082386, + "num_tokens": 50698264.0, + "step": 42180 + }, + { + "entropy": 1.8931497901678085, + "epoch": 0.1307853085846652, + "grad_norm": 4.187560081481934, + "learning_rate": 6.995493312630006e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.8311450779438019, + "num_tokens": 50710916.0, + "step": 42190 + }, + { + "entropy": 1.8941505983471871, + "epoch": 0.1308163077097149, + "grad_norm": 8.752896308898926, + "learning_rate": 6.9946643939593606e-06, + "loss": 0.5434, + "mean_token_accuracy": 0.8309932291507721, + "num_tokens": 50722894.0, + "step": 42200 + }, + { + "entropy": 1.897823777794838, + "epoch": 0.1308473068347646, + "grad_norm": 8.950542449951172, + "learning_rate": 6.993835769882677e-06, + "loss": 0.5178, + "mean_token_accuracy": 0.8307448908686638, + "num_tokens": 50735048.0, + "step": 42210 + }, + { + "entropy": 2.0057034313678743, + "epoch": 0.13087830595981428, + "grad_norm": 8.577478408813477, + "learning_rate": 6.993007440225504e-06, + "loss": 0.5908, + "mean_token_accuracy": 0.8248661920428276, + "num_tokens": 50745880.0, + "step": 42220 + }, + { + "entropy": 1.933904617279768, + "epoch": 0.13090930508486398, + "grad_norm": 7.418628692626953, + "learning_rate": 6.99217940481353e-06, + "loss": 0.534, + "mean_token_accuracy": 0.8272743076086044, + "num_tokens": 50757689.0, + "step": 42230 + }, + { + "entropy": 1.8607298329472541, + "epoch": 0.13094030420991368, + "grad_norm": 3.3645823001861572, + "learning_rate": 6.991351663472591e-06, + "loss": 0.5565, + "mean_token_accuracy": 0.8338559105992317, + "num_tokens": 50770252.0, + "step": 42240 + }, + { + "entropy": 1.7977669216692447, + "epoch": 0.13097130333496337, + "grad_norm": 8.085532188415527, + "learning_rate": 6.990524216028667e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8425278753042221, + "num_tokens": 50783859.0, + "step": 42250 + }, + { + "entropy": 1.9316506549715995, + "epoch": 0.13100230246001307, + "grad_norm": 9.128771781921387, + "learning_rate": 6.989697062307879e-06, + "loss": 0.553, + "mean_token_accuracy": 0.8342802032828331, + "num_tokens": 50795907.0, + "step": 42260 + }, + { + "entropy": 2.010502940416336, + "epoch": 0.13103330158506277, + "grad_norm": 9.819306373596191, + "learning_rate": 6.988870202136493e-06, + "loss": 0.6157, + "mean_token_accuracy": 0.8241182819008828, + "num_tokens": 50806819.0, + "step": 42270 + }, + { + "entropy": 1.879169872403145, + "epoch": 0.13106430071011246, + "grad_norm": 8.068976402282715, + "learning_rate": 6.988043635340924e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8331845104694366, + "num_tokens": 50820181.0, + "step": 42280 + }, + { + "entropy": 1.9254466131329537, + "epoch": 0.13109529983516216, + "grad_norm": 8.121038436889648, + "learning_rate": 6.987217361747725e-06, + "loss": 0.5318, + "mean_token_accuracy": 0.830107967555523, + "num_tokens": 50831403.0, + "step": 42290 + }, + { + "entropy": 1.9769433185458183, + "epoch": 0.13112629896021186, + "grad_norm": 10.498136520385742, + "learning_rate": 6.986391381183594e-06, + "loss": 0.5892, + "mean_token_accuracy": 0.8088495776057243, + "num_tokens": 50843327.0, + "step": 42300 + }, + { + "entropy": 1.8907544031739234, + "epoch": 0.13115729808526155, + "grad_norm": 10.089003562927246, + "learning_rate": 6.98556569347537e-06, + "loss": 0.5238, + "mean_token_accuracy": 0.8322427123785019, + "num_tokens": 50856211.0, + "step": 42310 + }, + { + "entropy": 1.9495109453797341, + "epoch": 0.13118829721031125, + "grad_norm": 9.126409530639648, + "learning_rate": 6.984740298450043e-06, + "loss": 0.5702, + "mean_token_accuracy": 0.8336078703403473, + "num_tokens": 50867480.0, + "step": 42320 + }, + { + "entropy": 1.8435195326805114, + "epoch": 0.13121929633536095, + "grad_norm": 6.529050827026367, + "learning_rate": 6.983915195934738e-06, + "loss": 0.5314, + "mean_token_accuracy": 0.8262422546744347, + "num_tokens": 50880593.0, + "step": 42330 + }, + { + "entropy": 1.9224420145154, + "epoch": 0.13125029546041064, + "grad_norm": 10.954397201538086, + "learning_rate": 6.983090385756723e-06, + "loss": 0.5696, + "mean_token_accuracy": 0.8218831300735474, + "num_tokens": 50891387.0, + "step": 42340 + }, + { + "entropy": 1.8449978575110435, + "epoch": 0.1312812945854603, + "grad_norm": 8.803427696228027, + "learning_rate": 6.982265867743417e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8435740604996681, + "num_tokens": 50903726.0, + "step": 42350 + }, + { + "entropy": 1.8841196730732919, + "epoch": 0.13131229371051, + "grad_norm": 9.251840591430664, + "learning_rate": 6.981441641722373e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.8290949910879135, + "num_tokens": 50916111.0, + "step": 42360 + }, + { + "entropy": 1.927403984963894, + "epoch": 0.1313432928355597, + "grad_norm": 8.574084281921387, + "learning_rate": 6.98061770752129e-06, + "loss": 0.5651, + "mean_token_accuracy": 0.8280455321073532, + "num_tokens": 50928284.0, + "step": 42370 + }, + { + "entropy": 1.9771150171756744, + "epoch": 0.1313742919606094, + "grad_norm": 8.762701988220215, + "learning_rate": 6.9797940649680106e-06, + "loss": 0.6193, + "mean_token_accuracy": 0.8151744335889817, + "num_tokens": 50939233.0, + "step": 42380 + }, + { + "entropy": 1.8265860572457313, + "epoch": 0.1314052910856591, + "grad_norm": 7.463681697845459, + "learning_rate": 6.978970713890515e-06, + "loss": 0.555, + "mean_token_accuracy": 0.8328731596469879, + "num_tokens": 50953036.0, + "step": 42390 + }, + { + "entropy": 1.8653569996356965, + "epoch": 0.1314362902107088, + "grad_norm": 8.80778694152832, + "learning_rate": 6.978147654116929e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8439178183674813, + "num_tokens": 50964848.0, + "step": 42400 + }, + { + "entropy": 1.939113649725914, + "epoch": 0.1314672893357585, + "grad_norm": 10.408567428588867, + "learning_rate": 6.977324885475521e-06, + "loss": 0.5543, + "mean_token_accuracy": 0.8219144955277443, + "num_tokens": 50976553.0, + "step": 42410 + }, + { + "entropy": 1.862153060734272, + "epoch": 0.1314982884608082, + "grad_norm": 8.101511001586914, + "learning_rate": 6.976502407794701e-06, + "loss": 0.485, + "mean_token_accuracy": 0.835495936870575, + "num_tokens": 50988487.0, + "step": 42420 + }, + { + "entropy": 1.9604466244578362, + "epoch": 0.13152928758585788, + "grad_norm": 9.159782409667969, + "learning_rate": 6.975680220903015e-06, + "loss": 0.5845, + "mean_token_accuracy": 0.8208717614412308, + "num_tokens": 51000283.0, + "step": 42430 + }, + { + "entropy": 1.9139268666505813, + "epoch": 0.13156028671090758, + "grad_norm": 8.362812042236328, + "learning_rate": 6.974858324629158e-06, + "loss": 0.5689, + "mean_token_accuracy": 0.8323205769062042, + "num_tokens": 51012209.0, + "step": 42440 + }, + { + "entropy": 1.9230769395828247, + "epoch": 0.13159128583595728, + "grad_norm": 8.45923900604248, + "learning_rate": 6.97403671880196e-06, + "loss": 0.572, + "mean_token_accuracy": 0.8329043105244637, + "num_tokens": 51023308.0, + "step": 42450 + }, + { + "entropy": 1.9842663645744323, + "epoch": 0.13162228496100697, + "grad_norm": 8.042434692382812, + "learning_rate": 6.973215403250397e-06, + "loss": 0.5793, + "mean_token_accuracy": 0.8260912299156189, + "num_tokens": 51033767.0, + "step": 42460 + }, + { + "entropy": 1.9136120170354842, + "epoch": 0.13165328408605667, + "grad_norm": 8.325801849365234, + "learning_rate": 6.972394377803584e-06, + "loss": 0.5382, + "mean_token_accuracy": 0.8208369135856628, + "num_tokens": 51045947.0, + "step": 42470 + }, + { + "entropy": 1.8511994555592537, + "epoch": 0.13168428321110637, + "grad_norm": 3.737189531326294, + "learning_rate": 6.9715736422907764e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8399424910545349, + "num_tokens": 51057715.0, + "step": 42480 + }, + { + "entropy": 1.8442467346787452, + "epoch": 0.13171528233615606, + "grad_norm": 4.051930904388428, + "learning_rate": 6.9707531965413695e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8524442002177238, + "num_tokens": 51068742.0, + "step": 42490 + }, + { + "entropy": 1.8845135629177094, + "epoch": 0.13174628146120576, + "grad_norm": 9.697474479675293, + "learning_rate": 6.969933040384902e-06, + "loss": 0.5258, + "mean_token_accuracy": 0.8385147228837013, + "num_tokens": 51080521.0, + "step": 42500 + }, + { + "entropy": 1.7795793518424035, + "epoch": 0.13177728058625546, + "grad_norm": 9.526708602905273, + "learning_rate": 6.96911317365105e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8431052088737487, + "num_tokens": 51093716.0, + "step": 42510 + }, + { + "entropy": 1.9105603873729706, + "epoch": 0.13180827971130515, + "grad_norm": 8.357869148254395, + "learning_rate": 6.968293596169631e-06, + "loss": 0.5976, + "mean_token_accuracy": 0.8246324315667153, + "num_tokens": 51105239.0, + "step": 42520 + }, + { + "entropy": 1.9591112911701203, + "epoch": 0.13183927883635485, + "grad_norm": 9.68211841583252, + "learning_rate": 6.967474307770603e-06, + "loss": 0.6515, + "mean_token_accuracy": 0.8138914480805397, + "num_tokens": 51116502.0, + "step": 42530 + }, + { + "entropy": 1.9038333266973495, + "epoch": 0.13187027796140455, + "grad_norm": 8.462784767150879, + "learning_rate": 6.966655308284064e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8396757513284683, + "num_tokens": 51128381.0, + "step": 42540 + }, + { + "entropy": 1.8404983699321746, + "epoch": 0.13190127708645424, + "grad_norm": 8.014533042907715, + "learning_rate": 6.965836597540249e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8473035827279091, + "num_tokens": 51140787.0, + "step": 42550 + }, + { + "entropy": 1.9219214513897895, + "epoch": 0.13193227621150394, + "grad_norm": 8.607987403869629, + "learning_rate": 6.965018175369538e-06, + "loss": 0.5378, + "mean_token_accuracy": 0.8373000919818878, + "num_tokens": 51152611.0, + "step": 42560 + }, + { + "entropy": 1.8740851491689683, + "epoch": 0.13196327533655364, + "grad_norm": 8.044096946716309, + "learning_rate": 6.9642000416024435e-06, + "loss": 0.5307, + "mean_token_accuracy": 0.8397703349590302, + "num_tokens": 51164281.0, + "step": 42570 + }, + { + "entropy": 1.8905795052647592, + "epoch": 0.13199427446160333, + "grad_norm": 9.585006713867188, + "learning_rate": 6.963382196069625e-06, + "loss": 0.5458, + "mean_token_accuracy": 0.8185367554426193, + "num_tokens": 51175917.0, + "step": 42580 + }, + { + "entropy": 1.814333714544773, + "epoch": 0.13202527358665303, + "grad_norm": 10.914127349853516, + "learning_rate": 6.962564638601874e-06, + "loss": 0.5236, + "mean_token_accuracy": 0.8299461394548416, + "num_tokens": 51188596.0, + "step": 42590 + }, + { + "entropy": 1.8761808335781098, + "epoch": 0.1320562727117027, + "grad_norm": 8.772909164428711, + "learning_rate": 6.961747369030127e-06, + "loss": 0.5328, + "mean_token_accuracy": 0.8223336800932884, + "num_tokens": 51200294.0, + "step": 42600 + }, + { + "entropy": 1.8656433135271073, + "epoch": 0.1320872718367524, + "grad_norm": 4.012499809265137, + "learning_rate": 6.960930387185456e-06, + "loss": 0.5392, + "mean_token_accuracy": 0.8211655914783478, + "num_tokens": 51212299.0, + "step": 42610 + }, + { + "entropy": 1.8340272575616836, + "epoch": 0.1321182709618021, + "grad_norm": 5.148853302001953, + "learning_rate": 6.960113692899071e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8375645697116851, + "num_tokens": 51224694.0, + "step": 42620 + }, + { + "entropy": 1.8271689996123315, + "epoch": 0.1321492700868518, + "grad_norm": 8.511850357055664, + "learning_rate": 6.9592972860023235e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8452630266547203, + "num_tokens": 51237562.0, + "step": 42630 + }, + { + "entropy": 1.9014787808060647, + "epoch": 0.13218026921190149, + "grad_norm": 4.459075927734375, + "learning_rate": 6.9584811663267015e-06, + "loss": 0.5206, + "mean_token_accuracy": 0.8347061321139335, + "num_tokens": 51248656.0, + "step": 42640 + }, + { + "entropy": 1.9535549938678742, + "epoch": 0.13221126833695118, + "grad_norm": 4.374874114990234, + "learning_rate": 6.9576653337038325e-06, + "loss": 0.6071, + "mean_token_accuracy": 0.8113219693303109, + "num_tokens": 51259712.0, + "step": 42650 + }, + { + "entropy": 1.8858298167586327, + "epoch": 0.13224226746200088, + "grad_norm": 11.858930587768555, + "learning_rate": 6.956849787965481e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.8303458750247955, + "num_tokens": 51272446.0, + "step": 42660 + }, + { + "entropy": 1.8564813017845154, + "epoch": 0.13227326658705058, + "grad_norm": 3.350623846054077, + "learning_rate": 6.956034528943548e-06, + "loss": 0.5335, + "mean_token_accuracy": 0.8342030212283135, + "num_tokens": 51285301.0, + "step": 42670 + }, + { + "entropy": 1.849312388896942, + "epoch": 0.13230426571210027, + "grad_norm": 9.046114921569824, + "learning_rate": 6.955219556470077e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.8443035989999771, + "num_tokens": 51297696.0, + "step": 42680 + }, + { + "entropy": 1.9661724478006364, + "epoch": 0.13233526483714997, + "grad_norm": 9.500844955444336, + "learning_rate": 6.954404870377246e-06, + "loss": 0.6121, + "mean_token_accuracy": 0.8136408120393753, + "num_tokens": 51307961.0, + "step": 42690 + }, + { + "entropy": 1.939977452158928, + "epoch": 0.13236626396219967, + "grad_norm": 8.7796049118042, + "learning_rate": 6.953590470497371e-06, + "loss": 0.6126, + "mean_token_accuracy": 0.8261479705572128, + "num_tokens": 51318495.0, + "step": 42700 + }, + { + "entropy": 1.840474684536457, + "epoch": 0.13239726308724936, + "grad_norm": 11.025080680847168, + "learning_rate": 6.952776356662905e-06, + "loss": 0.5237, + "mean_token_accuracy": 0.8339684426784515, + "num_tokens": 51330208.0, + "step": 42710 + }, + { + "entropy": 1.947993564605713, + "epoch": 0.13242826221229906, + "grad_norm": 9.111898422241211, + "learning_rate": 6.9519625287064375e-06, + "loss": 0.5756, + "mean_token_accuracy": 0.8302145257592202, + "num_tokens": 51340861.0, + "step": 42720 + }, + { + "entropy": 1.8799726784229278, + "epoch": 0.13245926133734875, + "grad_norm": 9.264261245727539, + "learning_rate": 6.951148986460699e-06, + "loss": 0.5314, + "mean_token_accuracy": 0.8337329775094986, + "num_tokens": 51353102.0, + "step": 42730 + }, + { + "entropy": 1.959149533510208, + "epoch": 0.13249026046239845, + "grad_norm": 10.266985893249512, + "learning_rate": 6.950335729758554e-06, + "loss": 0.6076, + "mean_token_accuracy": 0.821322014927864, + "num_tokens": 51364079.0, + "step": 42740 + }, + { + "entropy": 1.937872663140297, + "epoch": 0.13252125958744815, + "grad_norm": 9.663759231567383, + "learning_rate": 6.949522758433003e-06, + "loss": 0.5499, + "mean_token_accuracy": 0.8321762755513191, + "num_tokens": 51375560.0, + "step": 42750 + }, + { + "entropy": 1.8674385949969292, + "epoch": 0.13255225871249784, + "grad_norm": 7.8949432373046875, + "learning_rate": 6.948710072317184e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.837440250813961, + "num_tokens": 51388270.0, + "step": 42760 + }, + { + "entropy": 1.9320979446172715, + "epoch": 0.13258325783754754, + "grad_norm": 8.221677780151367, + "learning_rate": 6.9478976712443755e-06, + "loss": 0.5291, + "mean_token_accuracy": 0.8377137213945389, + "num_tokens": 51399663.0, + "step": 42770 + }, + { + "entropy": 1.8442893743515014, + "epoch": 0.13261425696259724, + "grad_norm": 7.525363445281982, + "learning_rate": 6.947085555047985e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.8382806748151779, + "num_tokens": 51411959.0, + "step": 42780 + }, + { + "entropy": 1.8832678958773612, + "epoch": 0.13264525608764693, + "grad_norm": 8.396281242370605, + "learning_rate": 6.946273723561562e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8351016476750374, + "num_tokens": 51423692.0, + "step": 42790 + }, + { + "entropy": 1.8216581985354423, + "epoch": 0.13267625521269663, + "grad_norm": 8.608697891235352, + "learning_rate": 6.9454621766187904e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8447302252054214, + "num_tokens": 51436099.0, + "step": 42800 + }, + { + "entropy": 1.9112284123897552, + "epoch": 0.13270725433774633, + "grad_norm": 5.335425853729248, + "learning_rate": 6.944650914053489e-06, + "loss": 0.5714, + "mean_token_accuracy": 0.8203225836157799, + "num_tokens": 51448674.0, + "step": 42810 + }, + { + "entropy": 1.9009429544210434, + "epoch": 0.13273825346279602, + "grad_norm": 8.94002914428711, + "learning_rate": 6.943839935699615e-06, + "loss": 0.5229, + "mean_token_accuracy": 0.8352271884679794, + "num_tokens": 51460644.0, + "step": 42820 + }, + { + "entropy": 1.8937845051288604, + "epoch": 0.13276925258784572, + "grad_norm": 4.647216320037842, + "learning_rate": 6.943029241391259e-06, + "loss": 0.5719, + "mean_token_accuracy": 0.8236138209700584, + "num_tokens": 51473179.0, + "step": 42830 + }, + { + "entropy": 1.8776622116565704, + "epoch": 0.1328002517128954, + "grad_norm": 9.4772367477417, + "learning_rate": 6.942218830962648e-06, + "loss": 0.5519, + "mean_token_accuracy": 0.8267105832695961, + "num_tokens": 51486098.0, + "step": 42840 + }, + { + "entropy": 1.7324663981795312, + "epoch": 0.1328312508379451, + "grad_norm": 2.6686737537384033, + "learning_rate": 6.941408704248144e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8572231993079186, + "num_tokens": 51499848.0, + "step": 42850 + }, + { + "entropy": 1.8544006049633026, + "epoch": 0.13286224996299478, + "grad_norm": 9.000062942504883, + "learning_rate": 6.940598861082245e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8476372644305229, + "num_tokens": 51512048.0, + "step": 42860 + }, + { + "entropy": 1.9546643912792205, + "epoch": 0.13289324908804448, + "grad_norm": 11.137999534606934, + "learning_rate": 6.939789301299585e-06, + "loss": 0.6104, + "mean_token_accuracy": 0.8146396011114121, + "num_tokens": 51523110.0, + "step": 42870 + }, + { + "entropy": 1.9276799812912941, + "epoch": 0.13292424821309418, + "grad_norm": 10.72694206237793, + "learning_rate": 6.938980024734927e-06, + "loss": 0.5825, + "mean_token_accuracy": 0.8259019777178764, + "num_tokens": 51534581.0, + "step": 42880 + }, + { + "entropy": 1.7917723521590232, + "epoch": 0.13295524733814387, + "grad_norm": 9.843888282775879, + "learning_rate": 6.938171031223178e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8381556645035744, + "num_tokens": 51548074.0, + "step": 42890 + }, + { + "entropy": 1.9410393938422204, + "epoch": 0.13298624646319357, + "grad_norm": 10.020930290222168, + "learning_rate": 6.937362320599377e-06, + "loss": 0.5643, + "mean_token_accuracy": 0.8282965987920761, + "num_tokens": 51559680.0, + "step": 42900 + }, + { + "entropy": 1.92443605363369, + "epoch": 0.13301724558824327, + "grad_norm": 10.336004257202148, + "learning_rate": 6.936553892698692e-06, + "loss": 0.557, + "mean_token_accuracy": 0.8238545969128609, + "num_tokens": 51571461.0, + "step": 42910 + }, + { + "entropy": 1.833638320863247, + "epoch": 0.13304824471329296, + "grad_norm": 10.432703971862793, + "learning_rate": 6.935745747356429e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.836958509683609, + "num_tokens": 51583715.0, + "step": 42920 + }, + { + "entropy": 1.9126337066292762, + "epoch": 0.13307924383834266, + "grad_norm": 4.075279712677002, + "learning_rate": 6.934937884408032e-06, + "loss": 0.511, + "mean_token_accuracy": 0.8253563806414604, + "num_tokens": 51595798.0, + "step": 42930 + }, + { + "entropy": 1.9492529153823852, + "epoch": 0.13311024296339236, + "grad_norm": 8.318151473999023, + "learning_rate": 6.934130303689072e-06, + "loss": 0.5273, + "mean_token_accuracy": 0.8317864894866943, + "num_tokens": 51606958.0, + "step": 42940 + }, + { + "entropy": 1.9043660476803779, + "epoch": 0.13314124208844205, + "grad_norm": 4.388843059539795, + "learning_rate": 6.93332300503526e-06, + "loss": 0.5195, + "mean_token_accuracy": 0.8340587362647056, + "num_tokens": 51619061.0, + "step": 42950 + }, + { + "entropy": 1.874220597743988, + "epoch": 0.13317224121349175, + "grad_norm": 3.5497045516967773, + "learning_rate": 6.932515988282438e-06, + "loss": 0.5506, + "mean_token_accuracy": 0.8257475927472114, + "num_tokens": 51630509.0, + "step": 42960 + }, + { + "entropy": 1.8349529922008514, + "epoch": 0.13320324033854145, + "grad_norm": 8.229619979858398, + "learning_rate": 6.931709253266582e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.836476719379425, + "num_tokens": 51642250.0, + "step": 42970 + }, + { + "entropy": 1.9145320609211922, + "epoch": 0.13323423946359114, + "grad_norm": 8.655306816101074, + "learning_rate": 6.930902799823801e-06, + "loss": 0.5779, + "mean_token_accuracy": 0.8278056040406228, + "num_tokens": 51654224.0, + "step": 42980 + }, + { + "entropy": 1.9441554173827171, + "epoch": 0.13326523858864084, + "grad_norm": 10.367897987365723, + "learning_rate": 6.9300966277903415e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8317381590604782, + "num_tokens": 51665436.0, + "step": 42990 + }, + { + "entropy": 1.8716817542910575, + "epoch": 0.13329623771369054, + "grad_norm": 8.810930252075195, + "learning_rate": 6.929290737002579e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.8309359416365624, + "num_tokens": 51678512.0, + "step": 43000 + }, + { + "entropy": 1.9239093586802483, + "epoch": 0.13332723683874023, + "grad_norm": 10.611305236816406, + "learning_rate": 6.928485127297019e-06, + "loss": 0.5381, + "mean_token_accuracy": 0.828672143816948, + "num_tokens": 51689903.0, + "step": 43010 + }, + { + "entropy": 1.9985890626907348, + "epoch": 0.13335823596378993, + "grad_norm": 9.104059219360352, + "learning_rate": 6.92767979851031e-06, + "loss": 0.6427, + "mean_token_accuracy": 0.8104574084281921, + "num_tokens": 51700881.0, + "step": 43020 + }, + { + "entropy": 1.8614750012755394, + "epoch": 0.13338923508883962, + "grad_norm": 3.7970998287200928, + "learning_rate": 6.926874750479225e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.8261910125613212, + "num_tokens": 51713424.0, + "step": 43030 + }, + { + "entropy": 1.9317557483911514, + "epoch": 0.13342023421388932, + "grad_norm": 4.564321041107178, + "learning_rate": 6.926069983040672e-06, + "loss": 0.514, + "mean_token_accuracy": 0.8412420928478241, + "num_tokens": 51724984.0, + "step": 43040 + }, + { + "entropy": 1.8474729374051093, + "epoch": 0.13345123333893902, + "grad_norm": 4.81390905380249, + "learning_rate": 6.925265496031692e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8341810539364815, + "num_tokens": 51738001.0, + "step": 43050 + }, + { + "entropy": 1.938997507095337, + "epoch": 0.13348223246398871, + "grad_norm": 10.528800010681152, + "learning_rate": 6.92446128928946e-06, + "loss": 0.611, + "mean_token_accuracy": 0.8227848649024964, + "num_tokens": 51749495.0, + "step": 43060 + }, + { + "entropy": 1.724546130001545, + "epoch": 0.1335132315890384, + "grad_norm": 2.564570188522339, + "learning_rate": 6.9236573626512815e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8508135721087455, + "num_tokens": 51763526.0, + "step": 43070 + }, + { + "entropy": 1.9087968587875366, + "epoch": 0.1335442307140881, + "grad_norm": 10.67532730102539, + "learning_rate": 6.922853715954594e-06, + "loss": 0.5305, + "mean_token_accuracy": 0.8410237103700637, + "num_tokens": 51775239.0, + "step": 43080 + }, + { + "entropy": 1.9154332160949707, + "epoch": 0.13357522983913778, + "grad_norm": 8.595878601074219, + "learning_rate": 6.922050349036968e-06, + "loss": 0.5283, + "mean_token_accuracy": 0.8399403557181359, + "num_tokens": 51786721.0, + "step": 43090 + }, + { + "entropy": 1.9540342479944228, + "epoch": 0.13360622896418747, + "grad_norm": 8.74820327758789, + "learning_rate": 6.921247261736105e-06, + "loss": 0.5451, + "mean_token_accuracy": 0.8402211844921113, + "num_tokens": 51797894.0, + "step": 43100 + }, + { + "entropy": 1.9339706212282182, + "epoch": 0.13363722808923717, + "grad_norm": 10.516135215759277, + "learning_rate": 6.920444453889838e-06, + "loss": 0.5889, + "mean_token_accuracy": 0.8250453129410744, + "num_tokens": 51808969.0, + "step": 43110 + }, + { + "entropy": 1.9804834306240082, + "epoch": 0.13366822721428687, + "grad_norm": 9.053743362426758, + "learning_rate": 6.919641925336133e-06, + "loss": 0.6111, + "mean_token_accuracy": 0.8214120969176293, + "num_tokens": 51820364.0, + "step": 43120 + }, + { + "entropy": 1.856301885843277, + "epoch": 0.13369922633933656, + "grad_norm": 9.301741600036621, + "learning_rate": 6.9188396759130886e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8374891221523285, + "num_tokens": 51832827.0, + "step": 43130 + }, + { + "entropy": 1.817978872358799, + "epoch": 0.13373022546438626, + "grad_norm": 9.00844955444336, + "learning_rate": 6.918037705458932e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8360246539115905, + "num_tokens": 51845614.0, + "step": 43140 + }, + { + "entropy": 1.9156497776508332, + "epoch": 0.13376122458943596, + "grad_norm": 9.389613151550293, + "learning_rate": 6.9172360138120205e-06, + "loss": 0.5771, + "mean_token_accuracy": 0.8285941392183304, + "num_tokens": 51857344.0, + "step": 43150 + }, + { + "entropy": 1.9007001653313638, + "epoch": 0.13379222371448565, + "grad_norm": 9.915302276611328, + "learning_rate": 6.9164346008108465e-06, + "loss": 0.5365, + "mean_token_accuracy": 0.8260676816105843, + "num_tokens": 51869381.0, + "step": 43160 + }, + { + "entropy": 1.8302084445953368, + "epoch": 0.13382322283953535, + "grad_norm": 9.13771915435791, + "learning_rate": 6.915633466294033e-06, + "loss": 0.502, + "mean_token_accuracy": 0.841213583946228, + "num_tokens": 51882099.0, + "step": 43170 + }, + { + "entropy": 1.9292520344257356, + "epoch": 0.13385422196458505, + "grad_norm": 9.99196720123291, + "learning_rate": 6.914832610100331e-06, + "loss": 0.562, + "mean_token_accuracy": 0.82747802734375, + "num_tokens": 51893348.0, + "step": 43180 + }, + { + "entropy": 1.8885715886950494, + "epoch": 0.13388522108963474, + "grad_norm": 9.511621475219727, + "learning_rate": 6.914032032068623e-06, + "loss": 0.5304, + "mean_token_accuracy": 0.8361998051404953, + "num_tokens": 51906070.0, + "step": 43190 + }, + { + "entropy": 1.8899712771177293, + "epoch": 0.13391622021468444, + "grad_norm": 12.438838005065918, + "learning_rate": 6.913231732037921e-06, + "loss": 0.5835, + "mean_token_accuracy": 0.8116952329874039, + "num_tokens": 51917513.0, + "step": 43200 + }, + { + "entropy": 1.8960667297244072, + "epoch": 0.13394721933973414, + "grad_norm": 10.735957145690918, + "learning_rate": 6.912431709847373e-06, + "loss": 0.545, + "mean_token_accuracy": 0.8333559215068818, + "num_tokens": 51929508.0, + "step": 43210 + }, + { + "entropy": 1.8930713683366776, + "epoch": 0.13397821846478383, + "grad_norm": 8.979076385498047, + "learning_rate": 6.911631965336252e-06, + "loss": 0.5303, + "mean_token_accuracy": 0.8422817379236222, + "num_tokens": 51941545.0, + "step": 43220 + }, + { + "entropy": 1.8466507881879806, + "epoch": 0.13400921758983353, + "grad_norm": 8.20052719116211, + "learning_rate": 6.9108324983439605e-06, + "loss": 0.5342, + "mean_token_accuracy": 0.8367902502417565, + "num_tokens": 51953888.0, + "step": 43230 + }, + { + "entropy": 1.9022724777460098, + "epoch": 0.13404021671488323, + "grad_norm": 8.77592945098877, + "learning_rate": 6.910033308710034e-06, + "loss": 0.5512, + "mean_token_accuracy": 0.819185683131218, + "num_tokens": 51966016.0, + "step": 43240 + }, + { + "entropy": 1.9349356770515442, + "epoch": 0.13407121583993292, + "grad_norm": 6.25929594039917, + "learning_rate": 6.909234396274137e-06, + "loss": 0.549, + "mean_token_accuracy": 0.833576287329197, + "num_tokens": 51978893.0, + "step": 43250 + }, + { + "entropy": 1.9031900018453598, + "epoch": 0.13410221496498262, + "grad_norm": 8.607972145080566, + "learning_rate": 6.908435760876063e-06, + "loss": 0.5548, + "mean_token_accuracy": 0.8291514754295349, + "num_tokens": 51990673.0, + "step": 43260 + }, + { + "entropy": 1.9223877504467963, + "epoch": 0.13413321409003232, + "grad_norm": 8.238653182983398, + "learning_rate": 6.9076374023557366e-06, + "loss": 0.5656, + "mean_token_accuracy": 0.8217443853616715, + "num_tokens": 52002484.0, + "step": 43270 + }, + { + "entropy": 1.9485286980867387, + "epoch": 0.134164213215082, + "grad_norm": 13.640851974487305, + "learning_rate": 6.90683932055321e-06, + "loss": 0.6573, + "mean_token_accuracy": 0.8062111675739289, + "num_tokens": 52013371.0, + "step": 43280 + }, + { + "entropy": 1.8719741210341454, + "epoch": 0.1341952123401317, + "grad_norm": 7.591920852661133, + "learning_rate": 6.906041515308666e-06, + "loss": 0.5224, + "mean_token_accuracy": 0.8311554431915283, + "num_tokens": 52025599.0, + "step": 43290 + }, + { + "entropy": 1.867848064005375, + "epoch": 0.1342262114651814, + "grad_norm": 6.9264068603515625, + "learning_rate": 6.905243986462417e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8396237522363663, + "num_tokens": 52038600.0, + "step": 43300 + }, + { + "entropy": 1.9002704456448556, + "epoch": 0.1342572105902311, + "grad_norm": 8.932539939880371, + "learning_rate": 6.9044467338549005e-06, + "loss": 0.5224, + "mean_token_accuracy": 0.8397117152810096, + "num_tokens": 52050301.0, + "step": 43310 + }, + { + "entropy": 1.839927391707897, + "epoch": 0.1342882097152808, + "grad_norm": 3.8596723079681396, + "learning_rate": 6.903649757326689e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.8357837229967118, + "num_tokens": 52062178.0, + "step": 43320 + }, + { + "entropy": 1.9066562354564667, + "epoch": 0.1343192088403305, + "grad_norm": 9.047643661499023, + "learning_rate": 6.902853056718479e-06, + "loss": 0.5685, + "mean_token_accuracy": 0.8244990050792694, + "num_tokens": 52073513.0, + "step": 43330 + }, + { + "entropy": 1.9126151755452157, + "epoch": 0.13435020796538016, + "grad_norm": 7.049984455108643, + "learning_rate": 6.9020566318711e-06, + "loss": 0.5438, + "mean_token_accuracy": 0.8374730840325355, + "num_tokens": 52084870.0, + "step": 43340 + }, + { + "entropy": 1.9268995508551598, + "epoch": 0.13438120709042986, + "grad_norm": 8.504199981689453, + "learning_rate": 6.901260482625506e-06, + "loss": 0.5814, + "mean_token_accuracy": 0.8227284833788872, + "num_tokens": 52096377.0, + "step": 43350 + }, + { + "entropy": 1.8570437088608742, + "epoch": 0.13441220621547956, + "grad_norm": 4.3164873123168945, + "learning_rate": 6.90046460882278e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8381210267543793, + "num_tokens": 52109355.0, + "step": 43360 + }, + { + "entropy": 1.834708495438099, + "epoch": 0.13444320534052925, + "grad_norm": 8.836848258972168, + "learning_rate": 6.8996690103041376e-06, + "loss": 0.523, + "mean_token_accuracy": 0.8306301310658455, + "num_tokens": 52121829.0, + "step": 43370 + }, + { + "entropy": 1.9287886828184129, + "epoch": 0.13447420446557895, + "grad_norm": 10.67462158203125, + "learning_rate": 6.898873686910913e-06, + "loss": 0.5709, + "mean_token_accuracy": 0.8287528812885284, + "num_tokens": 52133362.0, + "step": 43380 + }, + { + "entropy": 1.855184331536293, + "epoch": 0.13450520359062865, + "grad_norm": 3.361020803451538, + "learning_rate": 6.898078638484581e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8325510248541832, + "num_tokens": 52145855.0, + "step": 43390 + }, + { + "entropy": 1.9872781455516815, + "epoch": 0.13453620271567834, + "grad_norm": 4.882389545440674, + "learning_rate": 6.897283864866734e-06, + "loss": 0.6029, + "mean_token_accuracy": 0.8120069310069085, + "num_tokens": 52157083.0, + "step": 43400 + }, + { + "entropy": 1.9066018536686897, + "epoch": 0.13456720184072804, + "grad_norm": 8.167949676513672, + "learning_rate": 6.896489365899096e-06, + "loss": 0.5342, + "mean_token_accuracy": 0.8314293026924133, + "num_tokens": 52169100.0, + "step": 43410 + }, + { + "entropy": 1.8998596712946891, + "epoch": 0.13459820096577774, + "grad_norm": 8.666936874389648, + "learning_rate": 6.895695141423521e-06, + "loss": 0.5258, + "mean_token_accuracy": 0.8273270547389984, + "num_tokens": 52181307.0, + "step": 43420 + }, + { + "entropy": 1.8541322633624078, + "epoch": 0.13462920009082743, + "grad_norm": 4.933404922485352, + "learning_rate": 6.894901191281985e-06, + "loss": 0.5296, + "mean_token_accuracy": 0.8414107546210289, + "num_tokens": 52193857.0, + "step": 43430 + }, + { + "entropy": 1.8842070639133452, + "epoch": 0.13466019921587713, + "grad_norm": 7.285519599914551, + "learning_rate": 6.894107515316597e-06, + "loss": 0.513, + "mean_token_accuracy": 0.8374163269996643, + "num_tokens": 52205689.0, + "step": 43440 + }, + { + "entropy": 1.8152766466140746, + "epoch": 0.13469119834092683, + "grad_norm": 7.589842796325684, + "learning_rate": 6.893314113369588e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.845301553606987, + "num_tokens": 52219579.0, + "step": 43450 + }, + { + "entropy": 1.756292749941349, + "epoch": 0.13472219746597652, + "grad_norm": 6.797213554382324, + "learning_rate": 6.89252098528332e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8514433890581131, + "num_tokens": 52232565.0, + "step": 43460 + }, + { + "entropy": 1.8172416999936103, + "epoch": 0.13475319659102622, + "grad_norm": 7.267201900482178, + "learning_rate": 6.891728130900279e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8358834356069564, + "num_tokens": 52246625.0, + "step": 43470 + }, + { + "entropy": 1.9737324267625809, + "epoch": 0.13478419571607592, + "grad_norm": 7.657471656799316, + "learning_rate": 6.89093555006308e-06, + "loss": 0.5858, + "mean_token_accuracy": 0.827315254509449, + "num_tokens": 52257290.0, + "step": 43480 + }, + { + "entropy": 1.8453372538089752, + "epoch": 0.1348151948411256, + "grad_norm": 3.9033000469207764, + "learning_rate": 6.890143242614467e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8415728509426117, + "num_tokens": 52269698.0, + "step": 43490 + }, + { + "entropy": 1.912654523551464, + "epoch": 0.1348461939661753, + "grad_norm": 9.669717788696289, + "learning_rate": 6.889351208397301e-06, + "loss": 0.5615, + "mean_token_accuracy": 0.8255023837089539, + "num_tokens": 52281310.0, + "step": 43500 + }, + { + "entropy": 1.9051228925585746, + "epoch": 0.134877193091225, + "grad_norm": 9.635804176330566, + "learning_rate": 6.888559447254581e-06, + "loss": 0.5713, + "mean_token_accuracy": 0.8345887675881386, + "num_tokens": 52293144.0, + "step": 43510 + }, + { + "entropy": 1.8088420122861861, + "epoch": 0.1349081922162747, + "grad_norm": 7.230260372161865, + "learning_rate": 6.887767959029426e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8482095554471016, + "num_tokens": 52305778.0, + "step": 43520 + }, + { + "entropy": 1.7873062670230866, + "epoch": 0.1349391913413244, + "grad_norm": 7.785470008850098, + "learning_rate": 6.886976743565082e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.847273476421833, + "num_tokens": 52318326.0, + "step": 43530 + }, + { + "entropy": 1.8959727630019187, + "epoch": 0.1349701904663741, + "grad_norm": 4.0128254890441895, + "learning_rate": 6.886185800704923e-06, + "loss": 0.5779, + "mean_token_accuracy": 0.8261519700288773, + "num_tokens": 52329198.0, + "step": 43540 + }, + { + "entropy": 1.8708949625492095, + "epoch": 0.1350011895914238, + "grad_norm": 8.658716201782227, + "learning_rate": 6.8853951302924424e-06, + "loss": 0.5234, + "mean_token_accuracy": 0.8351444467902184, + "num_tokens": 52340797.0, + "step": 43550 + }, + { + "entropy": 1.9420608460903168, + "epoch": 0.1350321887164735, + "grad_norm": 8.919011116027832, + "learning_rate": 6.884604732171271e-06, + "loss": 0.6175, + "mean_token_accuracy": 0.8187188416719436, + "num_tokens": 52352295.0, + "step": 43560 + }, + { + "entropy": 1.8305167585611344, + "epoch": 0.13506318784152319, + "grad_norm": 7.717691898345947, + "learning_rate": 6.883814606185152e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.8335830017924308, + "num_tokens": 52364889.0, + "step": 43570 + }, + { + "entropy": 1.8474395513534545, + "epoch": 0.13509418696657288, + "grad_norm": 4.04104471206665, + "learning_rate": 6.883024752177963e-06, + "loss": 0.5319, + "mean_token_accuracy": 0.8278833538293838, + "num_tokens": 52377375.0, + "step": 43580 + }, + { + "entropy": 1.8440303042531014, + "epoch": 0.13512518609162255, + "grad_norm": 8.369099617004395, + "learning_rate": 6.882235169993708e-06, + "loss": 0.5352, + "mean_token_accuracy": 0.838912108540535, + "num_tokens": 52389129.0, + "step": 43590 + }, + { + "entropy": 1.7567294076085092, + "epoch": 0.13515618521667225, + "grad_norm": 8.187392234802246, + "learning_rate": 6.881445859476506e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8403543129563331, + "num_tokens": 52402761.0, + "step": 43600 + }, + { + "entropy": 1.854997991025448, + "epoch": 0.13518718434172194, + "grad_norm": 3.2255561351776123, + "learning_rate": 6.880656820470614e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8469349488615989, + "num_tokens": 52414592.0, + "step": 43610 + }, + { + "entropy": 1.9099668189883232, + "epoch": 0.13521818346677164, + "grad_norm": 8.775009155273438, + "learning_rate": 6.8798680528204045e-06, + "loss": 0.5708, + "mean_token_accuracy": 0.8262275233864784, + "num_tokens": 52425981.0, + "step": 43620 + }, + { + "entropy": 1.9568244695663453, + "epoch": 0.13524918259182134, + "grad_norm": 7.415703773498535, + "learning_rate": 6.879079556370377e-06, + "loss": 0.5595, + "mean_token_accuracy": 0.8358083561062812, + "num_tokens": 52436580.0, + "step": 43630 + }, + { + "entropy": 1.87071183770895, + "epoch": 0.13528018171687103, + "grad_norm": 12.355653762817383, + "learning_rate": 6.87829133096516e-06, + "loss": 0.5773, + "mean_token_accuracy": 0.8205010086297989, + "num_tokens": 52449363.0, + "step": 43640 + }, + { + "entropy": 1.7835715875029563, + "epoch": 0.13531118084192073, + "grad_norm": 10.02258014678955, + "learning_rate": 6.877503376449503e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8421624451875687, + "num_tokens": 52462468.0, + "step": 43650 + }, + { + "entropy": 1.834514120221138, + "epoch": 0.13534217996697043, + "grad_norm": 9.3626127243042, + "learning_rate": 6.876715692668278e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.8375687584280967, + "num_tokens": 52475023.0, + "step": 43660 + }, + { + "entropy": 1.9362987548112869, + "epoch": 0.13537317909202012, + "grad_norm": 8.014119148254395, + "learning_rate": 6.875928279466486e-06, + "loss": 0.582, + "mean_token_accuracy": 0.8320098280906677, + "num_tokens": 52486718.0, + "step": 43670 + }, + { + "entropy": 1.9744535475969314, + "epoch": 0.13540417821706982, + "grad_norm": 9.7212495803833, + "learning_rate": 6.87514113668925e-06, + "loss": 0.5774, + "mean_token_accuracy": 0.8281274557113647, + "num_tokens": 52497664.0, + "step": 43680 + }, + { + "entropy": 1.893913634121418, + "epoch": 0.13543517734211952, + "grad_norm": 6.767265796661377, + "learning_rate": 6.874354264181815e-06, + "loss": 0.5278, + "mean_token_accuracy": 0.835284897685051, + "num_tokens": 52509266.0, + "step": 43690 + }, + { + "entropy": 1.985574059188366, + "epoch": 0.1354661764671692, + "grad_norm": 5.145320892333984, + "learning_rate": 6.873567661789554e-06, + "loss": 0.5915, + "mean_token_accuracy": 0.8212827607989311, + "num_tokens": 52520661.0, + "step": 43700 + }, + { + "entropy": 1.8134831815958024, + "epoch": 0.1354971755922189, + "grad_norm": 3.865335702896118, + "learning_rate": 6.872781329357961e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.839052714407444, + "num_tokens": 52533364.0, + "step": 43710 + }, + { + "entropy": 1.9578730061650276, + "epoch": 0.1355281747172686, + "grad_norm": 10.674678802490234, + "learning_rate": 6.871995266732656e-06, + "loss": 0.6165, + "mean_token_accuracy": 0.8204101115465164, + "num_tokens": 52544383.0, + "step": 43720 + }, + { + "entropy": 1.7825062423944473, + "epoch": 0.1355591738423183, + "grad_norm": 8.773101806640625, + "learning_rate": 6.871209473759379e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8400455713272095, + "num_tokens": 52557813.0, + "step": 43730 + }, + { + "entropy": 1.9574034690856934, + "epoch": 0.135590172967368, + "grad_norm": 9.174714088439941, + "learning_rate": 6.870423950283998e-06, + "loss": 0.5867, + "mean_token_accuracy": 0.8255863264203072, + "num_tokens": 52569076.0, + "step": 43740 + }, + { + "entropy": 1.9655730903148652, + "epoch": 0.1356211720924177, + "grad_norm": 10.547797203063965, + "learning_rate": 6.869638696152497e-06, + "loss": 0.602, + "mean_token_accuracy": 0.8277508243918419, + "num_tokens": 52579434.0, + "step": 43750 + }, + { + "entropy": 1.8552222028374672, + "epoch": 0.1356521712174674, + "grad_norm": 8.642329216003418, + "learning_rate": 6.868853711210994e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8307379022240639, + "num_tokens": 52592462.0, + "step": 43760 + }, + { + "entropy": 1.897879421710968, + "epoch": 0.1356831703425171, + "grad_norm": 4.2802042961120605, + "learning_rate": 6.868068995305721e-06, + "loss": 0.504, + "mean_token_accuracy": 0.830087074637413, + "num_tokens": 52605129.0, + "step": 43770 + }, + { + "entropy": 1.9668879002332686, + "epoch": 0.1357141694675668, + "grad_norm": 10.869790077209473, + "learning_rate": 6.8672845482830375e-06, + "loss": 0.63, + "mean_token_accuracy": 0.8210312753915787, + "num_tokens": 52616024.0, + "step": 43780 + }, + { + "entropy": 1.909705390036106, + "epoch": 0.13574516859261648, + "grad_norm": 8.73975658416748, + "learning_rate": 6.866500369989424e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.8324807018041611, + "num_tokens": 52627564.0, + "step": 43790 + }, + { + "entropy": 1.8812494575977325, + "epoch": 0.13577616771766618, + "grad_norm": 9.084870338439941, + "learning_rate": 6.865716460271482e-06, + "loss": 0.5365, + "mean_token_accuracy": 0.8389278829097748, + "num_tokens": 52639374.0, + "step": 43800 + }, + { + "entropy": 1.9060865387320518, + "epoch": 0.13580716684271588, + "grad_norm": 8.113476753234863, + "learning_rate": 6.86493281897594e-06, + "loss": 0.5409, + "mean_token_accuracy": 0.8345111593604088, + "num_tokens": 52650722.0, + "step": 43810 + }, + { + "entropy": 1.8717187106609345, + "epoch": 0.13583816596776557, + "grad_norm": 9.743805885314941, + "learning_rate": 6.864149445949645e-06, + "loss": 0.5383, + "mean_token_accuracy": 0.8225096851587296, + "num_tokens": 52663058.0, + "step": 43820 + }, + { + "entropy": 1.8713490128517152, + "epoch": 0.13586916509281524, + "grad_norm": 8.773591995239258, + "learning_rate": 6.86336634103957e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.8360903859138489, + "num_tokens": 52675165.0, + "step": 43830 + }, + { + "entropy": 1.8310488358139991, + "epoch": 0.13590016421786494, + "grad_norm": 4.114312648773193, + "learning_rate": 6.862583504092806e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8354193419218063, + "num_tokens": 52686891.0, + "step": 43840 + }, + { + "entropy": 1.8633979707956314, + "epoch": 0.13593116334291463, + "grad_norm": 9.394161224365234, + "learning_rate": 6.861800934956568e-06, + "loss": 0.5265, + "mean_token_accuracy": 0.8260962456464768, + "num_tokens": 52698634.0, + "step": 43850 + }, + { + "entropy": 1.8397690102458, + "epoch": 0.13596216246796433, + "grad_norm": 8.046875, + "learning_rate": 6.861018633478194e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8343220546841621, + "num_tokens": 52711492.0, + "step": 43860 + }, + { + "entropy": 1.8941639497876168, + "epoch": 0.13599316159301403, + "grad_norm": 10.234258651733398, + "learning_rate": 6.8602365995051445e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.8383172944188118, + "num_tokens": 52723250.0, + "step": 43870 + }, + { + "entropy": 1.8545018136501312, + "epoch": 0.13602416071806372, + "grad_norm": 13.808713912963867, + "learning_rate": 6.8594548328849984e-06, + "loss": 0.52, + "mean_token_accuracy": 0.8330124363303184, + "num_tokens": 52734990.0, + "step": 43880 + }, + { + "entropy": 1.8971347585320473, + "epoch": 0.13605515984311342, + "grad_norm": 9.022436141967773, + "learning_rate": 6.858673333465455e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.8370542734861374, + "num_tokens": 52746095.0, + "step": 43890 + }, + { + "entropy": 1.8831921055912972, + "epoch": 0.13608615896816312, + "grad_norm": 11.447751998901367, + "learning_rate": 6.8578921010943434e-06, + "loss": 0.5254, + "mean_token_accuracy": 0.8259268268942833, + "num_tokens": 52758290.0, + "step": 43900 + }, + { + "entropy": 1.8786703854799272, + "epoch": 0.13611715809321281, + "grad_norm": 7.6989898681640625, + "learning_rate": 6.857111135619603e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8493966236710548, + "num_tokens": 52770682.0, + "step": 43910 + }, + { + "entropy": 1.8685062855482102, + "epoch": 0.1361481572182625, + "grad_norm": 9.15949821472168, + "learning_rate": 6.856330436889304e-06, + "loss": 0.555, + "mean_token_accuracy": 0.8257764980196953, + "num_tokens": 52782612.0, + "step": 43920 + }, + { + "entropy": 1.9584510385990144, + "epoch": 0.1361791563433122, + "grad_norm": 8.610404968261719, + "learning_rate": 6.855550004751631e-06, + "loss": 0.6142, + "mean_token_accuracy": 0.8210650518536567, + "num_tokens": 52793680.0, + "step": 43930 + }, + { + "entropy": 1.8771604105830193, + "epoch": 0.1362101554683619, + "grad_norm": 9.448259353637695, + "learning_rate": 6.854769839054892e-06, + "loss": 0.578, + "mean_token_accuracy": 0.8262861162424088, + "num_tokens": 52804679.0, + "step": 43940 + }, + { + "entropy": 1.9119238778948784, + "epoch": 0.1362411545934116, + "grad_norm": 7.940917015075684, + "learning_rate": 6.853989939647518e-06, + "loss": 0.5228, + "mean_token_accuracy": 0.8279161244630814, + "num_tokens": 52816435.0, + "step": 43950 + }, + { + "entropy": 1.9507309287786483, + "epoch": 0.1362721537184613, + "grad_norm": 8.971407890319824, + "learning_rate": 6.853210306378055e-06, + "loss": 0.5933, + "mean_token_accuracy": 0.8229886546730996, + "num_tokens": 52827096.0, + "step": 43960 + }, + { + "entropy": 2.0049089699983598, + "epoch": 0.136303152843511, + "grad_norm": 9.031655311584473, + "learning_rate": 6.852430939095177e-06, + "loss": 0.6456, + "mean_token_accuracy": 0.8153550997376442, + "num_tokens": 52838566.0, + "step": 43970 + }, + { + "entropy": 1.9605593144893647, + "epoch": 0.1363341519685607, + "grad_norm": 9.522255897521973, + "learning_rate": 6.851651837647672e-06, + "loss": 0.6225, + "mean_token_accuracy": 0.8113553240895272, + "num_tokens": 52848874.0, + "step": 43980 + }, + { + "entropy": 1.9099364891648292, + "epoch": 0.1363651510936104, + "grad_norm": 9.196107864379883, + "learning_rate": 6.85087300188445e-06, + "loss": 0.5653, + "mean_token_accuracy": 0.8262931287288666, + "num_tokens": 52861218.0, + "step": 43990 + }, + { + "entropy": 1.911503429710865, + "epoch": 0.13639615021866008, + "grad_norm": 9.200566291809082, + "learning_rate": 6.850094431654544e-06, + "loss": 0.5383, + "mean_token_accuracy": 0.8331650167703628, + "num_tokens": 52872651.0, + "step": 44000 + }, + { + "entropy": 1.8342396020889282, + "epoch": 0.13642714934370978, + "grad_norm": 4.3369903564453125, + "learning_rate": 6.849316126807107e-06, + "loss": 0.507, + "mean_token_accuracy": 0.8284252122044563, + "num_tokens": 52885546.0, + "step": 44010 + }, + { + "entropy": 1.8490222096443176, + "epoch": 0.13645814846875948, + "grad_norm": 7.278261184692383, + "learning_rate": 6.848538087191405e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8356017455458641, + "num_tokens": 52897211.0, + "step": 44020 + }, + { + "entropy": 1.8598289757966995, + "epoch": 0.13648914759380917, + "grad_norm": 8.271986961364746, + "learning_rate": 6.8477603126568325e-06, + "loss": 0.5433, + "mean_token_accuracy": 0.8204525783658028, + "num_tokens": 52908373.0, + "step": 44030 + }, + { + "entropy": 1.8169844523072243, + "epoch": 0.13652014671885887, + "grad_norm": 9.633807182312012, + "learning_rate": 6.846982803052898e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8419378191232681, + "num_tokens": 52921312.0, + "step": 44040 + }, + { + "entropy": 1.8356873735785484, + "epoch": 0.13655114584390857, + "grad_norm": 10.545353889465332, + "learning_rate": 6.846205558229234e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8387926116585731, + "num_tokens": 52934203.0, + "step": 44050 + }, + { + "entropy": 1.9119671627879142, + "epoch": 0.13658214496895826, + "grad_norm": 10.298490524291992, + "learning_rate": 6.845428578035587e-06, + "loss": 0.5507, + "mean_token_accuracy": 0.8357212603092193, + "num_tokens": 52945457.0, + "step": 44060 + }, + { + "entropy": 1.9574100762605666, + "epoch": 0.13661314409400796, + "grad_norm": 7.986303806304932, + "learning_rate": 6.8446518623218284e-06, + "loss": 0.5709, + "mean_token_accuracy": 0.8229815408587455, + "num_tokens": 52956675.0, + "step": 44070 + }, + { + "entropy": 1.9151659101247787, + "epoch": 0.13664414321905763, + "grad_norm": 8.7662935256958, + "learning_rate": 6.843875410937946e-06, + "loss": 0.5161, + "mean_token_accuracy": 0.8375581040978431, + "num_tokens": 52967730.0, + "step": 44080 + }, + { + "entropy": 1.8822844669222831, + "epoch": 0.13667514234410733, + "grad_norm": 8.069694519042969, + "learning_rate": 6.8430992237340455e-06, + "loss": 0.5308, + "mean_token_accuracy": 0.8303315103054046, + "num_tokens": 52979174.0, + "step": 44090 + }, + { + "entropy": 1.83747291713953, + "epoch": 0.13670614146915702, + "grad_norm": 8.96426773071289, + "learning_rate": 6.8423233005603554e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.8289276763796807, + "num_tokens": 52991305.0, + "step": 44100 + }, + { + "entropy": 1.8574891343712807, + "epoch": 0.13673714059420672, + "grad_norm": 8.317970275878906, + "learning_rate": 6.8415476412672185e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.8376831218600274, + "num_tokens": 53002731.0, + "step": 44110 + }, + { + "entropy": 1.929608154296875, + "epoch": 0.13676813971925642, + "grad_norm": 8.995396614074707, + "learning_rate": 6.8407722457051005e-06, + "loss": 0.5975, + "mean_token_accuracy": 0.8171190902590751, + "num_tokens": 53013927.0, + "step": 44120 + }, + { + "entropy": 1.777741050720215, + "epoch": 0.1367991388443061, + "grad_norm": 9.382816314697266, + "learning_rate": 6.839997113724582e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8454552739858627, + "num_tokens": 53026933.0, + "step": 44130 + }, + { + "entropy": 1.8432798728346824, + "epoch": 0.1368301379693558, + "grad_norm": 7.455301284790039, + "learning_rate": 6.839222245176366e-06, + "loss": 0.5518, + "mean_token_accuracy": 0.837872776389122, + "num_tokens": 53039377.0, + "step": 44140 + }, + { + "entropy": 1.8326244458556176, + "epoch": 0.1368611370944055, + "grad_norm": 8.096274375915527, + "learning_rate": 6.838447639911271e-06, + "loss": 0.5404, + "mean_token_accuracy": 0.8261276423931122, + "num_tokens": 53050778.0, + "step": 44150 + }, + { + "entropy": 1.7385433629155158, + "epoch": 0.1368921362194552, + "grad_norm": 8.745050430297852, + "learning_rate": 6.837673297780233e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8421786040067673, + "num_tokens": 53064304.0, + "step": 44160 + }, + { + "entropy": 1.8994769722223281, + "epoch": 0.1369231353445049, + "grad_norm": 9.911214828491211, + "learning_rate": 6.836899218634308e-06, + "loss": 0.5867, + "mean_token_accuracy": 0.8285346269607544, + "num_tokens": 53076071.0, + "step": 44170 + }, + { + "entropy": 1.87768052816391, + "epoch": 0.1369541344695546, + "grad_norm": 8.06799602508545, + "learning_rate": 6.836125402324671e-06, + "loss": 0.5769, + "mean_token_accuracy": 0.8310927867889404, + "num_tokens": 53087494.0, + "step": 44180 + }, + { + "entropy": 1.9110476791858673, + "epoch": 0.1369851335946043, + "grad_norm": 9.325477600097656, + "learning_rate": 6.835351848702615e-06, + "loss": 0.6014, + "mean_token_accuracy": 0.824842332303524, + "num_tokens": 53098984.0, + "step": 44190 + }, + { + "entropy": 1.8674473196268082, + "epoch": 0.137016132719654, + "grad_norm": 9.16952896118164, + "learning_rate": 6.834578557619546e-06, + "loss": 0.5374, + "mean_token_accuracy": 0.8323730126023292, + "num_tokens": 53110328.0, + "step": 44200 + }, + { + "entropy": 1.801041378080845, + "epoch": 0.13704713184470368, + "grad_norm": 4.72969388961792, + "learning_rate": 6.8338055289269914e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.840267626941204, + "num_tokens": 53123078.0, + "step": 44210 + }, + { + "entropy": 1.9226965308189392, + "epoch": 0.13707813096975338, + "grad_norm": 9.973706245422363, + "learning_rate": 6.8330327624765955e-06, + "loss": 0.6247, + "mean_token_accuracy": 0.8245911300182343, + "num_tokens": 53133601.0, + "step": 44220 + }, + { + "entropy": 1.9381461694836617, + "epoch": 0.13710913009480308, + "grad_norm": 8.738226890563965, + "learning_rate": 6.832260258120124e-06, + "loss": 0.5691, + "mean_token_accuracy": 0.8299503594636917, + "num_tokens": 53145004.0, + "step": 44230 + }, + { + "entropy": 1.7989469826221467, + "epoch": 0.13714012921985277, + "grad_norm": 9.312309265136719, + "learning_rate": 6.831488015709451e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8428971692919731, + "num_tokens": 53158057.0, + "step": 44240 + }, + { + "entropy": 1.9178980767726899, + "epoch": 0.13717112834490247, + "grad_norm": 8.284287452697754, + "learning_rate": 6.830716035096575e-06, + "loss": 0.5569, + "mean_token_accuracy": 0.8296015933156013, + "num_tokens": 53169211.0, + "step": 44250 + }, + { + "entropy": 1.8435182958841323, + "epoch": 0.13720212746995217, + "grad_norm": 9.238348007202148, + "learning_rate": 6.82994431613361e-06, + "loss": 0.5224, + "mean_token_accuracy": 0.8327393636107445, + "num_tokens": 53180990.0, + "step": 44260 + }, + { + "entropy": 1.8937449261546135, + "epoch": 0.13723312659500186, + "grad_norm": 4.894327163696289, + "learning_rate": 6.829172858672786e-06, + "loss": 0.5545, + "mean_token_accuracy": 0.827644082903862, + "num_tokens": 53192974.0, + "step": 44270 + }, + { + "entropy": 1.910013110935688, + "epoch": 0.13726412572005156, + "grad_norm": 7.417638301849365, + "learning_rate": 6.828401662566448e-06, + "loss": 0.5548, + "mean_token_accuracy": 0.8346083343029023, + "num_tokens": 53204669.0, + "step": 44280 + }, + { + "entropy": 1.878174401819706, + "epoch": 0.13729512484510126, + "grad_norm": 8.256863594055176, + "learning_rate": 6.827630727667063e-06, + "loss": 0.5648, + "mean_token_accuracy": 0.8261244371533394, + "num_tokens": 53216346.0, + "step": 44290 + }, + { + "entropy": 1.8674972161650658, + "epoch": 0.13732612397015095, + "grad_norm": 8.606951713562012, + "learning_rate": 6.826860053827209e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.832400643825531, + "num_tokens": 53228130.0, + "step": 44300 + }, + { + "entropy": 1.8570816084742545, + "epoch": 0.13735712309520065, + "grad_norm": 8.84512710571289, + "learning_rate": 6.826089640899584e-06, + "loss": 0.5438, + "mean_token_accuracy": 0.8290506705641747, + "num_tokens": 53240500.0, + "step": 44310 + }, + { + "entropy": 1.9010854482650756, + "epoch": 0.13738812222025035, + "grad_norm": 9.149563789367676, + "learning_rate": 6.825319488737001e-06, + "loss": 0.5429, + "mean_token_accuracy": 0.8214688524603844, + "num_tokens": 53252154.0, + "step": 44320 + }, + { + "entropy": 1.8761808782815934, + "epoch": 0.13741912134530002, + "grad_norm": 8.693964958190918, + "learning_rate": 6.824549597192389e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8360074177384377, + "num_tokens": 53264128.0, + "step": 44330 + }, + { + "entropy": 1.8591908350586892, + "epoch": 0.1374501204703497, + "grad_norm": 6.832935810089111, + "learning_rate": 6.823779966118794e-06, + "loss": 0.5326, + "mean_token_accuracy": 0.831575682759285, + "num_tokens": 53275450.0, + "step": 44340 + }, + { + "entropy": 1.840942220389843, + "epoch": 0.1374811195953994, + "grad_norm": 8.024560928344727, + "learning_rate": 6.823010595369376e-06, + "loss": 0.481, + "mean_token_accuracy": 0.836443580687046, + "num_tokens": 53287524.0, + "step": 44350 + }, + { + "entropy": 1.9482527136802674, + "epoch": 0.1375121187204491, + "grad_norm": 9.615019798278809, + "learning_rate": 6.8222414847974136e-06, + "loss": 0.565, + "mean_token_accuracy": 0.8262650355696678, + "num_tokens": 53298379.0, + "step": 44360 + }, + { + "entropy": 1.8953028246760368, + "epoch": 0.1375431178454988, + "grad_norm": 5.4591169357299805, + "learning_rate": 6.821472634256301e-06, + "loss": 0.5462, + "mean_token_accuracy": 0.8326457425951957, + "num_tokens": 53310130.0, + "step": 44370 + }, + { + "entropy": 1.9527557328343392, + "epoch": 0.1375741169705485, + "grad_norm": 9.788069725036621, + "learning_rate": 6.820704043599545e-06, + "loss": 0.5806, + "mean_token_accuracy": 0.8213037893176078, + "num_tokens": 53321390.0, + "step": 44380 + }, + { + "entropy": 1.8823087111115455, + "epoch": 0.1376051160955982, + "grad_norm": 9.298927307128906, + "learning_rate": 6.819935712680769e-06, + "loss": 0.5455, + "mean_token_accuracy": 0.8245194494724274, + "num_tokens": 53333069.0, + "step": 44390 + }, + { + "entropy": 1.920275841653347, + "epoch": 0.1376361152206479, + "grad_norm": 8.774785041809082, + "learning_rate": 6.819167641353716e-06, + "loss": 0.5892, + "mean_token_accuracy": 0.8146108031272888, + "num_tokens": 53344813.0, + "step": 44400 + }, + { + "entropy": 1.87754784822464, + "epoch": 0.1376671143456976, + "grad_norm": 4.192923545837402, + "learning_rate": 6.818399829472239e-06, + "loss": 0.5525, + "mean_token_accuracy": 0.8271669283509254, + "num_tokens": 53357822.0, + "step": 44410 + }, + { + "entropy": 1.7767975226044654, + "epoch": 0.13769811347074729, + "grad_norm": 9.163516998291016, + "learning_rate": 6.8176322768903065e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8373310551047325, + "num_tokens": 53371230.0, + "step": 44420 + }, + { + "entropy": 1.8346783101558686, + "epoch": 0.13772911259579698, + "grad_norm": 8.969266891479492, + "learning_rate": 6.816864983462007e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8468596264719963, + "num_tokens": 53383491.0, + "step": 44430 + }, + { + "entropy": 1.8975114315748214, + "epoch": 0.13776011172084668, + "grad_norm": 8.912147521972656, + "learning_rate": 6.816097949041537e-06, + "loss": 0.5409, + "mean_token_accuracy": 0.8271856248378754, + "num_tokens": 53395233.0, + "step": 44440 + }, + { + "entropy": 1.7993320614099502, + "epoch": 0.13779111084589638, + "grad_norm": 4.392065048217773, + "learning_rate": 6.815331173483213e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8502204686403274, + "num_tokens": 53407103.0, + "step": 44450 + }, + { + "entropy": 1.7951432079076768, + "epoch": 0.13782210997094607, + "grad_norm": 8.865302085876465, + "learning_rate": 6.8145646566414645e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.8381197184324265, + "num_tokens": 53418949.0, + "step": 44460 + }, + { + "entropy": 1.787670373916626, + "epoch": 0.13785310909599577, + "grad_norm": 9.359115600585938, + "learning_rate": 6.813798398370836e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8410790026187897, + "num_tokens": 53431653.0, + "step": 44470 + }, + { + "entropy": 1.8918751433491707, + "epoch": 0.13788410822104546, + "grad_norm": 8.384889602661133, + "learning_rate": 6.813032398525985e-06, + "loss": 0.5482, + "mean_token_accuracy": 0.8350237265229226, + "num_tokens": 53443023.0, + "step": 44480 + }, + { + "entropy": 1.8514575868844987, + "epoch": 0.13791510734609516, + "grad_norm": 9.768966674804688, + "learning_rate": 6.812266656961686e-06, + "loss": 0.5369, + "mean_token_accuracy": 0.8352868661284447, + "num_tokens": 53454879.0, + "step": 44490 + }, + { + "entropy": 1.7858368843793868, + "epoch": 0.13794610647114486, + "grad_norm": 10.17784309387207, + "learning_rate": 6.811501173532825e-06, + "loss": 0.5214, + "mean_token_accuracy": 0.8304376199841499, + "num_tokens": 53467387.0, + "step": 44500 + }, + { + "entropy": 1.8139680430293084, + "epoch": 0.13797710559619455, + "grad_norm": 3.656538724899292, + "learning_rate": 6.810735948094402e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8408811286091804, + "num_tokens": 53479197.0, + "step": 44510 + }, + { + "entropy": 1.8897014811635018, + "epoch": 0.13800810472124425, + "grad_norm": 8.641545295715332, + "learning_rate": 6.809970980501534e-06, + "loss": 0.5551, + "mean_token_accuracy": 0.8215809658169746, + "num_tokens": 53490882.0, + "step": 44520 + }, + { + "entropy": 1.9230589419603348, + "epoch": 0.13803910384629395, + "grad_norm": 10.131928443908691, + "learning_rate": 6.80920627060945e-06, + "loss": 0.5744, + "mean_token_accuracy": 0.8328511983156204, + "num_tokens": 53502073.0, + "step": 44530 + }, + { + "entropy": 1.924891072511673, + "epoch": 0.13807010297134364, + "grad_norm": 8.855517387390137, + "learning_rate": 6.808441818273496e-06, + "loss": 0.5667, + "mean_token_accuracy": 0.8318179234862327, + "num_tokens": 53513609.0, + "step": 44540 + }, + { + "entropy": 1.9333911418914795, + "epoch": 0.13810110209639334, + "grad_norm": 8.190418243408203, + "learning_rate": 6.807677623349122e-06, + "loss": 0.6013, + "mean_token_accuracy": 0.8251995086669922, + "num_tokens": 53524500.0, + "step": 44550 + }, + { + "entropy": 1.8267549514770507, + "epoch": 0.13813210122144304, + "grad_norm": 9.940940856933594, + "learning_rate": 6.806913685691902e-06, + "loss": 0.5157, + "mean_token_accuracy": 0.8373084679245949, + "num_tokens": 53537133.0, + "step": 44560 + }, + { + "entropy": 1.8623538598418237, + "epoch": 0.1381631003464927, + "grad_norm": 9.177823066711426, + "learning_rate": 6.806150005157519e-06, + "loss": 0.5458, + "mean_token_accuracy": 0.8380098685622215, + "num_tokens": 53548906.0, + "step": 44570 + }, + { + "entropy": 1.8576925709843635, + "epoch": 0.1381940994715424, + "grad_norm": 4.811609268188477, + "learning_rate": 6.805386581601771e-06, + "loss": 0.5152, + "mean_token_accuracy": 0.8375003471970558, + "num_tokens": 53561573.0, + "step": 44580 + }, + { + "entropy": 1.8996227890253068, + "epoch": 0.1382250985965921, + "grad_norm": 9.413639068603516, + "learning_rate": 6.804623414880566e-06, + "loss": 0.5631, + "mean_token_accuracy": 0.8216565132141114, + "num_tokens": 53572897.0, + "step": 44590 + }, + { + "entropy": 1.9721861988306046, + "epoch": 0.1382560977216418, + "grad_norm": 8.800419807434082, + "learning_rate": 6.803860504849928e-06, + "loss": 0.5792, + "mean_token_accuracy": 0.8295346736907959, + "num_tokens": 53583631.0, + "step": 44600 + }, + { + "entropy": 1.848584523051977, + "epoch": 0.1382870968466915, + "grad_norm": 10.385255813598633, + "learning_rate": 6.803097851365994e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.8350134581327439, + "num_tokens": 53596466.0, + "step": 44610 + }, + { + "entropy": 1.854781810939312, + "epoch": 0.1383180959717412, + "grad_norm": 9.363265991210938, + "learning_rate": 6.8023354542850115e-06, + "loss": 0.5288, + "mean_token_accuracy": 0.8366737559437751, + "num_tokens": 53608422.0, + "step": 44620 + }, + { + "entropy": 1.9337295114994049, + "epoch": 0.1383490950967909, + "grad_norm": 10.87320613861084, + "learning_rate": 6.8015733134633434e-06, + "loss": 0.5616, + "mean_token_accuracy": 0.8331938117742539, + "num_tokens": 53619179.0, + "step": 44630 + }, + { + "entropy": 1.864050853252411, + "epoch": 0.13838009422184058, + "grad_norm": 9.210199356079102, + "learning_rate": 6.800811428757463e-06, + "loss": 0.5562, + "mean_token_accuracy": 0.8297410145401954, + "num_tokens": 53631637.0, + "step": 44640 + }, + { + "entropy": 1.8213204242289067, + "epoch": 0.13841109334689028, + "grad_norm": 8.101444244384766, + "learning_rate": 6.80004980002396e-06, + "loss": 0.448, + "mean_token_accuracy": 0.833535049855709, + "num_tokens": 53644709.0, + "step": 44650 + }, + { + "entropy": 1.8359388574957847, + "epoch": 0.13844209247193998, + "grad_norm": 7.781729221343994, + "learning_rate": 6.799288427119529e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8498460426926613, + "num_tokens": 53656295.0, + "step": 44660 + }, + { + "entropy": 1.7105117201805116, + "epoch": 0.13847309159698967, + "grad_norm": 7.457498073577881, + "learning_rate": 6.798527309900985e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8544366523623467, + "num_tokens": 53670465.0, + "step": 44670 + }, + { + "entropy": 1.913690346479416, + "epoch": 0.13850409072203937, + "grad_norm": 9.645509719848633, + "learning_rate": 6.797766448225251e-06, + "loss": 0.5731, + "mean_token_accuracy": 0.8224819481372834, + "num_tokens": 53681032.0, + "step": 44680 + }, + { + "entropy": 1.7610852643847466, + "epoch": 0.13853508984708907, + "grad_norm": 9.441847801208496, + "learning_rate": 6.797005841949362e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8374917894601822, + "num_tokens": 53693660.0, + "step": 44690 + }, + { + "entropy": 1.8001830980181694, + "epoch": 0.13856608897213876, + "grad_norm": 7.770538806915283, + "learning_rate": 6.796245490930466e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8382193520665169, + "num_tokens": 53706447.0, + "step": 44700 + }, + { + "entropy": 1.8964716732501983, + "epoch": 0.13859708809718846, + "grad_norm": 10.237454414367676, + "learning_rate": 6.795485395025823e-06, + "loss": 0.63, + "mean_token_accuracy": 0.8149755626916886, + "num_tokens": 53717276.0, + "step": 44710 + }, + { + "entropy": 1.8226286932826041, + "epoch": 0.13862808722223816, + "grad_norm": 8.327227592468262, + "learning_rate": 6.794725554092804e-06, + "loss": 0.532, + "mean_token_accuracy": 0.8249738708138465, + "num_tokens": 53729351.0, + "step": 44720 + }, + { + "entropy": 1.8385003119707108, + "epoch": 0.13865908634728785, + "grad_norm": 8.358758926391602, + "learning_rate": 6.793965967988893e-06, + "loss": 0.5196, + "mean_token_accuracy": 0.823200698196888, + "num_tokens": 53741541.0, + "step": 44730 + }, + { + "entropy": 1.8498780086636544, + "epoch": 0.13869008547233755, + "grad_norm": 9.434165000915527, + "learning_rate": 6.793206636571682e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8404179245233536, + "num_tokens": 53753633.0, + "step": 44740 + }, + { + "entropy": 1.7737918436527251, + "epoch": 0.13872108459738725, + "grad_norm": 9.19772720336914, + "learning_rate": 6.792447559698879e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8522493004798889, + "num_tokens": 53766858.0, + "step": 44750 + }, + { + "entropy": 1.928390011191368, + "epoch": 0.13875208372243694, + "grad_norm": 9.316328048706055, + "learning_rate": 6.791688737228301e-06, + "loss": 0.5819, + "mean_token_accuracy": 0.8271341755986213, + "num_tokens": 53777506.0, + "step": 44760 + }, + { + "entropy": 1.7646124705672264, + "epoch": 0.13878308284748664, + "grad_norm": 3.9269862174987793, + "learning_rate": 6.790930169017873e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8518116131424904, + "num_tokens": 53790737.0, + "step": 44770 + }, + { + "entropy": 1.8543679133057593, + "epoch": 0.13881408197253634, + "grad_norm": 4.760611534118652, + "learning_rate": 6.790171854925639e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8325586140155792, + "num_tokens": 53803127.0, + "step": 44780 + }, + { + "entropy": 1.9045397624373437, + "epoch": 0.13884508109758603, + "grad_norm": 9.993179321289062, + "learning_rate": 6.789413794809746e-06, + "loss": 0.5373, + "mean_token_accuracy": 0.8328649774193764, + "num_tokens": 53814755.0, + "step": 44790 + }, + { + "entropy": 1.8256755113601684, + "epoch": 0.13887608022263573, + "grad_norm": 9.889800071716309, + "learning_rate": 6.788655988528456e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8507124841213226, + "num_tokens": 53827255.0, + "step": 44800 + }, + { + "entropy": 1.8313057616353035, + "epoch": 0.13890707934768542, + "grad_norm": 8.37853717803955, + "learning_rate": 6.787898435940142e-06, + "loss": 0.5314, + "mean_token_accuracy": 0.8321856454014778, + "num_tokens": 53838659.0, + "step": 44810 + }, + { + "entropy": 1.8516384482383728, + "epoch": 0.1389380784727351, + "grad_norm": 4.396270751953125, + "learning_rate": 6.787141136903286e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8430250212550163, + "num_tokens": 53851127.0, + "step": 44820 + }, + { + "entropy": 1.9117999702692032, + "epoch": 0.1389690775977848, + "grad_norm": 4.586118698120117, + "learning_rate": 6.7863840912764766e-06, + "loss": 0.5565, + "mean_token_accuracy": 0.8265089154243469, + "num_tokens": 53862914.0, + "step": 44830 + }, + { + "entropy": 1.956819300353527, + "epoch": 0.1390000767228345, + "grad_norm": 8.921218872070312, + "learning_rate": 6.785627298918424e-06, + "loss": 0.6126, + "mean_token_accuracy": 0.81765276491642, + "num_tokens": 53874681.0, + "step": 44840 + }, + { + "entropy": 1.8389661602675915, + "epoch": 0.13903107584788418, + "grad_norm": 3.82698917388916, + "learning_rate": 6.784870759687936e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8415364250540733, + "num_tokens": 53888519.0, + "step": 44850 + }, + { + "entropy": 1.8882953882217408, + "epoch": 0.13906207497293388, + "grad_norm": 13.667028427124023, + "learning_rate": 6.78411447344394e-06, + "loss": 0.5228, + "mean_token_accuracy": 0.822119127213955, + "num_tokens": 53900801.0, + "step": 44860 + }, + { + "entropy": 1.8657517284154892, + "epoch": 0.13909307409798358, + "grad_norm": 8.731566429138184, + "learning_rate": 6.783358440045469e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8443102672696113, + "num_tokens": 53912704.0, + "step": 44870 + }, + { + "entropy": 1.898390594124794, + "epoch": 0.13912407322303327, + "grad_norm": 8.59327507019043, + "learning_rate": 6.782602659351665e-06, + "loss": 0.5484, + "mean_token_accuracy": 0.8349470987915992, + "num_tokens": 53924654.0, + "step": 44880 + }, + { + "entropy": 1.8541928052902221, + "epoch": 0.13915507234808297, + "grad_norm": 4.574117660522461, + "learning_rate": 6.781847131221781e-06, + "loss": 0.5823, + "mean_token_accuracy": 0.8248166784644126, + "num_tokens": 53936606.0, + "step": 44890 + }, + { + "entropy": 1.929851384460926, + "epoch": 0.13918607147313267, + "grad_norm": 8.305390357971191, + "learning_rate": 6.781091855515185e-06, + "loss": 0.5662, + "mean_token_accuracy": 0.8298721700906754, + "num_tokens": 53948064.0, + "step": 44900 + }, + { + "entropy": 1.9231369107961656, + "epoch": 0.13921707059818236, + "grad_norm": 10.014037132263184, + "learning_rate": 6.780336832091346e-06, + "loss": 0.5509, + "mean_token_accuracy": 0.8278407782316208, + "num_tokens": 53959163.0, + "step": 44910 + }, + { + "entropy": 1.8753797337412834, + "epoch": 0.13924806972323206, + "grad_norm": 4.650575160980225, + "learning_rate": 6.779582060809845e-06, + "loss": 0.5268, + "mean_token_accuracy": 0.8299560397863388, + "num_tokens": 53970669.0, + "step": 44920 + }, + { + "entropy": 1.8801968157291413, + "epoch": 0.13927906884828176, + "grad_norm": 7.466404438018799, + "learning_rate": 6.778827541530377e-06, + "loss": 0.5665, + "mean_token_accuracy": 0.8341574132442474, + "num_tokens": 53981942.0, + "step": 44930 + }, + { + "entropy": 1.9134157732129098, + "epoch": 0.13931006797333145, + "grad_norm": 8.176107406616211, + "learning_rate": 6.7780732741127416e-06, + "loss": 0.5927, + "mean_token_accuracy": 0.8287787228822708, + "num_tokens": 53993567.0, + "step": 44940 + }, + { + "entropy": 1.86323581635952, + "epoch": 0.13934106709838115, + "grad_norm": 10.628385543823242, + "learning_rate": 6.77731925841685e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8439363062381744, + "num_tokens": 54005593.0, + "step": 44950 + }, + { + "entropy": 1.9236777603626252, + "epoch": 0.13937206622343085, + "grad_norm": 10.189666748046875, + "learning_rate": 6.77656549430272e-06, + "loss": 0.5409, + "mean_token_accuracy": 0.8301165014505386, + "num_tokens": 54017353.0, + "step": 44960 + }, + { + "entropy": 1.9137274518609046, + "epoch": 0.13940306534848054, + "grad_norm": 4.35982608795166, + "learning_rate": 6.77581198163048e-06, + "loss": 0.576, + "mean_token_accuracy": 0.8243458449840546, + "num_tokens": 54028577.0, + "step": 44970 + }, + { + "entropy": 1.881200096011162, + "epoch": 0.13943406447353024, + "grad_norm": 8.86418342590332, + "learning_rate": 6.775058720260368e-06, + "loss": 0.6144, + "mean_token_accuracy": 0.831314592063427, + "num_tokens": 54040844.0, + "step": 44980 + }, + { + "entropy": 1.8607298702001571, + "epoch": 0.13946506359857994, + "grad_norm": 8.576333999633789, + "learning_rate": 6.77430571005273e-06, + "loss": 0.5247, + "mean_token_accuracy": 0.831570616364479, + "num_tokens": 54052863.0, + "step": 44990 + }, + { + "entropy": 1.9090143859386444, + "epoch": 0.13949606272362963, + "grad_norm": 4.669756889343262, + "learning_rate": 6.7735529508680195e-06, + "loss": 0.5629, + "mean_token_accuracy": 0.8271263659000396, + "num_tokens": 54064516.0, + "step": 45000 + }, + { + "entropy": 1.8905669406056405, + "epoch": 0.13952706184867933, + "grad_norm": 10.381400108337402, + "learning_rate": 6.772800442566799e-06, + "loss": 0.5365, + "mean_token_accuracy": 0.831216785311699, + "num_tokens": 54076038.0, + "step": 45010 + }, + { + "entropy": 1.7761605516076089, + "epoch": 0.13955806097372903, + "grad_norm": 2.322737216949463, + "learning_rate": 6.772048185009742e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8467302456498146, + "num_tokens": 54089228.0, + "step": 45020 + }, + { + "entropy": 1.9183564558625221, + "epoch": 0.13958906009877872, + "grad_norm": 3.822122097015381, + "learning_rate": 6.771296178057627e-06, + "loss": 0.5374, + "mean_token_accuracy": 0.8354233950376511, + "num_tokens": 54100756.0, + "step": 45030 + }, + { + "entropy": 1.89002455919981, + "epoch": 0.13962005922382842, + "grad_norm": 10.499687194824219, + "learning_rate": 6.770544421571341e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.833263611793518, + "num_tokens": 54112772.0, + "step": 45040 + }, + { + "entropy": 1.8945647366344929, + "epoch": 0.13965105834887812, + "grad_norm": 8.45938777923584, + "learning_rate": 6.7697929154118806e-06, + "loss": 0.5375, + "mean_token_accuracy": 0.8283353626728058, + "num_tokens": 54124878.0, + "step": 45050 + }, + { + "entropy": 1.9062055438756942, + "epoch": 0.1396820574739278, + "grad_norm": 8.998255729675293, + "learning_rate": 6.769041659440348e-06, + "loss": 0.5651, + "mean_token_accuracy": 0.8295639351010322, + "num_tokens": 54136958.0, + "step": 45060 + }, + { + "entropy": 1.9289535745978355, + "epoch": 0.13971305659897748, + "grad_norm": 7.8160505294799805, + "learning_rate": 6.768290653517961e-06, + "loss": 0.5361, + "mean_token_accuracy": 0.8284988284111023, + "num_tokens": 54148685.0, + "step": 45070 + }, + { + "entropy": 1.8776358231902122, + "epoch": 0.13974405572402718, + "grad_norm": 10.067404747009277, + "learning_rate": 6.767539897506031e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8291481390595437, + "num_tokens": 54161099.0, + "step": 45080 + }, + { + "entropy": 1.8345778226852416, + "epoch": 0.13977505484907687, + "grad_norm": 9.045985221862793, + "learning_rate": 6.766789391265992e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.8270506203174591, + "num_tokens": 54173712.0, + "step": 45090 + }, + { + "entropy": 1.9202216163277626, + "epoch": 0.13980605397412657, + "grad_norm": 8.717995643615723, + "learning_rate": 6.7660391346593745e-06, + "loss": 0.6067, + "mean_token_accuracy": 0.820853091776371, + "num_tokens": 54186315.0, + "step": 45100 + }, + { + "entropy": 1.9440720230340958, + "epoch": 0.13983705309917627, + "grad_norm": 7.580258846282959, + "learning_rate": 6.765289127547821e-06, + "loss": 0.5868, + "mean_token_accuracy": 0.8336421042680741, + "num_tokens": 54197829.0, + "step": 45110 + }, + { + "entropy": 1.9380245164036751, + "epoch": 0.13986805222422596, + "grad_norm": 9.486980438232422, + "learning_rate": 6.764539369793085e-06, + "loss": 0.5743, + "mean_token_accuracy": 0.818755242228508, + "num_tokens": 54209626.0, + "step": 45120 + }, + { + "entropy": 1.9459102168679236, + "epoch": 0.13989905134927566, + "grad_norm": 8.933557510375977, + "learning_rate": 6.7637898612570185e-06, + "loss": 0.5483, + "mean_token_accuracy": 0.8371159300208092, + "num_tokens": 54221719.0, + "step": 45130 + }, + { + "entropy": 1.7940280467271805, + "epoch": 0.13993005047432536, + "grad_norm": 10.48863410949707, + "learning_rate": 6.7630406018015884e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8437355652451515, + "num_tokens": 54235073.0, + "step": 45140 + }, + { + "entropy": 1.7776363700628282, + "epoch": 0.13996104959937505, + "grad_norm": 8.434184074401855, + "learning_rate": 6.762291591288863e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8471253350377083, + "num_tokens": 54248626.0, + "step": 45150 + }, + { + "entropy": 1.8552054047584534, + "epoch": 0.13999204872442475, + "grad_norm": 7.70085334777832, + "learning_rate": 6.761542829581025e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8401309624314308, + "num_tokens": 54260486.0, + "step": 45160 + }, + { + "entropy": 1.856016856431961, + "epoch": 0.14002304784947445, + "grad_norm": 10.124335289001465, + "learning_rate": 6.760794316540352e-06, + "loss": 0.5163, + "mean_token_accuracy": 0.8326075285673141, + "num_tokens": 54273686.0, + "step": 45170 + }, + { + "entropy": 1.845085544884205, + "epoch": 0.14005404697452414, + "grad_norm": 4.437714576721191, + "learning_rate": 6.760046052029241e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8339051187038422, + "num_tokens": 54286613.0, + "step": 45180 + }, + { + "entropy": 1.8965241134166717, + "epoch": 0.14008504609957384, + "grad_norm": 8.724405288696289, + "learning_rate": 6.7592980359101864e-06, + "loss": 0.5568, + "mean_token_accuracy": 0.818605862557888, + "num_tokens": 54298537.0, + "step": 45190 + }, + { + "entropy": 1.8351117700338364, + "epoch": 0.14011604522462354, + "grad_norm": 2.729274034500122, + "learning_rate": 6.758550268045797e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.8237488672137261, + "num_tokens": 54312345.0, + "step": 45200 + }, + { + "entropy": 1.8630397498607636, + "epoch": 0.14014704434967323, + "grad_norm": 8.679328918457031, + "learning_rate": 6.757802748298778e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8270406112074852, + "num_tokens": 54324911.0, + "step": 45210 + }, + { + "entropy": 1.8825726188719272, + "epoch": 0.14017804347472293, + "grad_norm": 4.007488250732422, + "learning_rate": 6.757055476531949e-06, + "loss": 0.519, + "mean_token_accuracy": 0.8377444759011269, + "num_tokens": 54337224.0, + "step": 45220 + }, + { + "entropy": 1.978707179427147, + "epoch": 0.14020904259977263, + "grad_norm": 8.847635269165039, + "learning_rate": 6.756308452608234e-06, + "loss": 0.5984, + "mean_token_accuracy": 0.8280522257089615, + "num_tokens": 54348233.0, + "step": 45230 + }, + { + "entropy": 1.9876850560307502, + "epoch": 0.14024004172482232, + "grad_norm": 6.580830097198486, + "learning_rate": 6.755561676390661e-06, + "loss": 0.5664, + "mean_token_accuracy": 0.8235482305288315, + "num_tokens": 54359366.0, + "step": 45240 + }, + { + "entropy": 1.8557663679122924, + "epoch": 0.14027104084987202, + "grad_norm": 8.015181541442871, + "learning_rate": 6.754815147742368e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.8315120905637741, + "num_tokens": 54372683.0, + "step": 45250 + }, + { + "entropy": 1.9107207596302032, + "epoch": 0.14030203997492172, + "grad_norm": 7.833492279052734, + "learning_rate": 6.754068866526591e-06, + "loss": 0.5812, + "mean_token_accuracy": 0.8162291869521141, + "num_tokens": 54384271.0, + "step": 45260 + }, + { + "entropy": 1.8534480392932893, + "epoch": 0.1403330390999714, + "grad_norm": 8.591071128845215, + "learning_rate": 6.753322832606681e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8405616924166679, + "num_tokens": 54396228.0, + "step": 45270 + }, + { + "entropy": 1.9540248334407806, + "epoch": 0.1403640382250211, + "grad_norm": 8.374753952026367, + "learning_rate": 6.752577045846086e-06, + "loss": 0.5839, + "mean_token_accuracy": 0.8295043364167214, + "num_tokens": 54407440.0, + "step": 45280 + }, + { + "entropy": 1.8010128349065782, + "epoch": 0.1403950373500708, + "grad_norm": 8.82718276977539, + "learning_rate": 6.7518315061083694e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8495183497667312, + "num_tokens": 54420654.0, + "step": 45290 + }, + { + "entropy": 1.9086335971951485, + "epoch": 0.1404260364751205, + "grad_norm": 9.302217483520508, + "learning_rate": 6.751086213257192e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.8401185050606728, + "num_tokens": 54432469.0, + "step": 45300 + }, + { + "entropy": 1.84428121894598, + "epoch": 0.14045703560017017, + "grad_norm": 8.570785522460938, + "learning_rate": 6.750341167156322e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.840717563033104, + "num_tokens": 54444860.0, + "step": 45310 + }, + { + "entropy": 1.890746709704399, + "epoch": 0.14048803472521987, + "grad_norm": 4.786993980407715, + "learning_rate": 6.749596367669633e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.8295010164380073, + "num_tokens": 54456895.0, + "step": 45320 + }, + { + "entropy": 1.9245510414242744, + "epoch": 0.14051903385026956, + "grad_norm": 10.287075996398926, + "learning_rate": 6.748851814661106e-06, + "loss": 0.5775, + "mean_token_accuracy": 0.8316981479525566, + "num_tokens": 54469287.0, + "step": 45330 + }, + { + "entropy": 1.918796244263649, + "epoch": 0.14055003297531926, + "grad_norm": 7.518104553222656, + "learning_rate": 6.748107507994823e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.8400950893759728, + "num_tokens": 54481147.0, + "step": 45340 + }, + { + "entropy": 1.878979854285717, + "epoch": 0.14058103210036896, + "grad_norm": 4.404152870178223, + "learning_rate": 6.747363447534975e-06, + "loss": 0.5295, + "mean_token_accuracy": 0.8336639732122422, + "num_tokens": 54492785.0, + "step": 45350 + }, + { + "entropy": 1.8506067097187042, + "epoch": 0.14061203122541865, + "grad_norm": 9.694655418395996, + "learning_rate": 6.746619633145854e-06, + "loss": 0.5202, + "mean_token_accuracy": 0.831046088039875, + "num_tokens": 54505486.0, + "step": 45360 + }, + { + "entropy": 1.851573745906353, + "epoch": 0.14064303035046835, + "grad_norm": 8.404928207397461, + "learning_rate": 6.745876064691858e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8353975236415863, + "num_tokens": 54518430.0, + "step": 45370 + }, + { + "entropy": 1.8510615780949593, + "epoch": 0.14067402947551805, + "grad_norm": 6.233858108520508, + "learning_rate": 6.745132742037491e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.8379595011472702, + "num_tokens": 54530823.0, + "step": 45380 + }, + { + "entropy": 1.8410978406667708, + "epoch": 0.14070502860056774, + "grad_norm": 7.378570556640625, + "learning_rate": 6.744389665047362e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8501470074057579, + "num_tokens": 54543450.0, + "step": 45390 + }, + { + "entropy": 1.811590349674225, + "epoch": 0.14073602772561744, + "grad_norm": 9.43700122833252, + "learning_rate": 6.743646833586182e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8426756337285042, + "num_tokens": 54556418.0, + "step": 45400 + }, + { + "entropy": 1.8802534580230712, + "epoch": 0.14076702685066714, + "grad_norm": 8.894246101379395, + "learning_rate": 6.742904247518765e-06, + "loss": 0.5336, + "mean_token_accuracy": 0.831374479830265, + "num_tokens": 54568516.0, + "step": 45410 + }, + { + "entropy": 1.9234187602996826, + "epoch": 0.14079802597571683, + "grad_norm": 8.303701400756836, + "learning_rate": 6.742161906710033e-06, + "loss": 0.5486, + "mean_token_accuracy": 0.8301004365086555, + "num_tokens": 54580154.0, + "step": 45420 + }, + { + "entropy": 1.854028156399727, + "epoch": 0.14082902510076653, + "grad_norm": 6.356640815734863, + "learning_rate": 6.741419811025011e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8381151556968689, + "num_tokens": 54592717.0, + "step": 45430 + }, + { + "entropy": 1.9734964907169341, + "epoch": 0.14086002422581623, + "grad_norm": 10.328083038330078, + "learning_rate": 6.740677960328828e-06, + "loss": 0.5915, + "mean_token_accuracy": 0.8209626346826553, + "num_tokens": 54603430.0, + "step": 45440 + }, + { + "entropy": 1.973678994178772, + "epoch": 0.14089102335086592, + "grad_norm": 9.18193531036377, + "learning_rate": 6.739936354486713e-06, + "loss": 0.6025, + "mean_token_accuracy": 0.8171725884079933, + "num_tokens": 54614206.0, + "step": 45450 + }, + { + "entropy": 1.9304319381713868, + "epoch": 0.14092202247591562, + "grad_norm": 9.106557846069336, + "learning_rate": 6.7391949933640045e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.8340640947222709, + "num_tokens": 54626347.0, + "step": 45460 + }, + { + "entropy": 1.931530450284481, + "epoch": 0.14095302160096532, + "grad_norm": 10.796926498413086, + "learning_rate": 6.738453876826143e-06, + "loss": 0.561, + "mean_token_accuracy": 0.8309638902544976, + "num_tokens": 54637794.0, + "step": 45470 + }, + { + "entropy": 1.9570015415549278, + "epoch": 0.140984020726015, + "grad_norm": 11.423550605773926, + "learning_rate": 6.7377130047386695e-06, + "loss": 0.5831, + "mean_token_accuracy": 0.820400333404541, + "num_tokens": 54648619.0, + "step": 45480 + }, + { + "entropy": 1.9563427597284317, + "epoch": 0.1410150198510647, + "grad_norm": 7.9254560470581055, + "learning_rate": 6.7369723769672335e-06, + "loss": 0.5795, + "mean_token_accuracy": 0.8188218504190445, + "num_tokens": 54659912.0, + "step": 45490 + }, + { + "entropy": 1.9148031935095786, + "epoch": 0.1410460189761144, + "grad_norm": 7.281384468078613, + "learning_rate": 6.736231993377581e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.8334671080112457, + "num_tokens": 54671882.0, + "step": 45500 + }, + { + "entropy": 1.9391163542866707, + "epoch": 0.1410770181011641, + "grad_norm": 9.63330078125, + "learning_rate": 6.735491853835571e-06, + "loss": 0.5381, + "mean_token_accuracy": 0.8372703313827514, + "num_tokens": 54682803.0, + "step": 45510 + }, + { + "entropy": 1.8276369392871856, + "epoch": 0.1411080172262138, + "grad_norm": 8.537032127380371, + "learning_rate": 6.734751958207155e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8435697957873345, + "num_tokens": 54695679.0, + "step": 45520 + }, + { + "entropy": 1.8639580070972444, + "epoch": 0.1411390163512635, + "grad_norm": 7.994620323181152, + "learning_rate": 6.7340123063583955e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8325873509049415, + "num_tokens": 54708838.0, + "step": 45530 + }, + { + "entropy": 1.8180058419704437, + "epoch": 0.1411700154763132, + "grad_norm": 4.438790321350098, + "learning_rate": 6.733272898155452e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8409150257706642, + "num_tokens": 54721440.0, + "step": 45540 + }, + { + "entropy": 1.8847099259495734, + "epoch": 0.1412010146013629, + "grad_norm": 10.436013221740723, + "learning_rate": 6.732533733464593e-06, + "loss": 0.58, + "mean_token_accuracy": 0.8249880090355873, + "num_tokens": 54733585.0, + "step": 45550 + }, + { + "entropy": 1.913513371348381, + "epoch": 0.14123201372641256, + "grad_norm": 6.460671901702881, + "learning_rate": 6.731794812152185e-06, + "loss": 0.5792, + "mean_token_accuracy": 0.8275411754846573, + "num_tokens": 54744839.0, + "step": 45560 + }, + { + "entropy": 1.9194407686591148, + "epoch": 0.14126301285146226, + "grad_norm": 10.074481964111328, + "learning_rate": 6.7310561340847e-06, + "loss": 0.6015, + "mean_token_accuracy": 0.8203522220253945, + "num_tokens": 54755728.0, + "step": 45570 + }, + { + "entropy": 1.975143238902092, + "epoch": 0.14129401197651195, + "grad_norm": 7.195339679718018, + "learning_rate": 6.73031769912871e-06, + "loss": 0.6573, + "mean_token_accuracy": 0.8187574937939643, + "num_tokens": 54767221.0, + "step": 45580 + }, + { + "entropy": 1.9089319556951523, + "epoch": 0.14132501110156165, + "grad_norm": 9.949457168579102, + "learning_rate": 6.729579507150891e-06, + "loss": 0.5393, + "mean_token_accuracy": 0.8359342351555824, + "num_tokens": 54778850.0, + "step": 45590 + }, + { + "entropy": 1.9323441043496132, + "epoch": 0.14135601022661134, + "grad_norm": 4.49478816986084, + "learning_rate": 6.728841558018021e-06, + "loss": 0.5635, + "mean_token_accuracy": 0.8376797735691071, + "num_tokens": 54790559.0, + "step": 45600 + }, + { + "entropy": 1.9415909707546235, + "epoch": 0.14138700935166104, + "grad_norm": 8.446100234985352, + "learning_rate": 6.72810385159698e-06, + "loss": 0.5561, + "mean_token_accuracy": 0.8303654089570045, + "num_tokens": 54802533.0, + "step": 45610 + }, + { + "entropy": 1.917179961502552, + "epoch": 0.14141800847671074, + "grad_norm": 3.9077000617980957, + "learning_rate": 6.7273663877547516e-06, + "loss": 0.5538, + "mean_token_accuracy": 0.8232975766062737, + "num_tokens": 54814772.0, + "step": 45620 + }, + { + "entropy": 1.8237308233976364, + "epoch": 0.14144900760176043, + "grad_norm": 11.04135799407959, + "learning_rate": 6.726629166358418e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8421555116772652, + "num_tokens": 54827998.0, + "step": 45630 + }, + { + "entropy": 1.876150907576084, + "epoch": 0.14148000672681013, + "grad_norm": 6.673070907592773, + "learning_rate": 6.725892187275168e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.845268502831459, + "num_tokens": 54840719.0, + "step": 45640 + }, + { + "entropy": 1.8938727974891663, + "epoch": 0.14151100585185983, + "grad_norm": 4.026385307312012, + "learning_rate": 6.725155450372289e-06, + "loss": 0.5532, + "mean_token_accuracy": 0.8300805896520614, + "num_tokens": 54852189.0, + "step": 45650 + }, + { + "entropy": 1.8416188895702361, + "epoch": 0.14154200497690952, + "grad_norm": 8.692848205566406, + "learning_rate": 6.724418955517171e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.8287687584757805, + "num_tokens": 54864719.0, + "step": 45660 + }, + { + "entropy": 1.9456066131591796, + "epoch": 0.14157300410195922, + "grad_norm": 9.043981552124023, + "learning_rate": 6.723682702577305e-06, + "loss": 0.6013, + "mean_token_accuracy": 0.8262223243713379, + "num_tokens": 54876153.0, + "step": 45670 + }, + { + "entropy": 1.8887315690517426, + "epoch": 0.14160400322700892, + "grad_norm": 8.423904418945312, + "learning_rate": 6.7229466914202864e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.851503835618496, + "num_tokens": 54888235.0, + "step": 45680 + }, + { + "entropy": 1.862038327753544, + "epoch": 0.14163500235205861, + "grad_norm": 3.738830804824829, + "learning_rate": 6.722210921913808e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8433087572455407, + "num_tokens": 54900600.0, + "step": 45690 + }, + { + "entropy": 1.9590093523263932, + "epoch": 0.1416660014771083, + "grad_norm": 4.451807498931885, + "learning_rate": 6.721475393925665e-06, + "loss": 0.5955, + "mean_token_accuracy": 0.8190687134861946, + "num_tokens": 54911971.0, + "step": 45700 + }, + { + "entropy": 1.8686208486557008, + "epoch": 0.141697000602158, + "grad_norm": 9.367164611816406, + "learning_rate": 6.720740107323755e-06, + "loss": 0.511, + "mean_token_accuracy": 0.8351000919938087, + "num_tokens": 54924188.0, + "step": 45710 + }, + { + "entropy": 1.9188775599002839, + "epoch": 0.1417279997272077, + "grad_norm": 10.714550018310547, + "learning_rate": 6.720005061976077e-06, + "loss": 0.5598, + "mean_token_accuracy": 0.8303559988737106, + "num_tokens": 54935495.0, + "step": 45720 + }, + { + "entropy": 1.8970612213015556, + "epoch": 0.1417589988522574, + "grad_norm": 4.424524307250977, + "learning_rate": 6.7192702577507306e-06, + "loss": 0.5181, + "mean_token_accuracy": 0.8270143359899521, + "num_tokens": 54948599.0, + "step": 45730 + }, + { + "entropy": 1.9638367608189582, + "epoch": 0.1417899979773071, + "grad_norm": 8.639518737792969, + "learning_rate": 6.718535694515915e-06, + "loss": 0.5418, + "mean_token_accuracy": 0.8268617764115334, + "num_tokens": 54960096.0, + "step": 45740 + }, + { + "entropy": 1.920237709581852, + "epoch": 0.1418209971023568, + "grad_norm": 8.430821418762207, + "learning_rate": 6.717801372139931e-06, + "loss": 0.5459, + "mean_token_accuracy": 0.8220153465867043, + "num_tokens": 54972068.0, + "step": 45750 + }, + { + "entropy": 1.7880508966743947, + "epoch": 0.1418519962274065, + "grad_norm": 4.553556442260742, + "learning_rate": 6.717067290491183e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8418186247348786, + "num_tokens": 54985926.0, + "step": 45760 + }, + { + "entropy": 1.9695019006729126, + "epoch": 0.1418829953524562, + "grad_norm": 8.724945068359375, + "learning_rate": 6.716333449438172e-06, + "loss": 0.5232, + "mean_token_accuracy": 0.8376459792256356, + "num_tokens": 54996931.0, + "step": 45770 + }, + { + "entropy": 1.8776928693056107, + "epoch": 0.14191399447750588, + "grad_norm": 4.291085720062256, + "learning_rate": 6.715599848849499e-06, + "loss": 0.5726, + "mean_token_accuracy": 0.822805991768837, + "num_tokens": 55009212.0, + "step": 45780 + }, + { + "entropy": 1.9443933725357057, + "epoch": 0.14194499360255558, + "grad_norm": 8.464522361755371, + "learning_rate": 6.714866488593871e-06, + "loss": 0.5886, + "mean_token_accuracy": 0.8179412320256233, + "num_tokens": 55019971.0, + "step": 45790 + }, + { + "entropy": 1.8963981166481971, + "epoch": 0.14197599272760528, + "grad_norm": 8.489872932434082, + "learning_rate": 6.714133368540089e-06, + "loss": 0.5414, + "mean_token_accuracy": 0.830699360370636, + "num_tokens": 55030869.0, + "step": 45800 + }, + { + "entropy": 1.8366913974285126, + "epoch": 0.14200699185265495, + "grad_norm": 4.7726216316223145, + "learning_rate": 6.713400488557057e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8389330074191094, + "num_tokens": 55043876.0, + "step": 45810 + }, + { + "entropy": 1.8235629141330718, + "epoch": 0.14203799097770464, + "grad_norm": 4.734610080718994, + "learning_rate": 6.712667848513782e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8486215993762016, + "num_tokens": 55056283.0, + "step": 45820 + }, + { + "entropy": 1.7981175884604454, + "epoch": 0.14206899010275434, + "grad_norm": 9.670796394348145, + "learning_rate": 6.711935448279365e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8392187222838402, + "num_tokens": 55069610.0, + "step": 45830 + }, + { + "entropy": 1.8424267813563346, + "epoch": 0.14209998922780404, + "grad_norm": 3.722155809402466, + "learning_rate": 6.711203287723014e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8339389190077782, + "num_tokens": 55083021.0, + "step": 45840 + }, + { + "entropy": 1.877331507205963, + "epoch": 0.14213098835285373, + "grad_norm": 9.001885414123535, + "learning_rate": 6.7104713667140285e-06, + "loss": 0.5586, + "mean_token_accuracy": 0.8358711794018745, + "num_tokens": 55095426.0, + "step": 45850 + }, + { + "entropy": 1.9491191014647484, + "epoch": 0.14216198747790343, + "grad_norm": 4.518219470977783, + "learning_rate": 6.709739685121816e-06, + "loss": 0.6017, + "mean_token_accuracy": 0.8208028897643089, + "num_tokens": 55106699.0, + "step": 45860 + }, + { + "entropy": 1.8255974546074867, + "epoch": 0.14219298660295313, + "grad_norm": 8.47142505645752, + "learning_rate": 6.70900824281588e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8375585183501244, + "num_tokens": 55119196.0, + "step": 45870 + }, + { + "entropy": 1.844706942141056, + "epoch": 0.14222398572800282, + "grad_norm": 3.8983564376831055, + "learning_rate": 6.70827703966582e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8367140933871269, + "num_tokens": 55132280.0, + "step": 45880 + }, + { + "entropy": 1.8609853073954583, + "epoch": 0.14225498485305252, + "grad_norm": 4.357093811035156, + "learning_rate": 6.707546075541341e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8403833657503128, + "num_tokens": 55144004.0, + "step": 45890 + }, + { + "entropy": 1.9344159916043282, + "epoch": 0.14228598397810222, + "grad_norm": 8.425995826721191, + "learning_rate": 6.706815350312245e-06, + "loss": 0.5469, + "mean_token_accuracy": 0.8343609884381294, + "num_tokens": 55155185.0, + "step": 45900 + }, + { + "entropy": 1.9265785560011863, + "epoch": 0.1423169831031519, + "grad_norm": 8.693990707397461, + "learning_rate": 6.706084863848432e-06, + "loss": 0.5486, + "mean_token_accuracy": 0.8264458447694778, + "num_tokens": 55166299.0, + "step": 45910 + }, + { + "entropy": 1.9623035162687301, + "epoch": 0.1423479822282016, + "grad_norm": 7.973003387451172, + "learning_rate": 6.705354616019903e-06, + "loss": 0.589, + "mean_token_accuracy": 0.8248161807656288, + "num_tokens": 55177652.0, + "step": 45920 + }, + { + "entropy": 1.7614901930093765, + "epoch": 0.1423789813532513, + "grad_norm": 10.886859893798828, + "learning_rate": 6.704624606696758e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8515376761555672, + "num_tokens": 55191188.0, + "step": 45930 + }, + { + "entropy": 1.9051169365644456, + "epoch": 0.142409980478301, + "grad_norm": 9.506697654724121, + "learning_rate": 6.7038948357491925e-06, + "loss": 0.6262, + "mean_token_accuracy": 0.8160225167870522, + "num_tokens": 55202436.0, + "step": 45940 + }, + { + "entropy": 1.8652933433651924, + "epoch": 0.1424409796033507, + "grad_norm": 8.896991729736328, + "learning_rate": 6.703165303047507e-06, + "loss": 0.5543, + "mean_token_accuracy": 0.8282453447580338, + "num_tokens": 55213975.0, + "step": 45950 + }, + { + "entropy": 1.8330632865428924, + "epoch": 0.1424719787284004, + "grad_norm": 10.671113967895508, + "learning_rate": 6.702436008462098e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8493910744786263, + "num_tokens": 55226050.0, + "step": 45960 + }, + { + "entropy": 1.9106391370296478, + "epoch": 0.1425029778534501, + "grad_norm": 10.167787551879883, + "learning_rate": 6.701706951863456e-06, + "loss": 0.5872, + "mean_token_accuracy": 0.8183905243873596, + "num_tokens": 55237459.0, + "step": 45970 + }, + { + "entropy": 1.898222027719021, + "epoch": 0.1425339769784998, + "grad_norm": 9.797660827636719, + "learning_rate": 6.700978133122177e-06, + "loss": 0.5341, + "mean_token_accuracy": 0.8333139047026634, + "num_tokens": 55249387.0, + "step": 45980 + }, + { + "entropy": 1.9070947885513305, + "epoch": 0.14256497610354948, + "grad_norm": 9.02385139465332, + "learning_rate": 6.700249552108953e-06, + "loss": 0.5671, + "mean_token_accuracy": 0.8196926236152648, + "num_tokens": 55260722.0, + "step": 45990 + }, + { + "entropy": 1.8567352384328841, + "epoch": 0.14259597522859918, + "grad_norm": 4.3364458084106445, + "learning_rate": 6.699521208694573e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.8253255099058151, + "num_tokens": 55273316.0, + "step": 46000 + }, + { + "entropy": 1.9003437846899032, + "epoch": 0.14262697435364888, + "grad_norm": 10.944622993469238, + "learning_rate": 6.6987931027499264e-06, + "loss": 0.5584, + "mean_token_accuracy": 0.8321839943528175, + "num_tokens": 55284892.0, + "step": 46010 + }, + { + "entropy": 1.9338586196303367, + "epoch": 0.14265797347869857, + "grad_norm": 9.371607780456543, + "learning_rate": 6.698065234146e-06, + "loss": 0.6125, + "mean_token_accuracy": 0.8222884982824326, + "num_tokens": 55296243.0, + "step": 46020 + }, + { + "entropy": 1.773510305583477, + "epoch": 0.14268897260374827, + "grad_norm": 4.4485602378845215, + "learning_rate": 6.697337602753876e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8317829132080078, + "num_tokens": 55310357.0, + "step": 46030 + }, + { + "entropy": 1.794416318833828, + "epoch": 0.14271997172879797, + "grad_norm": 8.086502075195312, + "learning_rate": 6.696610208444741e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8388103753328323, + "num_tokens": 55323254.0, + "step": 46040 + }, + { + "entropy": 1.8933477729558945, + "epoch": 0.14275097085384764, + "grad_norm": 4.526361465454102, + "learning_rate": 6.695883051089873e-06, + "loss": 0.5491, + "mean_token_accuracy": 0.8254470497369766, + "num_tokens": 55335499.0, + "step": 46050 + }, + { + "entropy": 1.8915483728051186, + "epoch": 0.14278196997889733, + "grad_norm": 8.898006439208984, + "learning_rate": 6.695156130560652e-06, + "loss": 0.5882, + "mean_token_accuracy": 0.8290236473083497, + "num_tokens": 55347737.0, + "step": 46060 + }, + { + "entropy": 1.8370354726910592, + "epoch": 0.14281296910394703, + "grad_norm": 8.612515449523926, + "learning_rate": 6.694429446728551e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8437874168157578, + "num_tokens": 55360168.0, + "step": 46070 + }, + { + "entropy": 1.8509749799966813, + "epoch": 0.14284396822899673, + "grad_norm": 8.452802658081055, + "learning_rate": 6.6937029994651485e-06, + "loss": 0.511, + "mean_token_accuracy": 0.8318821370601654, + "num_tokens": 55372897.0, + "step": 46080 + }, + { + "entropy": 1.819749604165554, + "epoch": 0.14287496735404642, + "grad_norm": 8.721274375915527, + "learning_rate": 6.692976788642114e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8412142008543014, + "num_tokens": 55385814.0, + "step": 46090 + }, + { + "entropy": 1.9298714756965638, + "epoch": 0.14290596647909612, + "grad_norm": 7.0825700759887695, + "learning_rate": 6.692250814131215e-06, + "loss": 0.5796, + "mean_token_accuracy": 0.8247520595788955, + "num_tokens": 55397111.0, + "step": 46100 + }, + { + "entropy": 1.9013961970806121, + "epoch": 0.14293696560414582, + "grad_norm": 10.107673645019531, + "learning_rate": 6.691525075804319e-06, + "loss": 0.5349, + "mean_token_accuracy": 0.830139285326004, + "num_tokens": 55408213.0, + "step": 46110 + }, + { + "entropy": 1.9001147076487541, + "epoch": 0.1429679647291955, + "grad_norm": 8.269566535949707, + "learning_rate": 6.690799573533387e-06, + "loss": 0.5766, + "mean_token_accuracy": 0.8280939370393753, + "num_tokens": 55419892.0, + "step": 46120 + }, + { + "entropy": 1.9132464528083801, + "epoch": 0.1429989638542452, + "grad_norm": 8.161412239074707, + "learning_rate": 6.690074307190485e-06, + "loss": 0.5669, + "mean_token_accuracy": 0.825434684753418, + "num_tokens": 55430279.0, + "step": 46130 + }, + { + "entropy": 1.8425243273377419, + "epoch": 0.1430299629792949, + "grad_norm": 4.587552070617676, + "learning_rate": 6.689349276647765e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8376808211207389, + "num_tokens": 55441827.0, + "step": 46140 + }, + { + "entropy": 1.946098005771637, + "epoch": 0.1430609621043446, + "grad_norm": 11.747321128845215, + "learning_rate": 6.688624481777485e-06, + "loss": 0.5897, + "mean_token_accuracy": 0.8218299329280854, + "num_tokens": 55452623.0, + "step": 46150 + }, + { + "entropy": 1.8413902148604393, + "epoch": 0.1430919612293943, + "grad_norm": 8.52793025970459, + "learning_rate": 6.687899922451993e-06, + "loss": 0.509, + "mean_token_accuracy": 0.839706726372242, + "num_tokens": 55464897.0, + "step": 46160 + }, + { + "entropy": 1.88809677362442, + "epoch": 0.143122960354444, + "grad_norm": 8.712146759033203, + "learning_rate": 6.6871755985437425e-06, + "loss": 0.5336, + "mean_token_accuracy": 0.826761020720005, + "num_tokens": 55476274.0, + "step": 46170 + }, + { + "entropy": 1.8601938232779502, + "epoch": 0.1431539594794937, + "grad_norm": 4.302032947540283, + "learning_rate": 6.686451509925272e-06, + "loss": 0.5246, + "mean_token_accuracy": 0.8418595373630524, + "num_tokens": 55487836.0, + "step": 46180 + }, + { + "entropy": 1.7940224602818489, + "epoch": 0.1431849586045434, + "grad_norm": 11.997045516967773, + "learning_rate": 6.685727656469229e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8457210063934326, + "num_tokens": 55499979.0, + "step": 46190 + }, + { + "entropy": 1.8284522131085397, + "epoch": 0.14321595772959309, + "grad_norm": 8.663432121276855, + "learning_rate": 6.685004038048349e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8391269639134407, + "num_tokens": 55512352.0, + "step": 46200 + }, + { + "entropy": 1.8878327459096909, + "epoch": 0.14324695685464278, + "grad_norm": 8.888419151306152, + "learning_rate": 6.684280654535462e-06, + "loss": 0.5389, + "mean_token_accuracy": 0.8362879782915116, + "num_tokens": 55523451.0, + "step": 46210 + }, + { + "entropy": 1.8454574525356293, + "epoch": 0.14327795597969248, + "grad_norm": 8.546029090881348, + "learning_rate": 6.683557505803507e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.8269944965839386, + "num_tokens": 55535264.0, + "step": 46220 + }, + { + "entropy": 1.8240334704518317, + "epoch": 0.14330895510474218, + "grad_norm": 4.8377885818481445, + "learning_rate": 6.6828345917255045e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8359172835946083, + "num_tokens": 55547756.0, + "step": 46230 + }, + { + "entropy": 1.9085311383008956, + "epoch": 0.14333995422979187, + "grad_norm": 9.510826110839844, + "learning_rate": 6.682111912174579e-06, + "loss": 0.6108, + "mean_token_accuracy": 0.828006249666214, + "num_tokens": 55559998.0, + "step": 46240 + }, + { + "entropy": 1.8106452487409115, + "epoch": 0.14337095335484157, + "grad_norm": 9.17758846282959, + "learning_rate": 6.681389467023951e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8463340416550637, + "num_tokens": 55573054.0, + "step": 46250 + }, + { + "entropy": 1.8135211139917373, + "epoch": 0.14340195247989126, + "grad_norm": 8.731499671936035, + "learning_rate": 6.680667256146936e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8303871005773544, + "num_tokens": 55586890.0, + "step": 46260 + }, + { + "entropy": 1.940975472331047, + "epoch": 0.14343295160494096, + "grad_norm": 7.670884132385254, + "learning_rate": 6.679945279416942e-06, + "loss": 0.5781, + "mean_token_accuracy": 0.8210515171289444, + "num_tokens": 55598136.0, + "step": 46270 + }, + { + "entropy": 1.8406999617815019, + "epoch": 0.14346395072999066, + "grad_norm": 8.801468849182129, + "learning_rate": 6.679223536707477e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8413797855377197, + "num_tokens": 55610122.0, + "step": 46280 + }, + { + "entropy": 1.8838975965976714, + "epoch": 0.14349494985504035, + "grad_norm": 7.5125908851623535, + "learning_rate": 6.678502027892142e-06, + "loss": 0.502, + "mean_token_accuracy": 0.84541737139225, + "num_tokens": 55621817.0, + "step": 46290 + }, + { + "entropy": 1.8918793559074403, + "epoch": 0.14352594898009002, + "grad_norm": 9.769289016723633, + "learning_rate": 6.677780752844637e-06, + "loss": 0.5443, + "mean_token_accuracy": 0.8287898004055023, + "num_tokens": 55633088.0, + "step": 46300 + }, + { + "entropy": 1.82922722697258, + "epoch": 0.14355694810513972, + "grad_norm": 10.320469856262207, + "learning_rate": 6.677059711438752e-06, + "loss": 0.5303, + "mean_token_accuracy": 0.8273350238800049, + "num_tokens": 55644796.0, + "step": 46310 + }, + { + "entropy": 1.8213071301579475, + "epoch": 0.14358794723018942, + "grad_norm": 8.950617790222168, + "learning_rate": 6.676338903548379e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8466832935810089, + "num_tokens": 55656040.0, + "step": 46320 + }, + { + "entropy": 1.7985667988657952, + "epoch": 0.1436189463552391, + "grad_norm": 11.022417068481445, + "learning_rate": 6.675618329047501e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8403051659464836, + "num_tokens": 55668863.0, + "step": 46330 + }, + { + "entropy": 1.807174201309681, + "epoch": 0.1436499454802888, + "grad_norm": 11.001171112060547, + "learning_rate": 6.674897987810195e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8363948926329613, + "num_tokens": 55680958.0, + "step": 46340 + }, + { + "entropy": 1.8860581666231155, + "epoch": 0.1436809446053385, + "grad_norm": 9.26038646697998, + "learning_rate": 6.674177879710637e-06, + "loss": 0.5345, + "mean_token_accuracy": 0.8372591659426689, + "num_tokens": 55692194.0, + "step": 46350 + }, + { + "entropy": 1.8062124118208884, + "epoch": 0.1437119437303882, + "grad_norm": 9.093782424926758, + "learning_rate": 6.6734580046230955e-06, + "loss": 0.5286, + "mean_token_accuracy": 0.8431164383888244, + "num_tokens": 55704710.0, + "step": 46360 + }, + { + "entropy": 1.8013732418417931, + "epoch": 0.1437429428554379, + "grad_norm": 9.56591796875, + "learning_rate": 6.672738362421936e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8444247037172318, + "num_tokens": 55716737.0, + "step": 46370 + }, + { + "entropy": 1.8738340884447098, + "epoch": 0.1437739419804876, + "grad_norm": 9.894036293029785, + "learning_rate": 6.672018952981613e-06, + "loss": 0.5498, + "mean_token_accuracy": 0.8256061196327209, + "num_tokens": 55727558.0, + "step": 46380 + }, + { + "entropy": 1.9396587938070298, + "epoch": 0.1438049411055373, + "grad_norm": 8.471114158630371, + "learning_rate": 6.671299776176685e-06, + "loss": 0.6301, + "mean_token_accuracy": 0.8130400836467743, + "num_tokens": 55738367.0, + "step": 46390 + }, + { + "entropy": 1.8212429881095886, + "epoch": 0.143835940230587, + "grad_norm": 6.425068378448486, + "learning_rate": 6.6705808318817975e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8256319522857666, + "num_tokens": 55751238.0, + "step": 46400 + }, + { + "entropy": 1.8310098990797996, + "epoch": 0.14386693935563669, + "grad_norm": 4.085906982421875, + "learning_rate": 6.669862119971694e-06, + "loss": 0.5558, + "mean_token_accuracy": 0.8291790023446083, + "num_tokens": 55763570.0, + "step": 46410 + }, + { + "entropy": 1.808751115947962, + "epoch": 0.14389793848068638, + "grad_norm": 10.913744926452637, + "learning_rate": 6.669143640321213e-06, + "loss": 0.5168, + "mean_token_accuracy": 0.8259348452091217, + "num_tokens": 55776144.0, + "step": 46420 + }, + { + "entropy": 1.8655160009860992, + "epoch": 0.14392893760573608, + "grad_norm": 11.0983247756958, + "learning_rate": 6.668425392805282e-06, + "loss": 0.5305, + "mean_token_accuracy": 0.8399639427661896, + "num_tokens": 55787889.0, + "step": 46430 + }, + { + "entropy": 1.863385981321335, + "epoch": 0.14395993673078578, + "grad_norm": 8.552124977111816, + "learning_rate": 6.667707377298932e-06, + "loss": 0.513, + "mean_token_accuracy": 0.8352120831608772, + "num_tokens": 55800483.0, + "step": 46440 + }, + { + "entropy": 1.9188006550073624, + "epoch": 0.14399093585583547, + "grad_norm": 10.365525245666504, + "learning_rate": 6.66698959367728e-06, + "loss": 0.6035, + "mean_token_accuracy": 0.8209626540541649, + "num_tokens": 55811326.0, + "step": 46450 + }, + { + "entropy": 1.9086585596203804, + "epoch": 0.14402193498088517, + "grad_norm": 11.863431930541992, + "learning_rate": 6.666272041815539e-06, + "loss": 0.5813, + "mean_token_accuracy": 0.8237880125641823, + "num_tokens": 55823396.0, + "step": 46460 + }, + { + "entropy": 1.8327384784817695, + "epoch": 0.14405293410593487, + "grad_norm": 7.275395393371582, + "learning_rate": 6.66555472158902e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8353081822395325, + "num_tokens": 55836892.0, + "step": 46470 + }, + { + "entropy": 1.7775721468031407, + "epoch": 0.14408393323098456, + "grad_norm": 7.417864799499512, + "learning_rate": 6.664837632873123e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8478566333651543, + "num_tokens": 55849778.0, + "step": 46480 + }, + { + "entropy": 1.8510145902633668, + "epoch": 0.14411493235603426, + "grad_norm": 8.72498607635498, + "learning_rate": 6.664120775543344e-06, + "loss": 0.5191, + "mean_token_accuracy": 0.835884952545166, + "num_tokens": 55861927.0, + "step": 46490 + }, + { + "entropy": 1.811974573135376, + "epoch": 0.14414593148108396, + "grad_norm": 8.177042007446289, + "learning_rate": 6.663404149475273e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8483860984444618, + "num_tokens": 55873403.0, + "step": 46500 + }, + { + "entropy": 1.86693923920393, + "epoch": 0.14417693060613365, + "grad_norm": 8.267274856567383, + "learning_rate": 6.662687754544593e-06, + "loss": 0.5635, + "mean_token_accuracy": 0.8321180418133736, + "num_tokens": 55885678.0, + "step": 46510 + }, + { + "entropy": 1.8252136752009391, + "epoch": 0.14420792973118335, + "grad_norm": 4.183609485626221, + "learning_rate": 6.661971590627081e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.8298415824770927, + "num_tokens": 55897994.0, + "step": 46520 + }, + { + "entropy": 1.8707138195633888, + "epoch": 0.14423892885623305, + "grad_norm": 7.805715084075928, + "learning_rate": 6.661255657598608e-06, + "loss": 0.5584, + "mean_token_accuracy": 0.8322044730186462, + "num_tokens": 55909873.0, + "step": 46530 + }, + { + "entropy": 1.8549921035766601, + "epoch": 0.14426992798128274, + "grad_norm": 10.361091613769531, + "learning_rate": 6.660539955335135e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.8390928566455841, + "num_tokens": 55921497.0, + "step": 46540 + }, + { + "entropy": 1.8014539882540703, + "epoch": 0.1443009271063324, + "grad_norm": 7.748378753662109, + "learning_rate": 6.659824483712719e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8297727882862092, + "num_tokens": 55933974.0, + "step": 46550 + }, + { + "entropy": 1.912005639076233, + "epoch": 0.1443319262313821, + "grad_norm": 8.874786376953125, + "learning_rate": 6.659109242607511e-06, + "loss": 0.5539, + "mean_token_accuracy": 0.8200063824653625, + "num_tokens": 55945880.0, + "step": 46560 + }, + { + "entropy": 1.8200358718633651, + "epoch": 0.1443629253564318, + "grad_norm": 4.0915913581848145, + "learning_rate": 6.658394231895755e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8318188101053238, + "num_tokens": 55958253.0, + "step": 46570 + }, + { + "entropy": 1.8874425664544106, + "epoch": 0.1443939244814815, + "grad_norm": 9.732337951660156, + "learning_rate": 6.657679451453786e-06, + "loss": 0.5342, + "mean_token_accuracy": 0.8317007169127464, + "num_tokens": 55970172.0, + "step": 46580 + }, + { + "entropy": 1.8596255108714104, + "epoch": 0.1444249236065312, + "grad_norm": 9.834501266479492, + "learning_rate": 6.656964901158031e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.8399197280406951, + "num_tokens": 55981424.0, + "step": 46590 + }, + { + "entropy": 1.8636808514595031, + "epoch": 0.1444559227315809, + "grad_norm": 9.933505058288574, + "learning_rate": 6.656250580885014e-06, + "loss": 0.5678, + "mean_token_accuracy": 0.8327499255537987, + "num_tokens": 55993099.0, + "step": 46600 + }, + { + "entropy": 1.8671812251210214, + "epoch": 0.1444869218566306, + "grad_norm": 8.399667739868164, + "learning_rate": 6.6555364905113505e-06, + "loss": 0.549, + "mean_token_accuracy": 0.8313580706715584, + "num_tokens": 56004188.0, + "step": 46610 + }, + { + "entropy": 1.8309295520186424, + "epoch": 0.1445179209816803, + "grad_norm": 4.422349452972412, + "learning_rate": 6.654822629913745e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8452798783779144, + "num_tokens": 56016240.0, + "step": 46620 + }, + { + "entropy": 1.746955469250679, + "epoch": 0.14454892010672998, + "grad_norm": 5.409885406494141, + "learning_rate": 6.654108998968999e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8372356191277504, + "num_tokens": 56029851.0, + "step": 46630 + }, + { + "entropy": 1.8878102406859398, + "epoch": 0.14457991923177968, + "grad_norm": 9.581794738769531, + "learning_rate": 6.653395597554003e-06, + "loss": 0.5621, + "mean_token_accuracy": 0.8266360089182854, + "num_tokens": 56041081.0, + "step": 46640 + }, + { + "entropy": 1.84484543800354, + "epoch": 0.14461091835682938, + "grad_norm": 7.586556434631348, + "learning_rate": 6.652682425545742e-06, + "loss": 0.5313, + "mean_token_accuracy": 0.8356327712535858, + "num_tokens": 56053083.0, + "step": 46650 + }, + { + "entropy": 1.8373423531651496, + "epoch": 0.14464191748187907, + "grad_norm": 9.485453605651855, + "learning_rate": 6.651969482821293e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8305898755788803, + "num_tokens": 56065247.0, + "step": 46660 + }, + { + "entropy": 1.8729679718613625, + "epoch": 0.14467291660692877, + "grad_norm": 4.585037708282471, + "learning_rate": 6.651256769257825e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.8245977357029914, + "num_tokens": 56077249.0, + "step": 46670 + }, + { + "entropy": 1.8576960027217866, + "epoch": 0.14470391573197847, + "grad_norm": 8.548659324645996, + "learning_rate": 6.650544284732601e-06, + "loss": 0.5327, + "mean_token_accuracy": 0.8239507541060448, + "num_tokens": 56089496.0, + "step": 46680 + }, + { + "entropy": 1.7963466018438339, + "epoch": 0.14473491485702816, + "grad_norm": 8.17918872833252, + "learning_rate": 6.649832029122969e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8414889872074127, + "num_tokens": 56102439.0, + "step": 46690 + }, + { + "entropy": 1.8463149771094323, + "epoch": 0.14476591398207786, + "grad_norm": 8.336082458496094, + "learning_rate": 6.6491200023063785e-06, + "loss": 0.532, + "mean_token_accuracy": 0.8314433738589286, + "num_tokens": 56115196.0, + "step": 46700 + }, + { + "entropy": 1.8520590156316756, + "epoch": 0.14479691310712756, + "grad_norm": 9.052793502807617, + "learning_rate": 6.648408204160365e-06, + "loss": 0.5665, + "mean_token_accuracy": 0.8344256520271301, + "num_tokens": 56127775.0, + "step": 46710 + }, + { + "entropy": 1.8827403590083123, + "epoch": 0.14482791223217725, + "grad_norm": 9.495018005371094, + "learning_rate": 6.647696634562557e-06, + "loss": 0.5524, + "mean_token_accuracy": 0.825685128569603, + "num_tokens": 56140171.0, + "step": 46720 + }, + { + "entropy": 1.9633108615875243, + "epoch": 0.14485891135722695, + "grad_norm": 8.340209007263184, + "learning_rate": 6.646985293390675e-06, + "loss": 0.5714, + "mean_token_accuracy": 0.8286648213863372, + "num_tokens": 56151174.0, + "step": 46730 + }, + { + "entropy": 1.912299408018589, + "epoch": 0.14488991048227665, + "grad_norm": 9.881667137145996, + "learning_rate": 6.64627418052253e-06, + "loss": 0.5312, + "mean_token_accuracy": 0.834262129664421, + "num_tokens": 56162304.0, + "step": 46740 + }, + { + "entropy": 1.9305769801139832, + "epoch": 0.14492090960732634, + "grad_norm": 6.763992786407471, + "learning_rate": 6.6455632958360265e-06, + "loss": 0.6003, + "mean_token_accuracy": 0.8186121672391892, + "num_tokens": 56173644.0, + "step": 46750 + }, + { + "entropy": 1.8460824131965636, + "epoch": 0.14495190873237604, + "grad_norm": 10.21903133392334, + "learning_rate": 6.644852639209157e-06, + "loss": 0.5388, + "mean_token_accuracy": 0.8290549755096436, + "num_tokens": 56185409.0, + "step": 46760 + }, + { + "entropy": 1.843972623348236, + "epoch": 0.14498290785742574, + "grad_norm": 5.34625244140625, + "learning_rate": 6.6441422105200105e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.8397470220923424, + "num_tokens": 56197647.0, + "step": 46770 + }, + { + "entropy": 1.9071653231978416, + "epoch": 0.14501390698247543, + "grad_norm": 9.360774993896484, + "learning_rate": 6.643432009646762e-06, + "loss": 0.5526, + "mean_token_accuracy": 0.8233714982867241, + "num_tokens": 56209322.0, + "step": 46780 + }, + { + "entropy": 1.867496982216835, + "epoch": 0.14504490610752513, + "grad_norm": 9.754353523254395, + "learning_rate": 6.642722036467681e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8340319588780403, + "num_tokens": 56221453.0, + "step": 46790 + }, + { + "entropy": 1.7567606747150422, + "epoch": 0.1450759052325748, + "grad_norm": 12.389626502990723, + "learning_rate": 6.642012290861126e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8503746286034584, + "num_tokens": 56234727.0, + "step": 46800 + }, + { + "entropy": 1.9208232283592224, + "epoch": 0.1451069043576245, + "grad_norm": 4.7340803146362305, + "learning_rate": 6.641302772705548e-06, + "loss": 0.5905, + "mean_token_accuracy": 0.8217899233102799, + "num_tokens": 56245924.0, + "step": 46810 + }, + { + "entropy": 1.7486246049404144, + "epoch": 0.1451379034826742, + "grad_norm": 4.023608207702637, + "learning_rate": 6.640593481879488e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8474720388650894, + "num_tokens": 56259967.0, + "step": 46820 + }, + { + "entropy": 1.9232235804200173, + "epoch": 0.1451689026077239, + "grad_norm": 9.536555290222168, + "learning_rate": 6.63988441826158e-06, + "loss": 0.5834, + "mean_token_accuracy": 0.8297638326883316, + "num_tokens": 56271400.0, + "step": 46830 + }, + { + "entropy": 1.862909409403801, + "epoch": 0.14519990173277358, + "grad_norm": 9.564486503601074, + "learning_rate": 6.639175581730542e-06, + "loss": 0.5344, + "mean_token_accuracy": 0.8356858551502228, + "num_tokens": 56283115.0, + "step": 46840 + }, + { + "entropy": 1.8271246001124382, + "epoch": 0.14523090085782328, + "grad_norm": 4.301812648773193, + "learning_rate": 6.638466972165192e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8387002035975456, + "num_tokens": 56295992.0, + "step": 46850 + }, + { + "entropy": 1.901492816209793, + "epoch": 0.14526189998287298, + "grad_norm": 8.603260040283203, + "learning_rate": 6.63775858944443e-06, + "loss": 0.5661, + "mean_token_accuracy": 0.8132639810442924, + "num_tokens": 56307074.0, + "step": 46860 + }, + { + "entropy": 1.7843642815947534, + "epoch": 0.14529289910792267, + "grad_norm": 7.738562107086182, + "learning_rate": 6.637050433447254e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8477630734443664, + "num_tokens": 56320727.0, + "step": 46870 + }, + { + "entropy": 1.8238538324832916, + "epoch": 0.14532389823297237, + "grad_norm": 4.08509635925293, + "learning_rate": 6.636342504052748e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8303026512265206, + "num_tokens": 56333848.0, + "step": 46880 + }, + { + "entropy": 1.7951844319701196, + "epoch": 0.14535489735802207, + "grad_norm": 7.781752586364746, + "learning_rate": 6.635634801140083e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8464242547750473, + "num_tokens": 56346965.0, + "step": 46890 + }, + { + "entropy": 1.8057351917028428, + "epoch": 0.14538589648307176, + "grad_norm": 3.1438589096069336, + "learning_rate": 6.634927324588528e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8528795570135117, + "num_tokens": 56359959.0, + "step": 46900 + }, + { + "entropy": 1.9472463458776474, + "epoch": 0.14541689560812146, + "grad_norm": 9.385503768920898, + "learning_rate": 6.634220074277438e-06, + "loss": 0.5786, + "mean_token_accuracy": 0.82788744866848, + "num_tokens": 56371468.0, + "step": 46910 + }, + { + "entropy": 1.8034564316272736, + "epoch": 0.14544789473317116, + "grad_norm": 6.253968238830566, + "learning_rate": 6.633513050086256e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8509827584028244, + "num_tokens": 56384646.0, + "step": 46920 + }, + { + "entropy": 1.8610297739505768, + "epoch": 0.14547889385822085, + "grad_norm": 9.084609031677246, + "learning_rate": 6.6328062518945195e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8404456228017807, + "num_tokens": 56396918.0, + "step": 46930 + }, + { + "entropy": 1.901959379762411, + "epoch": 0.14550989298327055, + "grad_norm": 8.576674461364746, + "learning_rate": 6.63209967958185e-06, + "loss": 0.5396, + "mean_token_accuracy": 0.8313698336482048, + "num_tokens": 56409026.0, + "step": 46940 + }, + { + "entropy": 1.8410135254263877, + "epoch": 0.14554089210832025, + "grad_norm": 4.225868225097656, + "learning_rate": 6.631393333027966e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.838833749294281, + "num_tokens": 56421316.0, + "step": 46950 + }, + { + "entropy": 1.8137712955474854, + "epoch": 0.14557189123336994, + "grad_norm": 7.888924598693848, + "learning_rate": 6.630687212112668e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8446080341935158, + "num_tokens": 56434495.0, + "step": 46960 + }, + { + "entropy": 1.9139295309782027, + "epoch": 0.14560289035841964, + "grad_norm": 7.617772579193115, + "learning_rate": 6.629981316715853e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.831653282046318, + "num_tokens": 56447269.0, + "step": 46970 + }, + { + "entropy": 1.965524472296238, + "epoch": 0.14563388948346934, + "grad_norm": 9.798111915588379, + "learning_rate": 6.629275646717503e-06, + "loss": 0.5769, + "mean_token_accuracy": 0.8235475957393646, + "num_tokens": 56458884.0, + "step": 46980 + }, + { + "entropy": 1.8459963738918304, + "epoch": 0.14566488860851903, + "grad_norm": 8.215746879577637, + "learning_rate": 6.628570201997693e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8458223536610603, + "num_tokens": 56472093.0, + "step": 46990 + }, + { + "entropy": 1.956969639658928, + "epoch": 0.14569588773356873, + "grad_norm": 8.928169250488281, + "learning_rate": 6.62786498243658e-06, + "loss": 0.6133, + "mean_token_accuracy": 0.8251758500933647, + "num_tokens": 56482622.0, + "step": 47000 + }, + { + "entropy": 1.9019935339689256, + "epoch": 0.14572688685861843, + "grad_norm": 7.649944305419922, + "learning_rate": 6.627159987914421e-06, + "loss": 0.6096, + "mean_token_accuracy": 0.8216905370354652, + "num_tokens": 56495135.0, + "step": 47010 + }, + { + "entropy": 1.8709472745656968, + "epoch": 0.14575788598366812, + "grad_norm": 5.07206916809082, + "learning_rate": 6.626455218311551e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8332753911614418, + "num_tokens": 56507439.0, + "step": 47020 + }, + { + "entropy": 1.9712065637111664, + "epoch": 0.14578888510871782, + "grad_norm": 8.687137603759766, + "learning_rate": 6.6257506735084055e-06, + "loss": 0.6034, + "mean_token_accuracy": 0.8207579106092453, + "num_tokens": 56518700.0, + "step": 47030 + }, + { + "entropy": 1.7840657129883766, + "epoch": 0.1458198842337675, + "grad_norm": 8.681142807006836, + "learning_rate": 6.625046353385498e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8308488741517067, + "num_tokens": 56531438.0, + "step": 47040 + }, + { + "entropy": 1.9545001640915871, + "epoch": 0.14585088335881718, + "grad_norm": 11.037637710571289, + "learning_rate": 6.624342257823438e-06, + "loss": 0.5669, + "mean_token_accuracy": 0.8239473134279252, + "num_tokens": 56542655.0, + "step": 47050 + }, + { + "entropy": 1.8824947997927666, + "epoch": 0.14588188248386688, + "grad_norm": 8.642295837402344, + "learning_rate": 6.623638386702921e-06, + "loss": 0.5302, + "mean_token_accuracy": 0.8258578881621361, + "num_tokens": 56555705.0, + "step": 47060 + }, + { + "entropy": 1.871096746623516, + "epoch": 0.14591288160891658, + "grad_norm": 9.842241287231445, + "learning_rate": 6.622934739904732e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.836942833662033, + "num_tokens": 56568268.0, + "step": 47070 + }, + { + "entropy": 1.9141883179545403, + "epoch": 0.14594388073396627, + "grad_norm": 8.952548027038574, + "learning_rate": 6.6222313173097454e-06, + "loss": 0.5371, + "mean_token_accuracy": 0.8351935803890228, + "num_tokens": 56579903.0, + "step": 47080 + }, + { + "entropy": 1.8572052717208862, + "epoch": 0.14597487985901597, + "grad_norm": 9.243520736694336, + "learning_rate": 6.62152811879892e-06, + "loss": 0.5324, + "mean_token_accuracy": 0.8366377666592598, + "num_tokens": 56592500.0, + "step": 47090 + }, + { + "entropy": 1.8863570004701615, + "epoch": 0.14600587898406567, + "grad_norm": 8.63272762298584, + "learning_rate": 6.620825144253312e-06, + "loss": 0.528, + "mean_token_accuracy": 0.8290054813027382, + "num_tokens": 56604096.0, + "step": 47100 + }, + { + "entropy": 1.8670645967125892, + "epoch": 0.14603687810911536, + "grad_norm": 8.21442985534668, + "learning_rate": 6.620122393554056e-06, + "loss": 0.6717, + "mean_token_accuracy": 0.8223129764199257, + "num_tokens": 56616718.0, + "step": 47110 + }, + { + "entropy": 1.9217035099864006, + "epoch": 0.14606787723416506, + "grad_norm": 4.886983871459961, + "learning_rate": 6.6194198665823796e-06, + "loss": 0.5741, + "mean_token_accuracy": 0.8262522548437119, + "num_tokens": 56627768.0, + "step": 47120 + }, + { + "entropy": 1.9420514121651649, + "epoch": 0.14609887635921476, + "grad_norm": 8.3319673538208, + "learning_rate": 6.6187175632195985e-06, + "loss": 0.602, + "mean_token_accuracy": 0.8175177812576294, + "num_tokens": 56638951.0, + "step": 47130 + }, + { + "entropy": 1.9218653574585915, + "epoch": 0.14612987548426445, + "grad_norm": 8.83938217163086, + "learning_rate": 6.618015483347118e-06, + "loss": 0.5549, + "mean_token_accuracy": 0.82304065823555, + "num_tokens": 56651297.0, + "step": 47140 + }, + { + "entropy": 1.9661997646093368, + "epoch": 0.14616087460931415, + "grad_norm": 9.71408462524414, + "learning_rate": 6.6173136268464276e-06, + "loss": 0.5889, + "mean_token_accuracy": 0.8207120850682259, + "num_tokens": 56662494.0, + "step": 47150 + }, + { + "entropy": 1.934261092543602, + "epoch": 0.14619187373436385, + "grad_norm": 8.855196952819824, + "learning_rate": 6.616611993599109e-06, + "loss": 0.5464, + "mean_token_accuracy": 0.8397008880972863, + "num_tokens": 56673808.0, + "step": 47160 + }, + { + "entropy": 1.9365019842982292, + "epoch": 0.14622287285941354, + "grad_norm": 8.653056144714355, + "learning_rate": 6.6159105834868275e-06, + "loss": 0.5803, + "mean_token_accuracy": 0.8273678243160247, + "num_tokens": 56685577.0, + "step": 47170 + }, + { + "entropy": 1.872480408847332, + "epoch": 0.14625387198446324, + "grad_norm": 8.392419815063477, + "learning_rate": 6.615209396391338e-06, + "loss": 0.511, + "mean_token_accuracy": 0.8286007553339004, + "num_tokens": 56698302.0, + "step": 47180 + }, + { + "entropy": 1.9219907984137534, + "epoch": 0.14628487110951294, + "grad_norm": 11.029183387756348, + "learning_rate": 6.614508432194486e-06, + "loss": 0.5493, + "mean_token_accuracy": 0.8315992474555969, + "num_tokens": 56709524.0, + "step": 47190 + }, + { + "entropy": 1.928644596040249, + "epoch": 0.14631587023456263, + "grad_norm": 8.550175666809082, + "learning_rate": 6.613807690778199e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8338589072227478, + "num_tokens": 56721586.0, + "step": 47200 + }, + { + "entropy": 1.8833053424954413, + "epoch": 0.14634686935961233, + "grad_norm": 4.840978622436523, + "learning_rate": 6.613107172024497e-06, + "loss": 0.5268, + "mean_token_accuracy": 0.8266768530011177, + "num_tokens": 56733887.0, + "step": 47210 + }, + { + "entropy": 1.8826899453997612, + "epoch": 0.14637786848466203, + "grad_norm": 6.925222396850586, + "learning_rate": 6.6124068758154836e-06, + "loss": 0.5489, + "mean_token_accuracy": 0.8324401840567589, + "num_tokens": 56745661.0, + "step": 47220 + }, + { + "entropy": 1.8747637838125228, + "epoch": 0.14640886760971172, + "grad_norm": 9.450230598449707, + "learning_rate": 6.611706802033354e-06, + "loss": 0.5337, + "mean_token_accuracy": 0.8353126794099808, + "num_tokens": 56758002.0, + "step": 47230 + }, + { + "entropy": 1.8948550507426263, + "epoch": 0.14643986673476142, + "grad_norm": 4.941351890563965, + "learning_rate": 6.611006950560388e-06, + "loss": 0.5553, + "mean_token_accuracy": 0.8193787977099418, + "num_tokens": 56770680.0, + "step": 47240 + }, + { + "entropy": 1.9120934292674066, + "epoch": 0.14647086585981112, + "grad_norm": 10.706798553466797, + "learning_rate": 6.610307321278952e-06, + "loss": 0.6107, + "mean_token_accuracy": 0.8241473525762558, + "num_tokens": 56782436.0, + "step": 47250 + }, + { + "entropy": 1.9104921489953994, + "epoch": 0.1465018649848608, + "grad_norm": 8.724833488464355, + "learning_rate": 6.6096079140715005e-06, + "loss": 0.5747, + "mean_token_accuracy": 0.8173437684774398, + "num_tokens": 56794146.0, + "step": 47260 + }, + { + "entropy": 1.826416552066803, + "epoch": 0.1465328641099105, + "grad_norm": 7.878024578094482, + "learning_rate": 6.6089087288205766e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8480478748679161, + "num_tokens": 56806763.0, + "step": 47270 + }, + { + "entropy": 1.8449065156280995, + "epoch": 0.1465638632349602, + "grad_norm": 8.807769775390625, + "learning_rate": 6.608209765408807e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8426515519618988, + "num_tokens": 56819546.0, + "step": 47280 + }, + { + "entropy": 1.9609465137124062, + "epoch": 0.14659486236000988, + "grad_norm": 9.253771781921387, + "learning_rate": 6.607511023718909e-06, + "loss": 0.5784, + "mean_token_accuracy": 0.8272478267550468, + "num_tokens": 56830641.0, + "step": 47290 + }, + { + "entropy": 1.8133751511573792, + "epoch": 0.14662586148505957, + "grad_norm": 4.237439155578613, + "learning_rate": 6.6068125036336824e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8408553779125214, + "num_tokens": 56843286.0, + "step": 47300 + }, + { + "entropy": 1.8579260095953942, + "epoch": 0.14665686061010927, + "grad_norm": 8.59593677520752, + "learning_rate": 6.6061142050360174e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8416462868452073, + "num_tokens": 56855446.0, + "step": 47310 + }, + { + "entropy": 1.9909762263298034, + "epoch": 0.14668785973515897, + "grad_norm": 9.792447090148926, + "learning_rate": 6.60541612780889e-06, + "loss": 0.5909, + "mean_token_accuracy": 0.8207048043608666, + "num_tokens": 56866314.0, + "step": 47320 + }, + { + "entropy": 1.8144005626440047, + "epoch": 0.14671885886020866, + "grad_norm": 6.8003644943237305, + "learning_rate": 6.604718271835362e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8494958788156509, + "num_tokens": 56879467.0, + "step": 47330 + }, + { + "entropy": 1.934771877527237, + "epoch": 0.14674985798525836, + "grad_norm": 7.686990261077881, + "learning_rate": 6.60402063699858e-06, + "loss": 0.5566, + "mean_token_accuracy": 0.8413501441478729, + "num_tokens": 56890100.0, + "step": 47340 + }, + { + "entropy": 1.8710382498800755, + "epoch": 0.14678085711030806, + "grad_norm": 9.023658752441406, + "learning_rate": 6.603323223181781e-06, + "loss": 0.5367, + "mean_token_accuracy": 0.8303638845682144, + "num_tokens": 56903039.0, + "step": 47350 + }, + { + "entropy": 1.853154082596302, + "epoch": 0.14681185623535775, + "grad_norm": 7.490630626678467, + "learning_rate": 6.6026260302682866e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8393293723464013, + "num_tokens": 56915063.0, + "step": 47360 + }, + { + "entropy": 1.8372676715254783, + "epoch": 0.14684285536040745, + "grad_norm": 6.07124137878418, + "learning_rate": 6.601929058141503e-06, + "loss": 0.5217, + "mean_token_accuracy": 0.8287788331508636, + "num_tokens": 56928744.0, + "step": 47370 + }, + { + "entropy": 1.831204354763031, + "epoch": 0.14687385448545714, + "grad_norm": 4.461570739746094, + "learning_rate": 6.601232306684922e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8416302114725113, + "num_tokens": 56941775.0, + "step": 47380 + }, + { + "entropy": 1.8040766946971416, + "epoch": 0.14690485361050684, + "grad_norm": 3.481854200363159, + "learning_rate": 6.600535775782128e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8418020218610763, + "num_tokens": 56955721.0, + "step": 47390 + }, + { + "entropy": 1.8790962159633637, + "epoch": 0.14693585273555654, + "grad_norm": 8.983044624328613, + "learning_rate": 6.599839465316782e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8474810421466827, + "num_tokens": 56968021.0, + "step": 47400 + }, + { + "entropy": 1.8795790463685988, + "epoch": 0.14696685186060623, + "grad_norm": 9.99634838104248, + "learning_rate": 6.599143375172638e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.8363196149468421, + "num_tokens": 56979842.0, + "step": 47410 + }, + { + "entropy": 1.9174206882715226, + "epoch": 0.14699785098565593, + "grad_norm": 8.22517204284668, + "learning_rate": 6.598447505233533e-06, + "loss": 0.5717, + "mean_token_accuracy": 0.8306628108024597, + "num_tokens": 56991796.0, + "step": 47420 + }, + { + "entropy": 1.9805866748094558, + "epoch": 0.14702885011070563, + "grad_norm": 9.243436813354492, + "learning_rate": 6.59775185538339e-06, + "loss": 0.5826, + "mean_token_accuracy": 0.829287999868393, + "num_tokens": 57003139.0, + "step": 47430 + }, + { + "entropy": 1.8765076369047164, + "epoch": 0.14705984923575532, + "grad_norm": 8.006113052368164, + "learning_rate": 6.597056425506216e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.8328446924686432, + "num_tokens": 57015872.0, + "step": 47440 + }, + { + "entropy": 1.8864967197179794, + "epoch": 0.14709084836080502, + "grad_norm": 3.926126003265381, + "learning_rate": 6.596361215486107e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8418742284178734, + "num_tokens": 57027981.0, + "step": 47450 + }, + { + "entropy": 1.8978651389479637, + "epoch": 0.14712184748585472, + "grad_norm": 4.1893134117126465, + "learning_rate": 6.595666225207241e-06, + "loss": 0.5869, + "mean_token_accuracy": 0.8199717015028, + "num_tokens": 57039934.0, + "step": 47460 + }, + { + "entropy": 1.9076218917965888, + "epoch": 0.14715284661090441, + "grad_norm": 11.420867919921875, + "learning_rate": 6.594971454553885e-06, + "loss": 0.5421, + "mean_token_accuracy": 0.830481293797493, + "num_tokens": 57052007.0, + "step": 47470 + }, + { + "entropy": 2.0099540084600447, + "epoch": 0.1471838457359541, + "grad_norm": 8.73011589050293, + "learning_rate": 6.5942769034103895e-06, + "loss": 0.6084, + "mean_token_accuracy": 0.8137494072318077, + "num_tokens": 57062753.0, + "step": 47480 + }, + { + "entropy": 1.8646449223160744, + "epoch": 0.1472148448610038, + "grad_norm": 10.318194389343262, + "learning_rate": 6.593582571661188e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.842064967751503, + "num_tokens": 57075443.0, + "step": 47490 + }, + { + "entropy": 1.8991115644574166, + "epoch": 0.1472458439860535, + "grad_norm": 6.439994812011719, + "learning_rate": 6.592888459190802e-06, + "loss": 0.5384, + "mean_token_accuracy": 0.8363965794444084, + "num_tokens": 57088273.0, + "step": 47500 + }, + { + "entropy": 1.9018190592527389, + "epoch": 0.1472768431111032, + "grad_norm": 5.49556827545166, + "learning_rate": 6.592194565883839e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8406558021903038, + "num_tokens": 57100663.0, + "step": 47510 + }, + { + "entropy": 2.0055468559265135, + "epoch": 0.1473078422361529, + "grad_norm": 8.085359573364258, + "learning_rate": 6.591500891624989e-06, + "loss": 0.612, + "mean_token_accuracy": 0.8196608617901802, + "num_tokens": 57111772.0, + "step": 47520 + }, + { + "entropy": 1.944721657037735, + "epoch": 0.1473388413612026, + "grad_norm": 8.445950508117676, + "learning_rate": 6.590807436299027e-06, + "loss": 0.5381, + "mean_token_accuracy": 0.824900957942009, + "num_tokens": 57123451.0, + "step": 47530 + }, + { + "entropy": 1.9133238226175309, + "epoch": 0.14736984048625226, + "grad_norm": 4.461698532104492, + "learning_rate": 6.590114199790815e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8348732620477677, + "num_tokens": 57135025.0, + "step": 47540 + }, + { + "entropy": 1.862838363647461, + "epoch": 0.14740083961130196, + "grad_norm": 3.581024408340454, + "learning_rate": 6.589421181985297e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8478377997875214, + "num_tokens": 57146927.0, + "step": 47550 + }, + { + "entropy": 1.9030270993709564, + "epoch": 0.14743183873635166, + "grad_norm": 7.672686576843262, + "learning_rate": 6.588728382767504e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.8314098447561264, + "num_tokens": 57158627.0, + "step": 47560 + }, + { + "entropy": 1.9589687079191207, + "epoch": 0.14746283786140135, + "grad_norm": 9.731719970703125, + "learning_rate": 6.58803580202255e-06, + "loss": 0.5612, + "mean_token_accuracy": 0.8271705433726311, + "num_tokens": 57169934.0, + "step": 47570 + }, + { + "entropy": 1.934072805941105, + "epoch": 0.14749383698645105, + "grad_norm": 8.698883056640625, + "learning_rate": 6.587343439635634e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8397659227252007, + "num_tokens": 57182220.0, + "step": 47580 + }, + { + "entropy": 1.9594342321157456, + "epoch": 0.14752483611150075, + "grad_norm": 9.804793357849121, + "learning_rate": 6.586651295492042e-06, + "loss": 0.5365, + "mean_token_accuracy": 0.8358360469341278, + "num_tokens": 57193785.0, + "step": 47590 + }, + { + "entropy": 1.985189399123192, + "epoch": 0.14755583523655044, + "grad_norm": 8.428631782531738, + "learning_rate": 6.585959369477139e-06, + "loss": 0.5789, + "mean_token_accuracy": 0.8262299597263336, + "num_tokens": 57205399.0, + "step": 47600 + }, + { + "entropy": 2.0018600046634676, + "epoch": 0.14758683436160014, + "grad_norm": 7.76343297958374, + "learning_rate": 6.585267661476379e-06, + "loss": 0.5989, + "mean_token_accuracy": 0.8301928460597991, + "num_tokens": 57216010.0, + "step": 47610 + }, + { + "entropy": 1.9304006546735764, + "epoch": 0.14761783348664984, + "grad_norm": 7.311439514160156, + "learning_rate": 6.584576171375298e-06, + "loss": 0.5318, + "mean_token_accuracy": 0.8509150952100754, + "num_tokens": 57227397.0, + "step": 47620 + }, + { + "entropy": 1.924649564921856, + "epoch": 0.14764883261169953, + "grad_norm": 8.823622703552246, + "learning_rate": 6.5838848990595135e-06, + "loss": 0.527, + "mean_token_accuracy": 0.8323667719960213, + "num_tokens": 57239462.0, + "step": 47630 + }, + { + "entropy": 1.9695616766810418, + "epoch": 0.14767983173674923, + "grad_norm": 9.303346633911133, + "learning_rate": 6.583193844414736e-06, + "loss": 0.5739, + "mean_token_accuracy": 0.8252382263541221, + "num_tokens": 57250697.0, + "step": 47640 + }, + { + "entropy": 1.9715566843748094, + "epoch": 0.14771083086179893, + "grad_norm": 8.258867263793945, + "learning_rate": 6.582503007326752e-06, + "loss": 0.579, + "mean_token_accuracy": 0.8235922992229462, + "num_tokens": 57261910.0, + "step": 47650 + }, + { + "entropy": 1.9681269809603692, + "epoch": 0.14774182998684862, + "grad_norm": 4.465610504150391, + "learning_rate": 6.58181238768143e-06, + "loss": 0.5258, + "mean_token_accuracy": 0.8352336034178733, + "num_tokens": 57273666.0, + "step": 47660 + }, + { + "entropy": 1.8807261288166046, + "epoch": 0.14777282911189832, + "grad_norm": 8.713099479675293, + "learning_rate": 6.5811219853647315e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8416456952691078, + "num_tokens": 57286174.0, + "step": 47670 + }, + { + "entropy": 1.9564361080527306, + "epoch": 0.14780382823694801, + "grad_norm": 4.097822189331055, + "learning_rate": 6.580431800262694e-06, + "loss": 0.5526, + "mean_token_accuracy": 0.8282778993248939, + "num_tokens": 57298085.0, + "step": 47680 + }, + { + "entropy": 1.9153534591197967, + "epoch": 0.1478348273619977, + "grad_norm": 8.843733787536621, + "learning_rate": 6.57974183226144e-06, + "loss": 0.5322, + "mean_token_accuracy": 0.832582226395607, + "num_tokens": 57310164.0, + "step": 47690 + }, + { + "entropy": 1.8493407145142555, + "epoch": 0.1478658264870474, + "grad_norm": 10.445718765258789, + "learning_rate": 6.579052081247181e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8435613483190536, + "num_tokens": 57323308.0, + "step": 47700 + }, + { + "entropy": 1.87943754196167, + "epoch": 0.1478968256120971, + "grad_norm": 7.931858062744141, + "learning_rate": 6.578362547106202e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.8353148818016052, + "num_tokens": 57336086.0, + "step": 47710 + }, + { + "entropy": 1.9901050910353661, + "epoch": 0.1479278247371468, + "grad_norm": 8.560962677001953, + "learning_rate": 6.5776732297248805e-06, + "loss": 0.5716, + "mean_token_accuracy": 0.8269386544823647, + "num_tokens": 57347719.0, + "step": 47720 + }, + { + "entropy": 2.0061254844069483, + "epoch": 0.1479588238621965, + "grad_norm": 7.655875205993652, + "learning_rate": 6.576984128989673e-06, + "loss": 0.5468, + "mean_token_accuracy": 0.837123441696167, + "num_tokens": 57359271.0, + "step": 47730 + }, + { + "entropy": 2.0044953674077988, + "epoch": 0.1479898229872462, + "grad_norm": 8.174701690673828, + "learning_rate": 6.576295244787121e-06, + "loss": 0.5825, + "mean_token_accuracy": 0.8275741189718246, + "num_tokens": 57370109.0, + "step": 47740 + }, + { + "entropy": 1.8147762969136239, + "epoch": 0.1480208221122959, + "grad_norm": 9.397214889526367, + "learning_rate": 6.575606577003847e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8475957497954368, + "num_tokens": 57383452.0, + "step": 47750 + }, + { + "entropy": 1.9344044476747513, + "epoch": 0.1480518212373456, + "grad_norm": 9.833171844482422, + "learning_rate": 6.574918125526558e-06, + "loss": 0.5411, + "mean_token_accuracy": 0.8328167483210563, + "num_tokens": 57394798.0, + "step": 47760 + }, + { + "entropy": 1.8636110588908195, + "epoch": 0.14808282036239528, + "grad_norm": 3.9326298236846924, + "learning_rate": 6.574229890242045e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8495140254497529, + "num_tokens": 57407317.0, + "step": 47770 + }, + { + "entropy": 1.8715795949101448, + "epoch": 0.14811381948744495, + "grad_norm": 4.289327621459961, + "learning_rate": 6.57354187103718e-06, + "loss": 0.517, + "mean_token_accuracy": 0.8313918590545655, + "num_tokens": 57419698.0, + "step": 47780 + }, + { + "entropy": 1.8456829696893693, + "epoch": 0.14814481861249465, + "grad_norm": 2.9080862998962402, + "learning_rate": 6.57285406779892e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8416327074170112, + "num_tokens": 57432183.0, + "step": 47790 + }, + { + "entropy": 1.8631455272436142, + "epoch": 0.14817581773754435, + "grad_norm": 8.882286071777344, + "learning_rate": 6.5721664804143015e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8395325362682342, + "num_tokens": 57444434.0, + "step": 47800 + }, + { + "entropy": 1.8976523950695992, + "epoch": 0.14820681686259404, + "grad_norm": 9.217388153076172, + "learning_rate": 6.5714791087704465e-06, + "loss": 0.5584, + "mean_token_accuracy": 0.8214605495333671, + "num_tokens": 57456138.0, + "step": 47810 + }, + { + "entropy": 1.9854907482862472, + "epoch": 0.14823781598764374, + "grad_norm": 9.021675109863281, + "learning_rate": 6.570791952754559e-06, + "loss": 0.5766, + "mean_token_accuracy": 0.8244616031646729, + "num_tokens": 57466708.0, + "step": 47820 + }, + { + "entropy": 1.9569459453225135, + "epoch": 0.14826881511269344, + "grad_norm": 9.533801078796387, + "learning_rate": 6.570105012253927e-06, + "loss": 0.5676, + "mean_token_accuracy": 0.8229658871889114, + "num_tokens": 57478273.0, + "step": 47830 + }, + { + "entropy": 1.85792086571455, + "epoch": 0.14829981423774313, + "grad_norm": 9.635293960571289, + "learning_rate": 6.569418287155915e-06, + "loss": 0.518, + "mean_token_accuracy": 0.8407026007771492, + "num_tokens": 57491220.0, + "step": 47840 + }, + { + "entropy": 1.906779670715332, + "epoch": 0.14833081336279283, + "grad_norm": 9.097919464111328, + "learning_rate": 6.568731777347978e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.8338428869843483, + "num_tokens": 57503621.0, + "step": 47850 + }, + { + "entropy": 1.8580841287970542, + "epoch": 0.14836181248784253, + "grad_norm": 9.157151222229004, + "learning_rate": 6.568045482717649e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8528664171695709, + "num_tokens": 57515610.0, + "step": 47860 + }, + { + "entropy": 1.839845283329487, + "epoch": 0.14839281161289222, + "grad_norm": 4.290517807006836, + "learning_rate": 6.567359403152542e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8425501629710197, + "num_tokens": 57528868.0, + "step": 47870 + }, + { + "entropy": 1.8454848155379295, + "epoch": 0.14842381073794192, + "grad_norm": 4.269301891326904, + "learning_rate": 6.566673538540357e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8409277200698853, + "num_tokens": 57542104.0, + "step": 47880 + }, + { + "entropy": 1.8853970304131509, + "epoch": 0.14845480986299162, + "grad_norm": 8.715253829956055, + "learning_rate": 6.5659878887688726e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8417058885097504, + "num_tokens": 57555226.0, + "step": 47890 + }, + { + "entropy": 1.905742235481739, + "epoch": 0.1484858089880413, + "grad_norm": 8.63672161102295, + "learning_rate": 6.56530245372595e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.8448764130473136, + "num_tokens": 57567249.0, + "step": 47900 + }, + { + "entropy": 1.9830147728323937, + "epoch": 0.148516808113091, + "grad_norm": 7.22813606262207, + "learning_rate": 6.564617233299536e-06, + "loss": 0.5724, + "mean_token_accuracy": 0.8194261863827705, + "num_tokens": 57579388.0, + "step": 47910 + }, + { + "entropy": 1.769668859243393, + "epoch": 0.1485478072381407, + "grad_norm": 4.5545125007629395, + "learning_rate": 6.563932227377654e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8449043944478035, + "num_tokens": 57594365.0, + "step": 47920 + }, + { + "entropy": 1.802185168862343, + "epoch": 0.1485788063631904, + "grad_norm": 10.426836013793945, + "learning_rate": 6.563247435848412e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8285350769758224, + "num_tokens": 57608164.0, + "step": 47930 + }, + { + "entropy": 1.88195166811347, + "epoch": 0.1486098054882401, + "grad_norm": 7.976967811584473, + "learning_rate": 6.5625628586e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8379166424274445, + "num_tokens": 57621398.0, + "step": 47940 + }, + { + "entropy": 1.8815200373530387, + "epoch": 0.1486408046132898, + "grad_norm": 4.0617876052856445, + "learning_rate": 6.561878495520689e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.8357270896434784, + "num_tokens": 57633368.0, + "step": 47950 + }, + { + "entropy": 1.8966475576162338, + "epoch": 0.1486718037383395, + "grad_norm": 7.131133079528809, + "learning_rate": 6.56119434649883e-06, + "loss": 0.5513, + "mean_token_accuracy": 0.8241571202874184, + "num_tokens": 57645429.0, + "step": 47960 + }, + { + "entropy": 1.970197968184948, + "epoch": 0.1487028028633892, + "grad_norm": 8.698470115661621, + "learning_rate": 6.5605104114228565e-06, + "loss": 0.5695, + "mean_token_accuracy": 0.8253947734832764, + "num_tokens": 57656725.0, + "step": 47970 + }, + { + "entropy": 1.8682085782289506, + "epoch": 0.14873380198843889, + "grad_norm": 7.869755744934082, + "learning_rate": 6.5598266901812866e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8386846721172333, + "num_tokens": 57668875.0, + "step": 47980 + }, + { + "entropy": 1.9324923619627952, + "epoch": 0.14876480111348858, + "grad_norm": 8.918335914611816, + "learning_rate": 6.559143182662716e-06, + "loss": 0.5732, + "mean_token_accuracy": 0.8326233476400375, + "num_tokens": 57680511.0, + "step": 47990 + }, + { + "entropy": 1.8512993976473808, + "epoch": 0.14879580023853828, + "grad_norm": 3.6257503032684326, + "learning_rate": 6.55845988875582e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8437419295310974, + "num_tokens": 57693483.0, + "step": 48000 + }, + { + "entropy": 1.877199736237526, + "epoch": 0.14882679936358797, + "grad_norm": 4.235811233520508, + "learning_rate": 6.557776808349361e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8381939142942428, + "num_tokens": 57705721.0, + "step": 48010 + }, + { + "entropy": 1.969959369301796, + "epoch": 0.14885779848863767, + "grad_norm": 8.612361907958984, + "learning_rate": 6.557093941332177e-06, + "loss": 0.594, + "mean_token_accuracy": 0.8242570266127587, + "num_tokens": 57716430.0, + "step": 48020 + }, + { + "entropy": 1.9459771722555161, + "epoch": 0.14888879761368734, + "grad_norm": 4.5910749435424805, + "learning_rate": 6.556411287593189e-06, + "loss": 0.5514, + "mean_token_accuracy": 0.824884532392025, + "num_tokens": 57728220.0, + "step": 48030 + }, + { + "entropy": 1.8936843484640122, + "epoch": 0.14891979673873704, + "grad_norm": 9.532984733581543, + "learning_rate": 6.5557288470214e-06, + "loss": 0.5743, + "mean_token_accuracy": 0.8205350682139396, + "num_tokens": 57740471.0, + "step": 48040 + }, + { + "entropy": 1.8339029610157014, + "epoch": 0.14895079586378673, + "grad_norm": 3.72184157371521, + "learning_rate": 6.555046619505892e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.8391190290451049, + "num_tokens": 57752802.0, + "step": 48050 + }, + { + "entropy": 1.9061024576425551, + "epoch": 0.14898179498883643, + "grad_norm": 7.634225845336914, + "learning_rate": 6.554364604935828e-06, + "loss": 0.5458, + "mean_token_accuracy": 0.8316650420427323, + "num_tokens": 57764533.0, + "step": 48060 + }, + { + "entropy": 1.9208767369389534, + "epoch": 0.14901279411388613, + "grad_norm": 9.036537170410156, + "learning_rate": 6.5536828032004554e-06, + "loss": 0.5611, + "mean_token_accuracy": 0.831520353257656, + "num_tokens": 57776014.0, + "step": 48070 + }, + { + "entropy": 1.8474348559975624, + "epoch": 0.14904379323893582, + "grad_norm": 4.276791095733643, + "learning_rate": 6.553001214189095e-06, + "loss": 0.507, + "mean_token_accuracy": 0.8321506321430207, + "num_tokens": 57788876.0, + "step": 48080 + }, + { + "entropy": 1.8484969601035117, + "epoch": 0.14907479236398552, + "grad_norm": 9.312860488891602, + "learning_rate": 6.552319837791156e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.839774203300476, + "num_tokens": 57801080.0, + "step": 48090 + }, + { + "entropy": 1.8271625474095345, + "epoch": 0.14910579148903522, + "grad_norm": 3.9336740970611572, + "learning_rate": 6.551638673896124e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8387017279863358, + "num_tokens": 57813838.0, + "step": 48100 + }, + { + "entropy": 2.0055576503276824, + "epoch": 0.1491367906140849, + "grad_norm": 8.840508460998535, + "learning_rate": 6.550957722393561e-06, + "loss": 0.602, + "mean_token_accuracy": 0.823385763168335, + "num_tokens": 57824304.0, + "step": 48110 + }, + { + "entropy": 1.8011847533285619, + "epoch": 0.1491677897391346, + "grad_norm": 7.309523582458496, + "learning_rate": 6.55027698317312e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8421018213033676, + "num_tokens": 57838533.0, + "step": 48120 + }, + { + "entropy": 1.8618897780776025, + "epoch": 0.1491987888641843, + "grad_norm": 8.072103500366211, + "learning_rate": 6.549596456124524e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.8348542749881744, + "num_tokens": 57850689.0, + "step": 48130 + }, + { + "entropy": 1.9547663122415542, + "epoch": 0.149229787989234, + "grad_norm": 7.6012349128723145, + "learning_rate": 6.548916141137581e-06, + "loss": 0.5302, + "mean_token_accuracy": 0.8345204189419746, + "num_tokens": 57861907.0, + "step": 48140 + }, + { + "entropy": 1.9206694543361664, + "epoch": 0.1492607871142837, + "grad_norm": 9.59740924835205, + "learning_rate": 6.548236038102178e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8426736682653427, + "num_tokens": 57873874.0, + "step": 48150 + }, + { + "entropy": 1.9666424721479416, + "epoch": 0.1492917862393334, + "grad_norm": 9.023880958557129, + "learning_rate": 6.547556146908285e-06, + "loss": 0.5622, + "mean_token_accuracy": 0.8249856010079384, + "num_tokens": 57885436.0, + "step": 48160 + }, + { + "entropy": 1.926368948817253, + "epoch": 0.1493227853643831, + "grad_norm": 9.346840858459473, + "learning_rate": 6.546876467445947e-06, + "loss": 0.6086, + "mean_token_accuracy": 0.8210124105215073, + "num_tokens": 57896563.0, + "step": 48170 + }, + { + "entropy": 1.967759844660759, + "epoch": 0.1493537844894328, + "grad_norm": 8.98388385772705, + "learning_rate": 6.546196999605291e-06, + "loss": 0.5438, + "mean_token_accuracy": 0.8355759829282761, + "num_tokens": 57907605.0, + "step": 48180 + }, + { + "entropy": 1.9306065008044242, + "epoch": 0.14938478361448249, + "grad_norm": 6.92803955078125, + "learning_rate": 6.545517743276522e-06, + "loss": 0.5503, + "mean_token_accuracy": 0.8288378581404686, + "num_tokens": 57918939.0, + "step": 48190 + }, + { + "entropy": 1.871036571264267, + "epoch": 0.14941578273953218, + "grad_norm": 5.162753582000732, + "learning_rate": 6.544838698349932e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.8356782793998718, + "num_tokens": 57931205.0, + "step": 48200 + }, + { + "entropy": 1.8842888280749321, + "epoch": 0.14944678186458188, + "grad_norm": 9.374017715454102, + "learning_rate": 6.5441598647158835e-06, + "loss": 0.5492, + "mean_token_accuracy": 0.8327220484614373, + "num_tokens": 57942778.0, + "step": 48210 + }, + { + "entropy": 1.8962712571024896, + "epoch": 0.14947778098963158, + "grad_norm": 8.113261222839355, + "learning_rate": 6.543481242264823e-06, + "loss": 0.5285, + "mean_token_accuracy": 0.8363337010145188, + "num_tokens": 57955128.0, + "step": 48220 + }, + { + "entropy": 1.9268214851617813, + "epoch": 0.14950878011468127, + "grad_norm": 9.561257362365723, + "learning_rate": 6.542802830887277e-06, + "loss": 0.5674, + "mean_token_accuracy": 0.8345524996519089, + "num_tokens": 57966427.0, + "step": 48230 + }, + { + "entropy": 1.8193738594651223, + "epoch": 0.14953977923973097, + "grad_norm": 7.488965034484863, + "learning_rate": 6.542124630473848e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8413304805755615, + "num_tokens": 57980642.0, + "step": 48240 + }, + { + "entropy": 1.8412556283175945, + "epoch": 0.14957077836478067, + "grad_norm": 4.181149959564209, + "learning_rate": 6.541446640915224e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.840462064743042, + "num_tokens": 57994002.0, + "step": 48250 + }, + { + "entropy": 1.8988297596573829, + "epoch": 0.14960177748983036, + "grad_norm": 7.6107025146484375, + "learning_rate": 6.540768862102166e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8412007540464401, + "num_tokens": 58005701.0, + "step": 48260 + }, + { + "entropy": 1.860750602185726, + "epoch": 0.14963277661488006, + "grad_norm": 9.468915939331055, + "learning_rate": 6.5400912939255156e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8450239017605782, + "num_tokens": 58018246.0, + "step": 48270 + }, + { + "entropy": 1.9176729574799538, + "epoch": 0.14966377573992973, + "grad_norm": 5.558367729187012, + "learning_rate": 6.5394139362761964e-06, + "loss": 0.5402, + "mean_token_accuracy": 0.8284741297364235, + "num_tokens": 58030012.0, + "step": 48280 + }, + { + "entropy": 1.8357847198843955, + "epoch": 0.14969477486497942, + "grad_norm": 4.411746501922607, + "learning_rate": 6.5387367890452105e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8405596747994423, + "num_tokens": 58042890.0, + "step": 48290 + }, + { + "entropy": 1.8932993397116662, + "epoch": 0.14972577399002912, + "grad_norm": 8.260334014892578, + "learning_rate": 6.538059852123636e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8374821558594704, + "num_tokens": 58055225.0, + "step": 48300 + }, + { + "entropy": 1.7874238356947898, + "epoch": 0.14975677311507882, + "grad_norm": 10.80783462524414, + "learning_rate": 6.537383125402632e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8481561884284019, + "num_tokens": 58068456.0, + "step": 48310 + }, + { + "entropy": 1.9409461870789528, + "epoch": 0.14978777224012851, + "grad_norm": 8.024087905883789, + "learning_rate": 6.536706608773437e-06, + "loss": 0.5432, + "mean_token_accuracy": 0.835419024527073, + "num_tokens": 58079470.0, + "step": 48320 + }, + { + "entropy": 1.9292469948530198, + "epoch": 0.1498187713651782, + "grad_norm": 7.896714687347412, + "learning_rate": 6.536030302127366e-06, + "loss": 0.5654, + "mean_token_accuracy": 0.8290838211774826, + "num_tokens": 58090946.0, + "step": 48330 + }, + { + "entropy": 1.878405897319317, + "epoch": 0.1498497704902279, + "grad_norm": 11.065820693969727, + "learning_rate": 6.535354205355815e-06, + "loss": 0.5846, + "mean_token_accuracy": 0.8164193764328956, + "num_tokens": 58102826.0, + "step": 48340 + }, + { + "entropy": 1.9426892310380937, + "epoch": 0.1498807696152776, + "grad_norm": 8.384618759155273, + "learning_rate": 6.534678318350258e-06, + "loss": 0.5697, + "mean_token_accuracy": 0.825680835545063, + "num_tokens": 58113587.0, + "step": 48350 + }, + { + "entropy": 1.8836750328540801, + "epoch": 0.1499117687403273, + "grad_norm": 9.055339813232422, + "learning_rate": 6.534002641002247e-06, + "loss": 0.5524, + "mean_token_accuracy": 0.8372475564479828, + "num_tokens": 58125471.0, + "step": 48360 + }, + { + "entropy": 1.9558273077011108, + "epoch": 0.149942767865377, + "grad_norm": 8.898198127746582, + "learning_rate": 6.533327173203413e-06, + "loss": 0.5413, + "mean_token_accuracy": 0.8336478710174561, + "num_tokens": 58137376.0, + "step": 48370 + }, + { + "entropy": 1.9481865465641022, + "epoch": 0.1499737669904267, + "grad_norm": 10.591567993164062, + "learning_rate": 6.532651914845465e-06, + "loss": 0.5301, + "mean_token_accuracy": 0.8392414048314094, + "num_tokens": 58148589.0, + "step": 48380 + }, + { + "entropy": 1.8466496154665948, + "epoch": 0.1500047661154764, + "grad_norm": 9.16861629486084, + "learning_rate": 6.531976865820191e-06, + "loss": 0.5262, + "mean_token_accuracy": 0.836252911388874, + "num_tokens": 58160303.0, + "step": 48390 + }, + { + "entropy": 1.8699641510844232, + "epoch": 0.1500357652405261, + "grad_norm": 8.319191932678223, + "learning_rate": 6.531302026019457e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8408505141735076, + "num_tokens": 58172442.0, + "step": 48400 + }, + { + "entropy": 1.9123701184988022, + "epoch": 0.15006676436557578, + "grad_norm": 7.481304168701172, + "learning_rate": 6.530627395335206e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.83611781001091, + "num_tokens": 58184383.0, + "step": 48410 + }, + { + "entropy": 1.9331264093518257, + "epoch": 0.15009776349062548, + "grad_norm": 8.620382308959961, + "learning_rate": 6.529952973659459e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8453733563423157, + "num_tokens": 58195683.0, + "step": 48420 + }, + { + "entropy": 1.8736334875226022, + "epoch": 0.15012876261567518, + "grad_norm": 7.600832939147949, + "learning_rate": 6.52927876088432e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.8397956430912018, + "num_tokens": 58207322.0, + "step": 48430 + }, + { + "entropy": 1.887729911506176, + "epoch": 0.15015976174072487, + "grad_norm": 9.004258155822754, + "learning_rate": 6.5286047569019626e-06, + "loss": 0.5415, + "mean_token_accuracy": 0.8307413533329964, + "num_tokens": 58219063.0, + "step": 48440 + }, + { + "entropy": 1.8791867524385453, + "epoch": 0.15019076086577457, + "grad_norm": 8.612053871154785, + "learning_rate": 6.5279309616046475e-06, + "loss": 0.5384, + "mean_token_accuracy": 0.8294597789645195, + "num_tokens": 58230800.0, + "step": 48450 + }, + { + "entropy": 1.8117628186941146, + "epoch": 0.15022175999082427, + "grad_norm": 7.665522575378418, + "learning_rate": 6.527257374884704e-06, + "loss": 0.5374, + "mean_token_accuracy": 0.8396688923239708, + "num_tokens": 58243429.0, + "step": 48460 + }, + { + "entropy": 1.961036217212677, + "epoch": 0.15025275911587396, + "grad_norm": 9.192258834838867, + "learning_rate": 6.5265839966345466e-06, + "loss": 0.5953, + "mean_token_accuracy": 0.819936765730381, + "num_tokens": 58254745.0, + "step": 48470 + }, + { + "entropy": 1.671479968726635, + "epoch": 0.15028375824092366, + "grad_norm": 4.934950828552246, + "learning_rate": 6.5259108267466635e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8538110002875328, + "num_tokens": 58270043.0, + "step": 48480 + }, + { + "entropy": 1.9253578931093216, + "epoch": 0.15031475736597336, + "grad_norm": 8.208556175231934, + "learning_rate": 6.525237865113621e-06, + "loss": 0.5404, + "mean_token_accuracy": 0.8374222457408905, + "num_tokens": 58281799.0, + "step": 48490 + }, + { + "entropy": 1.8729837134480476, + "epoch": 0.15034575649102305, + "grad_norm": 9.998933792114258, + "learning_rate": 6.524565111628065e-06, + "loss": 0.5488, + "mean_token_accuracy": 0.8276918828487396, + "num_tokens": 58293623.0, + "step": 48500 + }, + { + "entropy": 1.7981028646230697, + "epoch": 0.15037675561607275, + "grad_norm": 5.204981327056885, + "learning_rate": 6.523892566182717e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8532559484243393, + "num_tokens": 58306840.0, + "step": 48510 + }, + { + "entropy": 1.884388868510723, + "epoch": 0.15040775474112242, + "grad_norm": 9.6846342086792, + "learning_rate": 6.523220228670375e-06, + "loss": 0.5532, + "mean_token_accuracy": 0.8274782180786133, + "num_tokens": 58318898.0, + "step": 48520 + }, + { + "entropy": 1.9351076558232307, + "epoch": 0.15043875386617211, + "grad_norm": 8.531515121459961, + "learning_rate": 6.522548098983917e-06, + "loss": 0.5733, + "mean_token_accuracy": 0.8250018388032914, + "num_tokens": 58330362.0, + "step": 48530 + }, + { + "entropy": 1.8293303191661834, + "epoch": 0.1504697529912218, + "grad_norm": 9.502824783325195, + "learning_rate": 6.521876177016295e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8435877069830895, + "num_tokens": 58343242.0, + "step": 48540 + }, + { + "entropy": 1.890285351872444, + "epoch": 0.1505007521162715, + "grad_norm": 9.041160583496094, + "learning_rate": 6.521204462660542e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8346679538488389, + "num_tokens": 58355866.0, + "step": 48550 + }, + { + "entropy": 1.8444904461503029, + "epoch": 0.1505317512413212, + "grad_norm": 3.8206639289855957, + "learning_rate": 6.520532955809765e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8404908493161202, + "num_tokens": 58368640.0, + "step": 48560 + }, + { + "entropy": 1.835204230248928, + "epoch": 0.1505627503663709, + "grad_norm": 8.419897079467773, + "learning_rate": 6.5198616563571505e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8446251839399338, + "num_tokens": 58381253.0, + "step": 48570 + }, + { + "entropy": 1.8946281239390372, + "epoch": 0.1505937494914206, + "grad_norm": 9.88506031036377, + "learning_rate": 6.519190564195959e-06, + "loss": 0.5422, + "mean_token_accuracy": 0.8362071871757507, + "num_tokens": 58393895.0, + "step": 48580 + }, + { + "entropy": 1.82159626185894, + "epoch": 0.1506247486164703, + "grad_norm": 3.63238787651062, + "learning_rate": 6.518519679219528e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8525608211755753, + "num_tokens": 58407125.0, + "step": 48590 + }, + { + "entropy": 1.8428357735276222, + "epoch": 0.15065574774152, + "grad_norm": 9.18499755859375, + "learning_rate": 6.517849001321278e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8396696642041206, + "num_tokens": 58420081.0, + "step": 48600 + }, + { + "entropy": 1.8873658314347268, + "epoch": 0.1506867468665697, + "grad_norm": 10.179847717285156, + "learning_rate": 6.517178530394698e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8444565415382386, + "num_tokens": 58432042.0, + "step": 48610 + }, + { + "entropy": 1.844616176187992, + "epoch": 0.15071774599161938, + "grad_norm": 3.9476633071899414, + "learning_rate": 6.516508266333358e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.844041819870472, + "num_tokens": 58444698.0, + "step": 48620 + }, + { + "entropy": 1.8296439558267594, + "epoch": 0.15074874511666908, + "grad_norm": 9.040117263793945, + "learning_rate": 6.5158382090309035e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8471789911389351, + "num_tokens": 58456911.0, + "step": 48630 + }, + { + "entropy": 1.891487744450569, + "epoch": 0.15077974424171878, + "grad_norm": 8.067523956298828, + "learning_rate": 6.515168358381057e-06, + "loss": 0.5691, + "mean_token_accuracy": 0.822045773267746, + "num_tokens": 58468808.0, + "step": 48640 + }, + { + "entropy": 1.9148033902049064, + "epoch": 0.15081074336676847, + "grad_norm": 8.858370780944824, + "learning_rate": 6.514498714277619e-06, + "loss": 0.5551, + "mean_token_accuracy": 0.8241749078035354, + "num_tokens": 58480696.0, + "step": 48650 + }, + { + "entropy": 1.9157651707530021, + "epoch": 0.15084174249181817, + "grad_norm": 9.38106918334961, + "learning_rate": 6.5138292766144615e-06, + "loss": 0.54, + "mean_token_accuracy": 0.8287394896149636, + "num_tokens": 58492144.0, + "step": 48660 + }, + { + "entropy": 1.9842921257019044, + "epoch": 0.15087274161686787, + "grad_norm": 8.547111511230469, + "learning_rate": 6.5131600452855394e-06, + "loss": 0.5597, + "mean_token_accuracy": 0.8356396153569221, + "num_tokens": 58502452.0, + "step": 48670 + }, + { + "entropy": 1.972530573606491, + "epoch": 0.15090374074191756, + "grad_norm": 9.455718040466309, + "learning_rate": 6.512491020184877e-06, + "loss": 0.5894, + "mean_token_accuracy": 0.8308357432484627, + "num_tokens": 58513985.0, + "step": 48680 + }, + { + "entropy": 1.9101537883281707, + "epoch": 0.15093473986696726, + "grad_norm": 8.880309104919434, + "learning_rate": 6.5118222012065825e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8405994296073913, + "num_tokens": 58525140.0, + "step": 48690 + }, + { + "entropy": 1.946905219554901, + "epoch": 0.15096573899201696, + "grad_norm": 7.35371732711792, + "learning_rate": 6.511153588244832e-06, + "loss": 0.5481, + "mean_token_accuracy": 0.8271761432290077, + "num_tokens": 58536580.0, + "step": 48700 + }, + { + "entropy": 1.947824102640152, + "epoch": 0.15099673811706665, + "grad_norm": 9.245978355407715, + "learning_rate": 6.510485181193884e-06, + "loss": 0.6137, + "mean_token_accuracy": 0.821645250916481, + "num_tokens": 58548298.0, + "step": 48710 + }, + { + "entropy": 1.8544338196516037, + "epoch": 0.15102773724211635, + "grad_norm": 10.973575592041016, + "learning_rate": 6.50981697994807e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8355946585536003, + "num_tokens": 58560605.0, + "step": 48720 + }, + { + "entropy": 1.8731532111763953, + "epoch": 0.15105873636716605, + "grad_norm": 8.846532821655273, + "learning_rate": 6.5091489844017984e-06, + "loss": 0.5354, + "mean_token_accuracy": 0.838229563832283, + "num_tokens": 58573583.0, + "step": 48730 + }, + { + "entropy": 1.8979535773396492, + "epoch": 0.15108973549221574, + "grad_norm": 8.72420883178711, + "learning_rate": 6.5084811944495515e-06, + "loss": 0.5351, + "mean_token_accuracy": 0.8275813281536102, + "num_tokens": 58585343.0, + "step": 48740 + }, + { + "entropy": 1.9084653094410897, + "epoch": 0.15112073461726544, + "grad_norm": 8.924386978149414, + "learning_rate": 6.50781360998589e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8385712206363678, + "num_tokens": 58596594.0, + "step": 48750 + }, + { + "entropy": 1.9596381425857543, + "epoch": 0.15115173374231514, + "grad_norm": 8.702342987060547, + "learning_rate": 6.50714623090545e-06, + "loss": 0.5923, + "mean_token_accuracy": 0.8301157906651497, + "num_tokens": 58607471.0, + "step": 48760 + }, + { + "entropy": 1.8487385302782058, + "epoch": 0.1511827328673648, + "grad_norm": 8.954679489135742, + "learning_rate": 6.506479057102942e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8388300389051437, + "num_tokens": 58619562.0, + "step": 48770 + }, + { + "entropy": 1.910484978556633, + "epoch": 0.1512137319924145, + "grad_norm": 9.405644416809082, + "learning_rate": 6.505812088473151e-06, + "loss": 0.5814, + "mean_token_accuracy": 0.8201380014419556, + "num_tokens": 58630856.0, + "step": 48780 + }, + { + "entropy": 1.816239494085312, + "epoch": 0.1512447311174642, + "grad_norm": 4.988278388977051, + "learning_rate": 6.505145324910941e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8467860907316208, + "num_tokens": 58643774.0, + "step": 48790 + }, + { + "entropy": 1.889238955080509, + "epoch": 0.1512757302425139, + "grad_norm": 8.861750602722168, + "learning_rate": 6.504478766311248e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.8347546219825744, + "num_tokens": 58655783.0, + "step": 48800 + }, + { + "entropy": 1.9085273638367652, + "epoch": 0.1513067293675636, + "grad_norm": 9.441707611083984, + "learning_rate": 6.503812412569084e-06, + "loss": 0.523, + "mean_token_accuracy": 0.8369966968894005, + "num_tokens": 58667819.0, + "step": 48810 + }, + { + "entropy": 1.9685264229774475, + "epoch": 0.1513377284926133, + "grad_norm": 10.141180038452148, + "learning_rate": 6.503146263579539e-06, + "loss": 0.5336, + "mean_token_accuracy": 0.8229795888066291, + "num_tokens": 58679528.0, + "step": 48820 + }, + { + "entropy": 1.8758042559027672, + "epoch": 0.15136872761766298, + "grad_norm": 11.18869400024414, + "learning_rate": 6.502480319237775e-06, + "loss": 0.5305, + "mean_token_accuracy": 0.8300367653369903, + "num_tokens": 58691911.0, + "step": 48830 + }, + { + "entropy": 1.8802335485816002, + "epoch": 0.15139972674271268, + "grad_norm": 8.065637588500977, + "learning_rate": 6.5018145794390305e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8387890234589577, + "num_tokens": 58704390.0, + "step": 48840 + }, + { + "entropy": 1.8963167145848274, + "epoch": 0.15143072586776238, + "grad_norm": 5.054469585418701, + "learning_rate": 6.501149044078618e-06, + "loss": 0.5604, + "mean_token_accuracy": 0.8216091677546501, + "num_tokens": 58717245.0, + "step": 48850 + }, + { + "entropy": 1.8458565592765808, + "epoch": 0.15146172499281207, + "grad_norm": 4.077347278594971, + "learning_rate": 6.500483713051927e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8441688820719719, + "num_tokens": 58729499.0, + "step": 48860 + }, + { + "entropy": 1.9472675323486328, + "epoch": 0.15149272411786177, + "grad_norm": 8.823904991149902, + "learning_rate": 6.499818586254422e-06, + "loss": 0.513, + "mean_token_accuracy": 0.8328737184405327, + "num_tokens": 58741160.0, + "step": 48870 + }, + { + "entropy": 1.9384382277727128, + "epoch": 0.15152372324291147, + "grad_norm": 8.922835350036621, + "learning_rate": 6.499153663581638e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8393163681030273, + "num_tokens": 58752947.0, + "step": 48880 + }, + { + "entropy": 1.9302797958254814, + "epoch": 0.15155472236796116, + "grad_norm": 8.566751480102539, + "learning_rate": 6.49848894492919e-06, + "loss": 0.5269, + "mean_token_accuracy": 0.8295559108257293, + "num_tokens": 58765340.0, + "step": 48890 + }, + { + "entropy": 1.8543498650193215, + "epoch": 0.15158572149301086, + "grad_norm": 8.253662109375, + "learning_rate": 6.497824430192765e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8355495229363441, + "num_tokens": 58778000.0, + "step": 48900 + }, + { + "entropy": 1.874028617143631, + "epoch": 0.15161672061806056, + "grad_norm": 5.289228916168213, + "learning_rate": 6.497160119268126e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8463690742850304, + "num_tokens": 58791118.0, + "step": 48910 + }, + { + "entropy": 1.9725682616233826, + "epoch": 0.15164771974311025, + "grad_norm": 8.66529655456543, + "learning_rate": 6.496496012051109e-06, + "loss": 0.6059, + "mean_token_accuracy": 0.8233085095882415, + "num_tokens": 58802602.0, + "step": 48920 + }, + { + "entropy": 1.9239532873034477, + "epoch": 0.15167871886815995, + "grad_norm": 9.758983612060547, + "learning_rate": 6.495832108437626e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8342737898230552, + "num_tokens": 58814927.0, + "step": 48930 + }, + { + "entropy": 1.846892774105072, + "epoch": 0.15170971799320965, + "grad_norm": 4.085844993591309, + "learning_rate": 6.495168408323665e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8468069404363632, + "num_tokens": 58827118.0, + "step": 48940 + }, + { + "entropy": 1.9423293888568878, + "epoch": 0.15174071711825934, + "grad_norm": 9.48098373413086, + "learning_rate": 6.4945049116052795e-06, + "loss": 0.5816, + "mean_token_accuracy": 0.821548655629158, + "num_tokens": 58838717.0, + "step": 48950 + }, + { + "entropy": 1.9620625630021096, + "epoch": 0.15177171624330904, + "grad_norm": 7.364303112030029, + "learning_rate": 6.493841618178611e-06, + "loss": 0.5329, + "mean_token_accuracy": 0.8329650431871414, + "num_tokens": 58850165.0, + "step": 48960 + }, + { + "entropy": 1.7987186387181282, + "epoch": 0.15180271536835874, + "grad_norm": 8.708568572998047, + "learning_rate": 6.4931785279398666e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8545974805951119, + "num_tokens": 58863232.0, + "step": 48970 + }, + { + "entropy": 1.92343827188015, + "epoch": 0.15183371449340843, + "grad_norm": 8.546026229858398, + "learning_rate": 6.4925156407853275e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8306387692689896, + "num_tokens": 58874670.0, + "step": 48980 + }, + { + "entropy": 1.8594204097986222, + "epoch": 0.15186471361845813, + "grad_norm": 3.8005247116088867, + "learning_rate": 6.491852956611351e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8334858074784279, + "num_tokens": 58887318.0, + "step": 48990 + }, + { + "entropy": 1.939525455236435, + "epoch": 0.15189571274350783, + "grad_norm": 7.96549129486084, + "learning_rate": 6.4911904753143696e-06, + "loss": 0.5349, + "mean_token_accuracy": 0.8367563143372536, + "num_tokens": 58898840.0, + "step": 49000 + }, + { + "entropy": 1.8316703870892526, + "epoch": 0.15192671186855752, + "grad_norm": 7.305845737457275, + "learning_rate": 6.490528196790886e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.8301277205348014, + "num_tokens": 58911661.0, + "step": 49010 + }, + { + "entropy": 1.808566428720951, + "epoch": 0.1519577109936072, + "grad_norm": 7.7132792472839355, + "learning_rate": 6.489866120937483e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.851980808377266, + "num_tokens": 58924530.0, + "step": 49020 + }, + { + "entropy": 1.9115712746977807, + "epoch": 0.1519887101186569, + "grad_norm": 8.088088989257812, + "learning_rate": 6.489204247650809e-06, + "loss": 0.5556, + "mean_token_accuracy": 0.832783767580986, + "num_tokens": 58936710.0, + "step": 49030 + }, + { + "entropy": 1.7950451903045177, + "epoch": 0.15201970924370659, + "grad_norm": 7.494925022125244, + "learning_rate": 6.4885425768275945e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8493603855371475, + "num_tokens": 58950735.0, + "step": 49040 + }, + { + "entropy": 1.8894059650599957, + "epoch": 0.15205070836875628, + "grad_norm": 7.540571212768555, + "learning_rate": 6.487881108364637e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.835996463894844, + "num_tokens": 58962971.0, + "step": 49050 + }, + { + "entropy": 1.7538513764739037, + "epoch": 0.15208170749380598, + "grad_norm": 4.750351905822754, + "learning_rate": 6.487219842158812e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8489253923296929, + "num_tokens": 58976766.0, + "step": 49060 + }, + { + "entropy": 1.7878063380718232, + "epoch": 0.15211270661885568, + "grad_norm": 3.8629848957061768, + "learning_rate": 6.486558778107066e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8460006847977638, + "num_tokens": 58990175.0, + "step": 49070 + }, + { + "entropy": 1.8979044020175935, + "epoch": 0.15214370574390537, + "grad_norm": 10.183228492736816, + "learning_rate": 6.485897916106419e-06, + "loss": 0.5256, + "mean_token_accuracy": 0.8209685668349266, + "num_tokens": 59002434.0, + "step": 49080 + }, + { + "entropy": 1.936001867055893, + "epoch": 0.15217470486895507, + "grad_norm": 10.987131118774414, + "learning_rate": 6.485237256053968e-06, + "loss": 0.5699, + "mean_token_accuracy": 0.8229846864938736, + "num_tokens": 59013865.0, + "step": 49090 + }, + { + "entropy": 1.7930354595184326, + "epoch": 0.15220570399400477, + "grad_norm": 8.664037704467773, + "learning_rate": 6.484576797846879e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8444106802344322, + "num_tokens": 59026905.0, + "step": 49100 + }, + { + "entropy": 1.848856683075428, + "epoch": 0.15223670311905446, + "grad_norm": 8.302072525024414, + "learning_rate": 6.4839165413823935e-06, + "loss": 0.551, + "mean_token_accuracy": 0.8347058981657028, + "num_tokens": 59039328.0, + "step": 49110 + }, + { + "entropy": 1.754729336500168, + "epoch": 0.15226770224410416, + "grad_norm": 8.616592407226562, + "learning_rate": 6.483256486557824e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8488777309656144, + "num_tokens": 59052355.0, + "step": 49120 + }, + { + "entropy": 1.854713924229145, + "epoch": 0.15229870136915385, + "grad_norm": 10.268636703491211, + "learning_rate": 6.482596633270561e-06, + "loss": 0.5598, + "mean_token_accuracy": 0.8265779912471771, + "num_tokens": 59063971.0, + "step": 49130 + }, + { + "entropy": 1.860026153922081, + "epoch": 0.15232970049420355, + "grad_norm": 7.569554805755615, + "learning_rate": 6.481936981418064e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.8441597208380699, + "num_tokens": 59075534.0, + "step": 49140 + }, + { + "entropy": 1.9424165233969688, + "epoch": 0.15236069961925325, + "grad_norm": 7.921428680419922, + "learning_rate": 6.481277530897865e-06, + "loss": 0.603, + "mean_token_accuracy": 0.8199049532413483, + "num_tokens": 59086811.0, + "step": 49150 + }, + { + "entropy": 1.862628909945488, + "epoch": 0.15239169874430294, + "grad_norm": 7.977451801300049, + "learning_rate": 6.480618281607572e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8402055114507675, + "num_tokens": 59099011.0, + "step": 49160 + }, + { + "entropy": 1.8622022837400436, + "epoch": 0.15242269786935264, + "grad_norm": 8.97208309173584, + "learning_rate": 6.479959233444862e-06, + "loss": 0.5266, + "mean_token_accuracy": 0.8342375844717026, + "num_tokens": 59110548.0, + "step": 49170 + }, + { + "entropy": 1.8860869765281678, + "epoch": 0.15245369699440234, + "grad_norm": 8.571884155273438, + "learning_rate": 6.47930038630749e-06, + "loss": 0.5412, + "mean_token_accuracy": 0.8293202951550483, + "num_tokens": 59121640.0, + "step": 49180 + }, + { + "entropy": 1.8871512919664384, + "epoch": 0.15248469611945203, + "grad_norm": 8.496338844299316, + "learning_rate": 6.478641740093281e-06, + "loss": 0.5286, + "mean_token_accuracy": 0.8324317663908005, + "num_tokens": 59133528.0, + "step": 49190 + }, + { + "entropy": 1.9237959653139114, + "epoch": 0.15251569524450173, + "grad_norm": 7.444377422332764, + "learning_rate": 6.4779832947001306e-06, + "loss": 0.5512, + "mean_token_accuracy": 0.8336405113339425, + "num_tokens": 59144958.0, + "step": 49200 + }, + { + "entropy": 1.9096000641584396, + "epoch": 0.15254669436955143, + "grad_norm": 10.638242721557617, + "learning_rate": 6.47732505002601e-06, + "loss": 0.5257, + "mean_token_accuracy": 0.836242513358593, + "num_tokens": 59155880.0, + "step": 49210 + }, + { + "entropy": 1.895295462012291, + "epoch": 0.15257769349460112, + "grad_norm": 8.52206802368164, + "learning_rate": 6.4766670059689615e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.834420631825924, + "num_tokens": 59168200.0, + "step": 49220 + }, + { + "entropy": 1.8659065082669257, + "epoch": 0.15260869261965082, + "grad_norm": 8.12820816040039, + "learning_rate": 6.476009162427102e-06, + "loss": 0.6056, + "mean_token_accuracy": 0.8158292233943939, + "num_tokens": 59180782.0, + "step": 49230 + }, + { + "entropy": 1.8979418486356736, + "epoch": 0.15263969174470052, + "grad_norm": 10.203886985778809, + "learning_rate": 6.475351519298617e-06, + "loss": 0.5562, + "mean_token_accuracy": 0.836780446767807, + "num_tokens": 59192007.0, + "step": 49240 + }, + { + "entropy": 1.7612644746899604, + "epoch": 0.15267069086975021, + "grad_norm": 7.155702114105225, + "learning_rate": 6.474694076481769e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8396282330155372, + "num_tokens": 59205137.0, + "step": 49250 + }, + { + "entropy": 1.8855042546987533, + "epoch": 0.15270168999479988, + "grad_norm": 10.510095596313477, + "learning_rate": 6.474036833874888e-06, + "loss": 0.5486, + "mean_token_accuracy": 0.8316191598773003, + "num_tokens": 59216398.0, + "step": 49260 + }, + { + "entropy": 1.9063111320137978, + "epoch": 0.15273268911984958, + "grad_norm": 8.204238891601562, + "learning_rate": 6.4733797913763806e-06, + "loss": 0.5644, + "mean_token_accuracy": 0.8286448165774345, + "num_tokens": 59228148.0, + "step": 49270 + }, + { + "entropy": 1.8491440996527673, + "epoch": 0.15276368824489928, + "grad_norm": 4.404919147491455, + "learning_rate": 6.472722948884723e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8485887929797172, + "num_tokens": 59240379.0, + "step": 49280 + }, + { + "entropy": 1.8710645034909248, + "epoch": 0.15279468736994897, + "grad_norm": 4.196066856384277, + "learning_rate": 6.472066306298462e-06, + "loss": 0.5844, + "mean_token_accuracy": 0.821666119992733, + "num_tokens": 59251646.0, + "step": 49290 + }, + { + "entropy": 1.8651590749621392, + "epoch": 0.15282568649499867, + "grad_norm": 9.875333786010742, + "learning_rate": 6.471409863516221e-06, + "loss": 0.5306, + "mean_token_accuracy": 0.8275208979845047, + "num_tokens": 59263652.0, + "step": 49300 + }, + { + "entropy": 1.9215037196874618, + "epoch": 0.15285668562004837, + "grad_norm": 9.746886253356934, + "learning_rate": 6.470753620436694e-06, + "loss": 0.5378, + "mean_token_accuracy": 0.8245725408196449, + "num_tokens": 59275186.0, + "step": 49310 + }, + { + "entropy": 1.9342552363872527, + "epoch": 0.15288768474509806, + "grad_norm": 8.793591499328613, + "learning_rate": 6.470097576958641e-06, + "loss": 0.5609, + "mean_token_accuracy": 0.8296951711177826, + "num_tokens": 59286058.0, + "step": 49320 + }, + { + "entropy": 1.8291342303156852, + "epoch": 0.15291868387014776, + "grad_norm": 8.690282821655273, + "learning_rate": 6.469441732980904e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8339086413383484, + "num_tokens": 59298932.0, + "step": 49330 + }, + { + "entropy": 1.8255377933382988, + "epoch": 0.15294968299519746, + "grad_norm": 10.050409317016602, + "learning_rate": 6.468786088402388e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8509842753410339, + "num_tokens": 59311303.0, + "step": 49340 + }, + { + "entropy": 1.871800681948662, + "epoch": 0.15298068212024715, + "grad_norm": 8.856807708740234, + "learning_rate": 6.468130643122074e-06, + "loss": 0.5604, + "mean_token_accuracy": 0.8348386570811271, + "num_tokens": 59322970.0, + "step": 49350 + }, + { + "entropy": 1.8413175642490387, + "epoch": 0.15301168124529685, + "grad_norm": 10.395611763000488, + "learning_rate": 6.4674753970390126e-06, + "loss": 0.5536, + "mean_token_accuracy": 0.825497391819954, + "num_tokens": 59335548.0, + "step": 49360 + }, + { + "entropy": 1.9431413426995277, + "epoch": 0.15304268037034655, + "grad_norm": 8.011473655700684, + "learning_rate": 6.46682035005233e-06, + "loss": 0.6076, + "mean_token_accuracy": 0.8263207510113716, + "num_tokens": 59347158.0, + "step": 49370 + }, + { + "entropy": 1.8601020842790603, + "epoch": 0.15307367949539624, + "grad_norm": 10.249993324279785, + "learning_rate": 6.466165502061217e-06, + "loss": 0.5366, + "mean_token_accuracy": 0.833003306388855, + "num_tokens": 59359361.0, + "step": 49380 + }, + { + "entropy": 1.8623602211475372, + "epoch": 0.15310467862044594, + "grad_norm": 9.118306159973145, + "learning_rate": 6.465510852964943e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.8340568155050277, + "num_tokens": 59371827.0, + "step": 49390 + }, + { + "entropy": 1.9150095269083978, + "epoch": 0.15313567774549564, + "grad_norm": 8.470681190490723, + "learning_rate": 6.464856402662844e-06, + "loss": 0.5721, + "mean_token_accuracy": 0.8367661386728287, + "num_tokens": 59383425.0, + "step": 49400 + }, + { + "entropy": 1.9471410617232323, + "epoch": 0.15316667687054533, + "grad_norm": 3.6562201976776123, + "learning_rate": 6.4642021510543284e-06, + "loss": 0.5508, + "mean_token_accuracy": 0.8367534473538398, + "num_tokens": 59394778.0, + "step": 49410 + }, + { + "entropy": 1.831431895494461, + "epoch": 0.15319767599559503, + "grad_norm": 7.588027000427246, + "learning_rate": 6.463548098038879e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8452267602086068, + "num_tokens": 59407172.0, + "step": 49420 + }, + { + "entropy": 1.8925320595502853, + "epoch": 0.15322867512064473, + "grad_norm": 8.88277816772461, + "learning_rate": 6.462894243516044e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8345058739185334, + "num_tokens": 59418806.0, + "step": 49430 + }, + { + "entropy": 1.8566480353474617, + "epoch": 0.15325967424569442, + "grad_norm": 4.175265789031982, + "learning_rate": 6.462240587385448e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.8374285340309143, + "num_tokens": 59430897.0, + "step": 49440 + }, + { + "entropy": 1.8792894035577774, + "epoch": 0.15329067337074412, + "grad_norm": 11.97266674041748, + "learning_rate": 6.461587129546784e-06, + "loss": 0.5335, + "mean_token_accuracy": 0.8309213414788246, + "num_tokens": 59442625.0, + "step": 49450 + }, + { + "entropy": 1.8707554250955583, + "epoch": 0.15332167249579381, + "grad_norm": 9.689730644226074, + "learning_rate": 6.460933869899815e-06, + "loss": 0.5504, + "mean_token_accuracy": 0.8266577154397965, + "num_tokens": 59454969.0, + "step": 49460 + }, + { + "entropy": 1.912343481183052, + "epoch": 0.1533526716208435, + "grad_norm": 9.543248176574707, + "learning_rate": 6.460280808344378e-06, + "loss": 0.5702, + "mean_token_accuracy": 0.8356236189603805, + "num_tokens": 59467285.0, + "step": 49470 + }, + { + "entropy": 1.8799829974770546, + "epoch": 0.1533836707458932, + "grad_norm": 7.834064960479736, + "learning_rate": 6.459627944780378e-06, + "loss": 0.5296, + "mean_token_accuracy": 0.8366628587245941, + "num_tokens": 59480058.0, + "step": 49480 + }, + { + "entropy": 1.8075303509831429, + "epoch": 0.1534146698709429, + "grad_norm": 3.9849984645843506, + "learning_rate": 6.458975279107794e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8453790143132209, + "num_tokens": 59493616.0, + "step": 49490 + }, + { + "entropy": 1.8907064393162727, + "epoch": 0.1534456689959926, + "grad_norm": 8.766815185546875, + "learning_rate": 6.458322811226673e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.838210554420948, + "num_tokens": 59506402.0, + "step": 49500 + }, + { + "entropy": 1.8980057820677758, + "epoch": 0.15347666812104227, + "grad_norm": 9.522225379943848, + "learning_rate": 6.457670541037133e-06, + "loss": 0.522, + "mean_token_accuracy": 0.8347028121352196, + "num_tokens": 59518065.0, + "step": 49510 + }, + { + "entropy": 1.9182281613349914, + "epoch": 0.15350766724609197, + "grad_norm": 8.175332069396973, + "learning_rate": 6.457018468439363e-06, + "loss": 0.5776, + "mean_token_accuracy": 0.8285543143749237, + "num_tokens": 59528843.0, + "step": 49520 + }, + { + "entropy": 1.8754586443305015, + "epoch": 0.15353866637114166, + "grad_norm": 8.504158020019531, + "learning_rate": 6.456366593333622e-06, + "loss": 0.5776, + "mean_token_accuracy": 0.8261403411626815, + "num_tokens": 59541901.0, + "step": 49530 + }, + { + "entropy": 1.816955418884754, + "epoch": 0.15356966549619136, + "grad_norm": 8.501500129699707, + "learning_rate": 6.455714915620241e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.840816356241703, + "num_tokens": 59555193.0, + "step": 49540 + }, + { + "entropy": 1.8059712588787078, + "epoch": 0.15360066462124106, + "grad_norm": 4.024796009063721, + "learning_rate": 6.45506343519962e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8564706295728683, + "num_tokens": 59568219.0, + "step": 49550 + }, + { + "entropy": 1.932602970302105, + "epoch": 0.15363166374629075, + "grad_norm": 8.853862762451172, + "learning_rate": 6.4544121519722305e-06, + "loss": 0.5538, + "mean_token_accuracy": 0.8353964149951935, + "num_tokens": 59579262.0, + "step": 49560 + }, + { + "entropy": 1.8641065925359726, + "epoch": 0.15366266287134045, + "grad_norm": 7.992645263671875, + "learning_rate": 6.453761065838612e-06, + "loss": 0.5253, + "mean_token_accuracy": 0.842236676812172, + "num_tokens": 59590979.0, + "step": 49570 + }, + { + "entropy": 1.8351068049669266, + "epoch": 0.15369366199639015, + "grad_norm": 11.022222518920898, + "learning_rate": 6.453110176699378e-06, + "loss": 0.5175, + "mean_token_accuracy": 0.836832246184349, + "num_tokens": 59603268.0, + "step": 49580 + }, + { + "entropy": 1.8691863656044005, + "epoch": 0.15372466112143984, + "grad_norm": 10.182769775390625, + "learning_rate": 6.452459484455208e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8315588355064392, + "num_tokens": 59615230.0, + "step": 49590 + }, + { + "entropy": 1.867138534784317, + "epoch": 0.15375566024648954, + "grad_norm": 8.996567726135254, + "learning_rate": 6.451808989006854e-06, + "loss": 0.5441, + "mean_token_accuracy": 0.8165970057249069, + "num_tokens": 59628275.0, + "step": 49600 + }, + { + "entropy": 1.8459146052598954, + "epoch": 0.15378665937153924, + "grad_norm": 4.567706108093262, + "learning_rate": 6.451158690255139e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8464686393737793, + "num_tokens": 59640506.0, + "step": 49610 + }, + { + "entropy": 1.9115084454417228, + "epoch": 0.15381765849658893, + "grad_norm": 8.155994415283203, + "learning_rate": 6.450508588100953e-06, + "loss": 0.5458, + "mean_token_accuracy": 0.8274505391716958, + "num_tokens": 59651786.0, + "step": 49620 + }, + { + "entropy": 1.8388926222920419, + "epoch": 0.15384865762163863, + "grad_norm": 3.681293487548828, + "learning_rate": 6.449858682445258e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.842620424926281, + "num_tokens": 59664048.0, + "step": 49630 + }, + { + "entropy": 1.836945366859436, + "epoch": 0.15387965674668833, + "grad_norm": 9.795785903930664, + "learning_rate": 6.449208973189086e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8397533014416695, + "num_tokens": 59676318.0, + "step": 49640 + }, + { + "entropy": 1.9197256535291671, + "epoch": 0.15391065587173802, + "grad_norm": 9.240389823913574, + "learning_rate": 6.448559460233536e-06, + "loss": 0.5298, + "mean_token_accuracy": 0.8350172653794289, + "num_tokens": 59686954.0, + "step": 49650 + }, + { + "entropy": 1.9680290162563323, + "epoch": 0.15394165499678772, + "grad_norm": 8.425545692443848, + "learning_rate": 6.447910143479779e-06, + "loss": 0.568, + "mean_token_accuracy": 0.8262524694204331, + "num_tokens": 59697663.0, + "step": 49660 + }, + { + "entropy": 1.8470780551433563, + "epoch": 0.15397265412183742, + "grad_norm": 4.306032180786133, + "learning_rate": 6.447261022829057e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.847578053176403, + "num_tokens": 59710534.0, + "step": 49670 + }, + { + "entropy": 1.9209625497460365, + "epoch": 0.1540036532468871, + "grad_norm": 8.741031646728516, + "learning_rate": 6.446612098182679e-06, + "loss": 0.5573, + "mean_token_accuracy": 0.8277770847082138, + "num_tokens": 59721380.0, + "step": 49680 + }, + { + "entropy": 1.7767624616622926, + "epoch": 0.1540346523719368, + "grad_norm": 9.159987449645996, + "learning_rate": 6.445963369442024e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8386806547641754, + "num_tokens": 59734270.0, + "step": 49690 + }, + { + "entropy": 1.9001009285449981, + "epoch": 0.1540656514969865, + "grad_norm": 13.77109432220459, + "learning_rate": 6.4453148365085425e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.8450674623250961, + "num_tokens": 59746960.0, + "step": 49700 + }, + { + "entropy": 1.9309724509716033, + "epoch": 0.1540966506220362, + "grad_norm": 4.395373821258545, + "learning_rate": 6.444666499283752e-06, + "loss": 0.5595, + "mean_token_accuracy": 0.8193286895751953, + "num_tokens": 59758480.0, + "step": 49710 + }, + { + "entropy": 1.8494668424129486, + "epoch": 0.1541276497470859, + "grad_norm": 4.505528926849365, + "learning_rate": 6.444018357669239e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8275418624281883, + "num_tokens": 59772380.0, + "step": 49720 + }, + { + "entropy": 1.9033274337649346, + "epoch": 0.1541586488721356, + "grad_norm": 9.880914688110352, + "learning_rate": 6.443370411566663e-06, + "loss": 0.5613, + "mean_token_accuracy": 0.8315735951066017, + "num_tokens": 59784089.0, + "step": 49730 + }, + { + "entropy": 1.888562636077404, + "epoch": 0.1541896479971853, + "grad_norm": 8.08756160736084, + "learning_rate": 6.442722660877747e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8301805481314659, + "num_tokens": 59796171.0, + "step": 49740 + }, + { + "entropy": 1.90590338408947, + "epoch": 0.154220647122235, + "grad_norm": 9.111113548278809, + "learning_rate": 6.44207510550429e-06, + "loss": 0.5416, + "mean_token_accuracy": 0.8257271036505699, + "num_tokens": 59807917.0, + "step": 49750 + }, + { + "entropy": 1.9074049085378646, + "epoch": 0.15425164624728466, + "grad_norm": 9.030037879943848, + "learning_rate": 6.441427745348153e-06, + "loss": 0.5366, + "mean_token_accuracy": 0.8337802454829216, + "num_tokens": 59818916.0, + "step": 49760 + }, + { + "entropy": 1.8746358096599578, + "epoch": 0.15428264537233435, + "grad_norm": 7.909918308258057, + "learning_rate": 6.440780580311269e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.8407538250088692, + "num_tokens": 59830862.0, + "step": 49770 + }, + { + "entropy": 1.875384160876274, + "epoch": 0.15431364449738405, + "grad_norm": 8.324300765991211, + "learning_rate": 6.4401336102956434e-06, + "loss": 0.5317, + "mean_token_accuracy": 0.8312161207199097, + "num_tokens": 59842486.0, + "step": 49780 + }, + { + "entropy": 1.9422027677297593, + "epoch": 0.15434464362243375, + "grad_norm": 9.501036643981934, + "learning_rate": 6.439486835203346e-06, + "loss": 0.5842, + "mean_token_accuracy": 0.8298379242420196, + "num_tokens": 59853228.0, + "step": 49790 + }, + { + "entropy": 1.854721449315548, + "epoch": 0.15437564274748344, + "grad_norm": 7.378803730010986, + "learning_rate": 6.438840254936516e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8368782594799995, + "num_tokens": 59864876.0, + "step": 49800 + }, + { + "entropy": 1.8604190409183503, + "epoch": 0.15440664187253314, + "grad_norm": 11.659805297851562, + "learning_rate": 6.438193869397364e-06, + "loss": 0.5423, + "mean_token_accuracy": 0.8247652351856232, + "num_tokens": 59877865.0, + "step": 49810 + }, + { + "entropy": 1.8469142317771912, + "epoch": 0.15443764099758284, + "grad_norm": 4.957024097442627, + "learning_rate": 6.437547678488166e-06, + "loss": 0.5404, + "mean_token_accuracy": 0.8259366884827614, + "num_tokens": 59890976.0, + "step": 49820 + }, + { + "entropy": 1.8666848599910737, + "epoch": 0.15446864012263253, + "grad_norm": 9.470239639282227, + "learning_rate": 6.436901682111268e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8429308265447617, + "num_tokens": 59903295.0, + "step": 49830 + }, + { + "entropy": 1.9163474783301353, + "epoch": 0.15449963924768223, + "grad_norm": 9.075465202331543, + "learning_rate": 6.436255880169087e-06, + "loss": 0.5747, + "mean_token_accuracy": 0.8243316933512688, + "num_tokens": 59914765.0, + "step": 49840 + }, + { + "entropy": 1.8828782141208649, + "epoch": 0.15453063837273193, + "grad_norm": 9.262798309326172, + "learning_rate": 6.4356102725641035e-06, + "loss": 0.5203, + "mean_token_accuracy": 0.837623517215252, + "num_tokens": 59925835.0, + "step": 49850 + }, + { + "entropy": 1.8749229982495308, + "epoch": 0.15456163749778162, + "grad_norm": 8.3054780960083, + "learning_rate": 6.434964859198871e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8357605487108231, + "num_tokens": 59937496.0, + "step": 49860 + }, + { + "entropy": 1.9846446454524993, + "epoch": 0.15459263662283132, + "grad_norm": 7.847235679626465, + "learning_rate": 6.434319639976007e-06, + "loss": 0.5728, + "mean_token_accuracy": 0.8296759322285652, + "num_tokens": 59948007.0, + "step": 49870 + }, + { + "entropy": 1.8958945728838443, + "epoch": 0.15462363574788102, + "grad_norm": 8.949000358581543, + "learning_rate": 6.433674614798204e-06, + "loss": 0.5476, + "mean_token_accuracy": 0.8374200582504272, + "num_tokens": 59960116.0, + "step": 49880 + }, + { + "entropy": 1.8910530745983123, + "epoch": 0.1546546348729307, + "grad_norm": 11.521330833435059, + "learning_rate": 6.433029783568216e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.8429587453603744, + "num_tokens": 59971792.0, + "step": 49890 + }, + { + "entropy": 1.9622877299785615, + "epoch": 0.1546856339979804, + "grad_norm": 8.818026542663574, + "learning_rate": 6.4323851461888694e-06, + "loss": 0.5706, + "mean_token_accuracy": 0.8334489464759827, + "num_tokens": 59982734.0, + "step": 49900 + }, + { + "entropy": 1.9317353338003158, + "epoch": 0.1547166331230301, + "grad_norm": 9.778739929199219, + "learning_rate": 6.431740702563056e-06, + "loss": 0.5676, + "mean_token_accuracy": 0.8346163481473923, + "num_tokens": 59993700.0, + "step": 49910 + }, + { + "entropy": 1.8633641496300697, + "epoch": 0.1547476322480798, + "grad_norm": 8.956582069396973, + "learning_rate": 6.431096452593738e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8405205994844437, + "num_tokens": 60005298.0, + "step": 49920 + }, + { + "entropy": 1.9589593350887298, + "epoch": 0.1547786313731295, + "grad_norm": 7.157927989959717, + "learning_rate": 6.4304523961839436e-06, + "loss": 0.5482, + "mean_token_accuracy": 0.8311053574085235, + "num_tokens": 60016287.0, + "step": 49930 + }, + { + "entropy": 1.9007370486855506, + "epoch": 0.1548096304981792, + "grad_norm": 11.051389694213867, + "learning_rate": 6.429808533236771e-06, + "loss": 0.536, + "mean_token_accuracy": 0.830623884499073, + "num_tokens": 60028218.0, + "step": 49940 + }, + { + "entropy": 1.8595467865467072, + "epoch": 0.1548406296232289, + "grad_norm": 10.759744644165039, + "learning_rate": 6.429164863655384e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.8303674578666687, + "num_tokens": 60040116.0, + "step": 49950 + }, + { + "entropy": 1.8307231336832046, + "epoch": 0.1548716287482786, + "grad_norm": 9.683849334716797, + "learning_rate": 6.428521387343016e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.847040268778801, + "num_tokens": 60053053.0, + "step": 49960 + }, + { + "entropy": 1.855046309530735, + "epoch": 0.15490262787332829, + "grad_norm": 7.891843318939209, + "learning_rate": 6.427878104202968e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.8327124953269959, + "num_tokens": 60065345.0, + "step": 49970 + }, + { + "entropy": 1.9063360676169396, + "epoch": 0.15493362699837798, + "grad_norm": 10.191269874572754, + "learning_rate": 6.4272350141386095e-06, + "loss": 0.5739, + "mean_token_accuracy": 0.8289344042539597, + "num_tokens": 60077605.0, + "step": 49980 + }, + { + "entropy": 1.8241621538996697, + "epoch": 0.15496462612342768, + "grad_norm": 10.132854461669922, + "learning_rate": 6.4265921170533755e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8447253108024597, + "num_tokens": 60089882.0, + "step": 49990 + }, + { + "entropy": 1.9241633802652358, + "epoch": 0.15499562524847738, + "grad_norm": 8.673702239990234, + "learning_rate": 6.425949412850768e-06, + "loss": 0.5634, + "mean_token_accuracy": 0.8289055705070496, + "num_tokens": 60100890.0, + "step": 50000 + }, + { + "entropy": 1.9505061358213425, + "epoch": 0.15502662437352704, + "grad_norm": 8.550101280212402, + "learning_rate": 6.4253069014343615e-06, + "loss": 0.5529, + "mean_token_accuracy": 0.8236529782414437, + "num_tokens": 60112909.0, + "step": 50010 + }, + { + "entropy": 1.9289822548627853, + "epoch": 0.15505762349857674, + "grad_norm": 8.745688438415527, + "learning_rate": 6.424664582707793e-06, + "loss": 0.5462, + "mean_token_accuracy": 0.8206963390111923, + "num_tokens": 60124202.0, + "step": 50020 + }, + { + "entropy": 1.7902291625738145, + "epoch": 0.15508862262362644, + "grad_norm": 3.6163487434387207, + "learning_rate": 6.424022456574768e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8540971964597702, + "num_tokens": 60137010.0, + "step": 50030 + }, + { + "entropy": 1.907183986902237, + "epoch": 0.15511962174867613, + "grad_norm": 9.091652870178223, + "learning_rate": 6.42338052293906e-06, + "loss": 0.5648, + "mean_token_accuracy": 0.8403370261192322, + "num_tokens": 60148040.0, + "step": 50040 + }, + { + "entropy": 1.8011170402169228, + "epoch": 0.15515062087372583, + "grad_norm": 4.003586769104004, + "learning_rate": 6.4227387817045115e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8478186964988709, + "num_tokens": 60160541.0, + "step": 50050 + }, + { + "entropy": 1.8726102083921432, + "epoch": 0.15518161999877553, + "grad_norm": 9.244813919067383, + "learning_rate": 6.42209723277503e-06, + "loss": 0.5469, + "mean_token_accuracy": 0.8411934614181519, + "num_tokens": 60171909.0, + "step": 50060 + }, + { + "entropy": 1.8079619467258454, + "epoch": 0.15521261912382522, + "grad_norm": 8.226319313049316, + "learning_rate": 6.421455876054589e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8530865862965584, + "num_tokens": 60184414.0, + "step": 50070 + }, + { + "entropy": 1.9148479044437408, + "epoch": 0.15524361824887492, + "grad_norm": 9.0443696975708, + "learning_rate": 6.420814711447232e-06, + "loss": 0.549, + "mean_token_accuracy": 0.8360567212104797, + "num_tokens": 60196202.0, + "step": 50080 + }, + { + "entropy": 1.754614818096161, + "epoch": 0.15527461737392462, + "grad_norm": 9.89665412902832, + "learning_rate": 6.42017373885707e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.849841496348381, + "num_tokens": 60209012.0, + "step": 50090 + }, + { + "entropy": 1.7963035687804223, + "epoch": 0.1553056164989743, + "grad_norm": 3.856391429901123, + "learning_rate": 6.419532958188275e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8433475911617279, + "num_tokens": 60221933.0, + "step": 50100 + }, + { + "entropy": 1.9359655752778053, + "epoch": 0.155336615624024, + "grad_norm": 8.836137771606445, + "learning_rate": 6.418892369345093e-06, + "loss": 0.5711, + "mean_token_accuracy": 0.823367503285408, + "num_tokens": 60232952.0, + "step": 50110 + }, + { + "entropy": 1.9027406126260757, + "epoch": 0.1553676147490737, + "grad_norm": 8.101412773132324, + "learning_rate": 6.418251972231836e-06, + "loss": 0.5538, + "mean_token_accuracy": 0.8346571624279022, + "num_tokens": 60244389.0, + "step": 50120 + }, + { + "entropy": 1.8543214410543443, + "epoch": 0.1553986138741234, + "grad_norm": 4.314391613006592, + "learning_rate": 6.417611766752878e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8420602694153786, + "num_tokens": 60256224.0, + "step": 50130 + }, + { + "entropy": 1.9107976794242858, + "epoch": 0.1554296129991731, + "grad_norm": 9.208541870117188, + "learning_rate": 6.416971752812663e-06, + "loss": 0.5544, + "mean_token_accuracy": 0.8198239028453826, + "num_tokens": 60267830.0, + "step": 50140 + }, + { + "entropy": 1.8322341233491897, + "epoch": 0.1554606121242228, + "grad_norm": 4.257328033447266, + "learning_rate": 6.416331930315704e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8367450326681137, + "num_tokens": 60281007.0, + "step": 50150 + }, + { + "entropy": 1.9688672095537185, + "epoch": 0.1554916112492725, + "grad_norm": 7.649997711181641, + "learning_rate": 6.415692299166574e-06, + "loss": 0.6013, + "mean_token_accuracy": 0.8196437805891037, + "num_tokens": 60291878.0, + "step": 50160 + }, + { + "entropy": 1.9667943209409713, + "epoch": 0.1555226103743222, + "grad_norm": 9.782792091369629, + "learning_rate": 6.41505285926992e-06, + "loss": 0.5785, + "mean_token_accuracy": 0.8302391171455383, + "num_tokens": 60302517.0, + "step": 50170 + }, + { + "entropy": 1.8588977128267288, + "epoch": 0.1555536094993719, + "grad_norm": 7.907834053039551, + "learning_rate": 6.41441361053045e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8453133404254913, + "num_tokens": 60314120.0, + "step": 50180 + }, + { + "entropy": 1.8701641455292701, + "epoch": 0.15558460862442158, + "grad_norm": 8.567540168762207, + "learning_rate": 6.413774552852943e-06, + "loss": 0.5398, + "mean_token_accuracy": 0.8220806911587715, + "num_tokens": 60326477.0, + "step": 50190 + }, + { + "entropy": 1.8901820540428163, + "epoch": 0.15561560774947128, + "grad_norm": 13.7083740234375, + "learning_rate": 6.41313568614224e-06, + "loss": 0.5659, + "mean_token_accuracy": 0.8234784409403801, + "num_tokens": 60338791.0, + "step": 50200 + }, + { + "entropy": 1.8873229175806046, + "epoch": 0.15564660687452098, + "grad_norm": 7.920546054840088, + "learning_rate": 6.4124970103032505e-06, + "loss": 0.5607, + "mean_token_accuracy": 0.8268739849328994, + "num_tokens": 60350606.0, + "step": 50210 + }, + { + "entropy": 1.8220305427908898, + "epoch": 0.15567760599957067, + "grad_norm": 9.152536392211914, + "learning_rate": 6.411858525240952e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8429689288139344, + "num_tokens": 60362867.0, + "step": 50220 + }, + { + "entropy": 1.9848755061626435, + "epoch": 0.15570860512462037, + "grad_norm": 8.138653755187988, + "learning_rate": 6.411220230860381e-06, + "loss": 0.5795, + "mean_token_accuracy": 0.8322038248181343, + "num_tokens": 60373606.0, + "step": 50230 + }, + { + "entropy": 1.9133560702204704, + "epoch": 0.15573960424967007, + "grad_norm": 8.208351135253906, + "learning_rate": 6.410582127066652e-06, + "loss": 0.5564, + "mean_token_accuracy": 0.8335413232445716, + "num_tokens": 60385315.0, + "step": 50240 + }, + { + "entropy": 1.8043673783540726, + "epoch": 0.15577060337471973, + "grad_norm": 8.482664108276367, + "learning_rate": 6.4099442137649356e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8454105794429779, + "num_tokens": 60398022.0, + "step": 50250 + }, + { + "entropy": 1.9254692614078521, + "epoch": 0.15580160249976943, + "grad_norm": 10.395545959472656, + "learning_rate": 6.409306490860473e-06, + "loss": 0.5454, + "mean_token_accuracy": 0.8282811924815178, + "num_tokens": 60409280.0, + "step": 50260 + }, + { + "entropy": 1.8486803263425826, + "epoch": 0.15583260162481913, + "grad_norm": 10.327423095703125, + "learning_rate": 6.408668958258571e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8356290921568871, + "num_tokens": 60421673.0, + "step": 50270 + }, + { + "entropy": 1.8446889415383338, + "epoch": 0.15586360074986882, + "grad_norm": 10.985127449035645, + "learning_rate": 6.408031615864598e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.8322935044765473, + "num_tokens": 60434775.0, + "step": 50280 + }, + { + "entropy": 1.943482118844986, + "epoch": 0.15589459987491852, + "grad_norm": 9.617402076721191, + "learning_rate": 6.407394463583996e-06, + "loss": 0.5226, + "mean_token_accuracy": 0.8407940044999123, + "num_tokens": 60445972.0, + "step": 50290 + }, + { + "entropy": 1.85861434340477, + "epoch": 0.15592559899996822, + "grad_norm": 9.958732604980469, + "learning_rate": 6.406757501322266e-06, + "loss": 0.5453, + "mean_token_accuracy": 0.8299056336283683, + "num_tokens": 60458006.0, + "step": 50300 + }, + { + "entropy": 1.887197096645832, + "epoch": 0.15595659812501791, + "grad_norm": 11.390666961669922, + "learning_rate": 6.406120728984979e-06, + "loss": 0.5547, + "mean_token_accuracy": 0.8242016166448594, + "num_tokens": 60470382.0, + "step": 50310 + }, + { + "entropy": 1.8762848794460296, + "epoch": 0.1559875972500676, + "grad_norm": 7.51845121383667, + "learning_rate": 6.4054841464777696e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.8328775450587272, + "num_tokens": 60482717.0, + "step": 50320 + }, + { + "entropy": 1.9675488024950027, + "epoch": 0.1560185963751173, + "grad_norm": 9.021512985229492, + "learning_rate": 6.404847753706339e-06, + "loss": 0.5867, + "mean_token_accuracy": 0.8280714631080628, + "num_tokens": 60493388.0, + "step": 50330 + }, + { + "entropy": 1.9901121139526368, + "epoch": 0.156049595500167, + "grad_norm": 8.092247009277344, + "learning_rate": 6.404211550576453e-06, + "loss": 0.5651, + "mean_token_accuracy": 0.8340759441256523, + "num_tokens": 60504429.0, + "step": 50340 + }, + { + "entropy": 1.8783787608146667, + "epoch": 0.1560805946252167, + "grad_norm": 9.310537338256836, + "learning_rate": 6.4035755369939425e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8348104074597359, + "num_tokens": 60516387.0, + "step": 50350 + }, + { + "entropy": 1.8168802306056022, + "epoch": 0.1561115937502664, + "grad_norm": 3.784146785736084, + "learning_rate": 6.4029397128647065e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8457161352038384, + "num_tokens": 60529393.0, + "step": 50360 + }, + { + "entropy": 1.7647080093622207, + "epoch": 0.1561425928753161, + "grad_norm": 9.681499481201172, + "learning_rate": 6.402304078094705e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8579833477735519, + "num_tokens": 60543184.0, + "step": 50370 + }, + { + "entropy": 1.8571357518434524, + "epoch": 0.1561735920003658, + "grad_norm": 8.170286178588867, + "learning_rate": 6.40166863258997e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8396312475204468, + "num_tokens": 60555841.0, + "step": 50380 + }, + { + "entropy": 1.9185105443000794, + "epoch": 0.1562045911254155, + "grad_norm": 7.637692451477051, + "learning_rate": 6.401033376256593e-06, + "loss": 0.5627, + "mean_token_accuracy": 0.8341576635837555, + "num_tokens": 60567373.0, + "step": 50390 + }, + { + "entropy": 1.951904332637787, + "epoch": 0.15623559025046518, + "grad_norm": 7.780064105987549, + "learning_rate": 6.40039830900073e-06, + "loss": 0.5698, + "mean_token_accuracy": 0.8260944783687592, + "num_tokens": 60578359.0, + "step": 50400 + }, + { + "entropy": 1.937084037065506, + "epoch": 0.15626658937551488, + "grad_norm": 8.136171340942383, + "learning_rate": 6.399763430728608e-06, + "loss": 0.5558, + "mean_token_accuracy": 0.8284446790814399, + "num_tokens": 60590095.0, + "step": 50410 + }, + { + "entropy": 1.8329094797372818, + "epoch": 0.15629758850056458, + "grad_norm": 7.345213890075684, + "learning_rate": 6.399128741346514e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8410397842526436, + "num_tokens": 60603043.0, + "step": 50420 + }, + { + "entropy": 1.8990901306271553, + "epoch": 0.15632858762561427, + "grad_norm": 8.570768356323242, + "learning_rate": 6.398494240760803e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8417127162218094, + "num_tokens": 60614516.0, + "step": 50430 + }, + { + "entropy": 1.8423827588558197, + "epoch": 0.15635958675066397, + "grad_norm": 4.247179985046387, + "learning_rate": 6.397859928877893e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8417179524898529, + "num_tokens": 60626871.0, + "step": 50440 + }, + { + "entropy": 2.0029925853013992, + "epoch": 0.15639058587571367, + "grad_norm": 8.819866180419922, + "learning_rate": 6.3972258056042655e-06, + "loss": 0.5682, + "mean_token_accuracy": 0.829233068227768, + "num_tokens": 60637756.0, + "step": 50450 + }, + { + "entropy": 1.9364984780550003, + "epoch": 0.15642158500076336, + "grad_norm": 9.162713050842285, + "learning_rate": 6.396591870846475e-06, + "loss": 0.5832, + "mean_token_accuracy": 0.8301840484142303, + "num_tokens": 60648877.0, + "step": 50460 + }, + { + "entropy": 1.9391376033425332, + "epoch": 0.15645258412581306, + "grad_norm": 8.27598762512207, + "learning_rate": 6.395958124511129e-06, + "loss": 0.5563, + "mean_token_accuracy": 0.8327331587672233, + "num_tokens": 60659919.0, + "step": 50470 + }, + { + "entropy": 1.9072021529078484, + "epoch": 0.15648358325086276, + "grad_norm": 9.113195419311523, + "learning_rate": 6.395324566504908e-06, + "loss": 0.5266, + "mean_token_accuracy": 0.831612104177475, + "num_tokens": 60671627.0, + "step": 50480 + }, + { + "entropy": 1.9049764469265937, + "epoch": 0.15651458237591245, + "grad_norm": 4.894548416137695, + "learning_rate": 6.394691196734555e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.8367709815502167, + "num_tokens": 60683963.0, + "step": 50490 + }, + { + "entropy": 1.8629326492547988, + "epoch": 0.15654558150096212, + "grad_norm": 5.698431968688965, + "learning_rate": 6.394058015106876e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8405030608177185, + "num_tokens": 60696888.0, + "step": 50500 + }, + { + "entropy": 1.9971718460321426, + "epoch": 0.15657658062601182, + "grad_norm": 7.774511337280273, + "learning_rate": 6.393425021528746e-06, + "loss": 0.5977, + "mean_token_accuracy": 0.8220291540026665, + "num_tokens": 60707257.0, + "step": 50510 + }, + { + "entropy": 1.9252537876367568, + "epoch": 0.15660757975106152, + "grad_norm": 4.02286434173584, + "learning_rate": 6.392792215907099e-06, + "loss": 0.51, + "mean_token_accuracy": 0.8373642593622208, + "num_tokens": 60719173.0, + "step": 50520 + }, + { + "entropy": 1.9036842539906502, + "epoch": 0.1566385788761112, + "grad_norm": 9.243278503417969, + "learning_rate": 6.392159598148937e-06, + "loss": 0.5501, + "mean_token_accuracy": 0.8288576260209084, + "num_tokens": 60730887.0, + "step": 50530 + }, + { + "entropy": 1.950339911878109, + "epoch": 0.1566695780011609, + "grad_norm": 9.252776145935059, + "learning_rate": 6.391527168161323e-06, + "loss": 0.6262, + "mean_token_accuracy": 0.8195764616131782, + "num_tokens": 60742662.0, + "step": 50540 + }, + { + "entropy": 1.9394678846001625, + "epoch": 0.1567005771262106, + "grad_norm": 9.990178108215332, + "learning_rate": 6.390894925851392e-06, + "loss": 0.5444, + "mean_token_accuracy": 0.8261475265026093, + "num_tokens": 60753958.0, + "step": 50550 + }, + { + "entropy": 1.8357860133051873, + "epoch": 0.1567315762512603, + "grad_norm": 6.805025100708008, + "learning_rate": 6.390262871126333e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8525337189435959, + "num_tokens": 60766091.0, + "step": 50560 + }, + { + "entropy": 1.8700495898723601, + "epoch": 0.15676257537631, + "grad_norm": 8.728248596191406, + "learning_rate": 6.3896310038934085e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.8417010590434074, + "num_tokens": 60777610.0, + "step": 50570 + }, + { + "entropy": 1.8651820093393325, + "epoch": 0.1567935745013597, + "grad_norm": 9.503561019897461, + "learning_rate": 6.388999324059937e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8336687937378884, + "num_tokens": 60789545.0, + "step": 50580 + }, + { + "entropy": 1.8762501239776612, + "epoch": 0.1568245736264094, + "grad_norm": 10.497973442077637, + "learning_rate": 6.3883678315333085e-06, + "loss": 0.5375, + "mean_token_accuracy": 0.8256103843450546, + "num_tokens": 60801947.0, + "step": 50590 + }, + { + "entropy": 1.8612007051706314, + "epoch": 0.1568555727514591, + "grad_norm": 8.15131950378418, + "learning_rate": 6.387736526220971e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8382210031151771, + "num_tokens": 60814283.0, + "step": 50600 + }, + { + "entropy": 1.7936832830309868, + "epoch": 0.15688657187650878, + "grad_norm": 8.772533416748047, + "learning_rate": 6.387105408030442e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8425097793340683, + "num_tokens": 60827792.0, + "step": 50610 + }, + { + "entropy": 1.969840306043625, + "epoch": 0.15691757100155848, + "grad_norm": 8.165382385253906, + "learning_rate": 6.386474476869298e-06, + "loss": 0.5996, + "mean_token_accuracy": 0.8277344390749931, + "num_tokens": 60838984.0, + "step": 50620 + }, + { + "entropy": 1.8312163800001144, + "epoch": 0.15694857012660818, + "grad_norm": 3.5525267124176025, + "learning_rate": 6.3858437326451805e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8387185871601105, + "num_tokens": 60852094.0, + "step": 50630 + }, + { + "entropy": 1.8953583613038063, + "epoch": 0.15697956925165787, + "grad_norm": 9.432021141052246, + "learning_rate": 6.3852131752658e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8444772154092789, + "num_tokens": 60863876.0, + "step": 50640 + }, + { + "entropy": 1.8836359083652496, + "epoch": 0.15701056837670757, + "grad_norm": 9.322160720825195, + "learning_rate": 6.384582804638923e-06, + "loss": 0.5454, + "mean_token_accuracy": 0.8311738058924675, + "num_tokens": 60876801.0, + "step": 50650 + }, + { + "entropy": 1.92313295006752, + "epoch": 0.15704156750175727, + "grad_norm": 8.350409507751465, + "learning_rate": 6.383952620672385e-06, + "loss": 0.5855, + "mean_token_accuracy": 0.8254999339580535, + "num_tokens": 60887997.0, + "step": 50660 + }, + { + "entropy": 1.9507147312164306, + "epoch": 0.15707256662680696, + "grad_norm": 8.378862380981445, + "learning_rate": 6.383322623274081e-06, + "loss": 0.5762, + "mean_token_accuracy": 0.8160288318991661, + "num_tokens": 60899916.0, + "step": 50670 + }, + { + "entropy": 1.7835942827165128, + "epoch": 0.15710356575185666, + "grad_norm": 7.95631217956543, + "learning_rate": 6.382692812351976e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8510005518794059, + "num_tokens": 60914587.0, + "step": 50680 + }, + { + "entropy": 1.8697744831442833, + "epoch": 0.15713456487690636, + "grad_norm": 4.098609924316406, + "learning_rate": 6.382063187814093e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8406439542770385, + "num_tokens": 60926953.0, + "step": 50690 + }, + { + "entropy": 1.8595498703420161, + "epoch": 0.15716556400195605, + "grad_norm": 8.888344764709473, + "learning_rate": 6.381433749568522e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8456152617931366, + "num_tokens": 60939063.0, + "step": 50700 + }, + { + "entropy": 1.8515797272324561, + "epoch": 0.15719656312700575, + "grad_norm": 6.374711513519287, + "learning_rate": 6.380804497523409e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.844392117857933, + "num_tokens": 60950485.0, + "step": 50710 + }, + { + "entropy": 1.8461545348167419, + "epoch": 0.15722756225205545, + "grad_norm": 9.635364532470703, + "learning_rate": 6.380175431586977e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8438180416822434, + "num_tokens": 60962994.0, + "step": 50720 + }, + { + "entropy": 1.9317549824714662, + "epoch": 0.15725856137710514, + "grad_norm": 9.940279960632324, + "learning_rate": 6.379546551667498e-06, + "loss": 0.5661, + "mean_token_accuracy": 0.8308566614985466, + "num_tokens": 60974445.0, + "step": 50730 + }, + { + "entropy": 1.919760164618492, + "epoch": 0.15728956050215484, + "grad_norm": 17.22686004638672, + "learning_rate": 6.3789178576733166e-06, + "loss": 0.5501, + "mean_token_accuracy": 0.8357348814606667, + "num_tokens": 60985676.0, + "step": 50740 + }, + { + "entropy": 1.893989272415638, + "epoch": 0.1573205596272045, + "grad_norm": 8.417506217956543, + "learning_rate": 6.378289349512838e-06, + "loss": 0.5289, + "mean_token_accuracy": 0.840349805355072, + "num_tokens": 60997353.0, + "step": 50750 + }, + { + "entropy": 1.9457985952496528, + "epoch": 0.1573515587522542, + "grad_norm": 9.5839204788208, + "learning_rate": 6.377661027094528e-06, + "loss": 0.5644, + "mean_token_accuracy": 0.8289625599980355, + "num_tokens": 61008540.0, + "step": 50760 + }, + { + "entropy": 1.8832007065415381, + "epoch": 0.1573825578773039, + "grad_norm": 8.148798942565918, + "learning_rate": 6.377032890326919e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8396278038620949, + "num_tokens": 61020653.0, + "step": 50770 + }, + { + "entropy": 1.8223789624869824, + "epoch": 0.1574135570023536, + "grad_norm": 8.812091827392578, + "learning_rate": 6.376404939118606e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8405194833874703, + "num_tokens": 61033783.0, + "step": 50780 + }, + { + "entropy": 1.908838202059269, + "epoch": 0.1574445561274033, + "grad_norm": 9.226709365844727, + "learning_rate": 6.3757771733782435e-06, + "loss": 0.5498, + "mean_token_accuracy": 0.8301179006695747, + "num_tokens": 61045896.0, + "step": 50790 + }, + { + "entropy": 1.8385404348373413, + "epoch": 0.157475555252453, + "grad_norm": 3.6262776851654053, + "learning_rate": 6.375149593014555e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.844711446762085, + "num_tokens": 61059216.0, + "step": 50800 + }, + { + "entropy": 1.8200566530227662, + "epoch": 0.1575065543775027, + "grad_norm": 4.00075626373291, + "learning_rate": 6.3745221979363226e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8521258249878884, + "num_tokens": 61072093.0, + "step": 50810 + }, + { + "entropy": 1.9493874102830886, + "epoch": 0.15753755350255239, + "grad_norm": 7.502165794372559, + "learning_rate": 6.373894988052391e-06, + "loss": 0.5919, + "mean_token_accuracy": 0.8284584790468216, + "num_tokens": 61083605.0, + "step": 50820 + }, + { + "entropy": 1.8854053810238838, + "epoch": 0.15756855262760208, + "grad_norm": 8.985443115234375, + "learning_rate": 6.373267963271668e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8452239811420441, + "num_tokens": 61095760.0, + "step": 50830 + }, + { + "entropy": 1.9016931489109994, + "epoch": 0.15759955175265178, + "grad_norm": 8.221643447875977, + "learning_rate": 6.372641123503127e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8430635541677475, + "num_tokens": 61108080.0, + "step": 50840 + }, + { + "entropy": 1.9211482793092727, + "epoch": 0.15763055087770148, + "grad_norm": 10.981581687927246, + "learning_rate": 6.372014468655801e-06, + "loss": 0.5592, + "mean_token_accuracy": 0.825599467754364, + "num_tokens": 61120063.0, + "step": 50850 + }, + { + "entropy": 1.948448945581913, + "epoch": 0.15766155000275117, + "grad_norm": 8.988844871520996, + "learning_rate": 6.371387998638789e-06, + "loss": 0.58, + "mean_token_accuracy": 0.8233562901616096, + "num_tokens": 61131416.0, + "step": 50860 + }, + { + "entropy": 1.923220631480217, + "epoch": 0.15769254912780087, + "grad_norm": 8.43247127532959, + "learning_rate": 6.3707617133612456e-06, + "loss": 0.5363, + "mean_token_accuracy": 0.831841279566288, + "num_tokens": 61143372.0, + "step": 50870 + }, + { + "entropy": 1.890692350268364, + "epoch": 0.15772354825285057, + "grad_norm": 4.421103477478027, + "learning_rate": 6.370135612732394e-06, + "loss": 0.5208, + "mean_token_accuracy": 0.8370485186576844, + "num_tokens": 61155644.0, + "step": 50880 + }, + { + "entropy": 1.9700417637825012, + "epoch": 0.15775454737790026, + "grad_norm": 4.649320602416992, + "learning_rate": 6.36950969666152e-06, + "loss": 0.5609, + "mean_token_accuracy": 0.828958623111248, + "num_tokens": 61166750.0, + "step": 50890 + }, + { + "entropy": 1.8618547976017, + "epoch": 0.15778554650294996, + "grad_norm": 7.937415599822998, + "learning_rate": 6.368883965057968e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.8421330273151397, + "num_tokens": 61179716.0, + "step": 50900 + }, + { + "entropy": 1.9229750469326974, + "epoch": 0.15781654562799965, + "grad_norm": 7.491119861602783, + "learning_rate": 6.368258417831149e-06, + "loss": 0.5214, + "mean_token_accuracy": 0.8261383548378944, + "num_tokens": 61191546.0, + "step": 50910 + }, + { + "entropy": 1.9325373709201812, + "epoch": 0.15784754475304935, + "grad_norm": 9.881427764892578, + "learning_rate": 6.367633054890532e-06, + "loss": 0.5417, + "mean_token_accuracy": 0.8323504284024239, + "num_tokens": 61203143.0, + "step": 50920 + }, + { + "entropy": 1.8541507571935654, + "epoch": 0.15787854387809905, + "grad_norm": 4.566098690032959, + "learning_rate": 6.367007876145651e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8295344039797783, + "num_tokens": 61216098.0, + "step": 50930 + }, + { + "entropy": 1.9541061252355576, + "epoch": 0.15790954300314874, + "grad_norm": 8.65689754486084, + "learning_rate": 6.3663828815061e-06, + "loss": 0.6162, + "mean_token_accuracy": 0.8173148021101951, + "num_tokens": 61227492.0, + "step": 50940 + }, + { + "entropy": 1.9283887058496476, + "epoch": 0.15794054212819844, + "grad_norm": 8.396080017089844, + "learning_rate": 6.36575807088154e-06, + "loss": 0.513, + "mean_token_accuracy": 0.8281655997037888, + "num_tokens": 61239330.0, + "step": 50950 + }, + { + "entropy": 1.8624481573700904, + "epoch": 0.15797154125324814, + "grad_norm": 9.15777587890625, + "learning_rate": 6.365133444181688e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8356254741549491, + "num_tokens": 61252054.0, + "step": 50960 + }, + { + "entropy": 1.9177945956587792, + "epoch": 0.15800254037829783, + "grad_norm": 9.632381439208984, + "learning_rate": 6.364509001316326e-06, + "loss": 0.5361, + "mean_token_accuracy": 0.8355124577879905, + "num_tokens": 61263663.0, + "step": 50970 + }, + { + "entropy": 1.8173129588365555, + "epoch": 0.15803353950334753, + "grad_norm": 9.271222114562988, + "learning_rate": 6.363884742195296e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8449875220656395, + "num_tokens": 61277606.0, + "step": 50980 + }, + { + "entropy": 1.97777401804924, + "epoch": 0.1580645386283972, + "grad_norm": 11.567782402038574, + "learning_rate": 6.363260666728507e-06, + "loss": 0.5419, + "mean_token_accuracy": 0.8399723425507546, + "num_tokens": 61287908.0, + "step": 50990 + }, + { + "entropy": 1.8491854876279832, + "epoch": 0.1580955377534469, + "grad_norm": 3.4037585258483887, + "learning_rate": 6.362636774825923e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8377619609236717, + "num_tokens": 61300804.0, + "step": 51000 + }, + { + "entropy": 1.9141796737909318, + "epoch": 0.1581265368784966, + "grad_norm": 4.402615547180176, + "learning_rate": 6.362013066397575e-06, + "loss": 0.5334, + "mean_token_accuracy": 0.8279877364635467, + "num_tokens": 61312498.0, + "step": 51010 + }, + { + "entropy": 1.9528733968734742, + "epoch": 0.1581575360035463, + "grad_norm": 8.701175689697266, + "learning_rate": 6.361389541353552e-06, + "loss": 0.5713, + "mean_token_accuracy": 0.836171668767929, + "num_tokens": 61323367.0, + "step": 51020 + }, + { + "entropy": 1.8788547486066818, + "epoch": 0.158188535128596, + "grad_norm": 9.006681442260742, + "learning_rate": 6.360766199604007e-06, + "loss": 0.549, + "mean_token_accuracy": 0.8283375024795532, + "num_tokens": 61336492.0, + "step": 51030 + }, + { + "entropy": 1.856010665744543, + "epoch": 0.15821953425364568, + "grad_norm": 4.132041931152344, + "learning_rate": 6.360143041059156e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8421657636761666, + "num_tokens": 61349803.0, + "step": 51040 + }, + { + "entropy": 1.8853138819336892, + "epoch": 0.15825053337869538, + "grad_norm": 4.42047643661499, + "learning_rate": 6.359520065629272e-06, + "loss": 0.5469, + "mean_token_accuracy": 0.8277313068509102, + "num_tokens": 61361430.0, + "step": 51050 + }, + { + "entropy": 1.8170242205262184, + "epoch": 0.15828153250374508, + "grad_norm": 3.389427423477173, + "learning_rate": 6.358897273224693e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8529497593641281, + "num_tokens": 61374400.0, + "step": 51060 + }, + { + "entropy": 1.8062128841876983, + "epoch": 0.15831253162879477, + "grad_norm": 3.821852207183838, + "learning_rate": 6.358274663755817e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8493776023387909, + "num_tokens": 61388410.0, + "step": 51070 + }, + { + "entropy": 1.904846543073654, + "epoch": 0.15834353075384447, + "grad_norm": 11.317872047424316, + "learning_rate": 6.357652237133105e-06, + "loss": 0.5412, + "mean_token_accuracy": 0.830742597579956, + "num_tokens": 61399765.0, + "step": 51080 + }, + { + "entropy": 1.9227543294429779, + "epoch": 0.15837452987889417, + "grad_norm": 9.054058074951172, + "learning_rate": 6.357029993267079e-06, + "loss": 0.594, + "mean_token_accuracy": 0.8182252869009972, + "num_tokens": 61411205.0, + "step": 51090 + }, + { + "entropy": 1.7773767858743668, + "epoch": 0.15840552900394386, + "grad_norm": 3.7279295921325684, + "learning_rate": 6.356407932068319e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8401523649692535, + "num_tokens": 61425116.0, + "step": 51100 + }, + { + "entropy": 1.932927194237709, + "epoch": 0.15843652812899356, + "grad_norm": 8.091573715209961, + "learning_rate": 6.35578605344747e-06, + "loss": 0.5335, + "mean_token_accuracy": 0.8386119574308395, + "num_tokens": 61436160.0, + "step": 51110 + }, + { + "entropy": 1.9222755640745164, + "epoch": 0.15846752725404326, + "grad_norm": 9.271939277648926, + "learning_rate": 6.355164357315238e-06, + "loss": 0.5488, + "mean_token_accuracy": 0.8323445156216621, + "num_tokens": 61447404.0, + "step": 51120 + }, + { + "entropy": 1.8717841893434524, + "epoch": 0.15849852637909295, + "grad_norm": 4.64661979675293, + "learning_rate": 6.354542843582387e-06, + "loss": 0.5269, + "mean_token_accuracy": 0.8282085090875626, + "num_tokens": 61459307.0, + "step": 51130 + }, + { + "entropy": 1.7520866304636002, + "epoch": 0.15852952550414265, + "grad_norm": 8.69471263885498, + "learning_rate": 6.353921512159747e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8573095768690109, + "num_tokens": 61473555.0, + "step": 51140 + }, + { + "entropy": 1.933935022354126, + "epoch": 0.15856052462919235, + "grad_norm": 9.661917686462402, + "learning_rate": 6.353300362958204e-06, + "loss": 0.5802, + "mean_token_accuracy": 0.8254079014062882, + "num_tokens": 61485440.0, + "step": 51150 + }, + { + "entropy": 1.8875377878546715, + "epoch": 0.15859152375424204, + "grad_norm": 9.894033432006836, + "learning_rate": 6.352679395888709e-06, + "loss": 0.5426, + "mean_token_accuracy": 0.8381941422820092, + "num_tokens": 61497948.0, + "step": 51160 + }, + { + "entropy": 1.954133327305317, + "epoch": 0.15862252287929174, + "grad_norm": 13.155253410339355, + "learning_rate": 6.3520586108622695e-06, + "loss": 0.5527, + "mean_token_accuracy": 0.8287410020828248, + "num_tokens": 61509102.0, + "step": 51170 + }, + { + "entropy": 1.9011639848351478, + "epoch": 0.15865352200434144, + "grad_norm": 9.405858039855957, + "learning_rate": 6.351438007789959e-06, + "loss": 0.506, + "mean_token_accuracy": 0.8395056575536728, + "num_tokens": 61521308.0, + "step": 51180 + }, + { + "entropy": 1.9040748074650764, + "epoch": 0.15868452112939113, + "grad_norm": 8.241883277893066, + "learning_rate": 6.350817586582909e-06, + "loss": 0.5341, + "mean_token_accuracy": 0.8309203192591668, + "num_tokens": 61533828.0, + "step": 51190 + }, + { + "entropy": 1.9427318632602693, + "epoch": 0.15871552025444083, + "grad_norm": 8.195354461669922, + "learning_rate": 6.35019734715231e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.8373890981078148, + "num_tokens": 61545278.0, + "step": 51200 + }, + { + "entropy": 1.904130506515503, + "epoch": 0.15874651937949052, + "grad_norm": 7.979343891143799, + "learning_rate": 6.349577289409418e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.8309107288718224, + "num_tokens": 61557125.0, + "step": 51210 + }, + { + "entropy": 1.9387200504541398, + "epoch": 0.15877751850454022, + "grad_norm": 8.9723539352417, + "learning_rate": 6.348957413265544e-06, + "loss": 0.5938, + "mean_token_accuracy": 0.8150834292173386, + "num_tokens": 61569250.0, + "step": 51220 + }, + { + "entropy": 1.8315011352300643, + "epoch": 0.15880851762958992, + "grad_norm": 4.4107136726379395, + "learning_rate": 6.348337718632065e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8472426652908325, + "num_tokens": 61582476.0, + "step": 51230 + }, + { + "entropy": 1.8459465608000756, + "epoch": 0.1588395167546396, + "grad_norm": 8.931220054626465, + "learning_rate": 6.347718205420413e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8469175025820732, + "num_tokens": 61595011.0, + "step": 51240 + }, + { + "entropy": 1.9565140813589097, + "epoch": 0.15887051587968928, + "grad_norm": 8.753551483154297, + "learning_rate": 6.347098873542088e-06, + "loss": 0.5644, + "mean_token_accuracy": 0.8283399358391762, + "num_tokens": 61606389.0, + "step": 51250 + }, + { + "entropy": 1.9582069575786591, + "epoch": 0.15890151500473898, + "grad_norm": 7.78233003616333, + "learning_rate": 6.346479722908642e-06, + "loss": 0.5422, + "mean_token_accuracy": 0.8336837723851204, + "num_tokens": 61617999.0, + "step": 51260 + }, + { + "entropy": 1.9196703389286995, + "epoch": 0.15893251412978868, + "grad_norm": 8.762555122375488, + "learning_rate": 6.345860753431693e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.8311418369412422, + "num_tokens": 61630045.0, + "step": 51270 + }, + { + "entropy": 1.9337300062179565, + "epoch": 0.15896351325483837, + "grad_norm": 9.353802680969238, + "learning_rate": 6.345241965022917e-06, + "loss": 0.5269, + "mean_token_accuracy": 0.8350248277187348, + "num_tokens": 61642241.0, + "step": 51280 + }, + { + "entropy": 1.903528119623661, + "epoch": 0.15899451237988807, + "grad_norm": 8.561066627502441, + "learning_rate": 6.344623357594051e-06, + "loss": 0.5307, + "mean_token_accuracy": 0.8348277062177658, + "num_tokens": 61654226.0, + "step": 51290 + }, + { + "entropy": 1.9056920766830445, + "epoch": 0.15902551150493777, + "grad_norm": 4.238293170928955, + "learning_rate": 6.344004931056894e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8365932151675224, + "num_tokens": 61666601.0, + "step": 51300 + }, + { + "entropy": 1.900420580804348, + "epoch": 0.15905651062998746, + "grad_norm": 8.474109649658203, + "learning_rate": 6.343386685323301e-06, + "loss": 0.5645, + "mean_token_accuracy": 0.8356752887368202, + "num_tokens": 61678955.0, + "step": 51310 + }, + { + "entropy": 1.7979067623615266, + "epoch": 0.15908750975503716, + "grad_norm": 7.289546012878418, + "learning_rate": 6.34276862030519e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8524860993027688, + "num_tokens": 61692933.0, + "step": 51320 + }, + { + "entropy": 1.9506612807512282, + "epoch": 0.15911850888008686, + "grad_norm": 7.817605018615723, + "learning_rate": 6.342150735914539e-06, + "loss": 0.5449, + "mean_token_accuracy": 0.8349381595849991, + "num_tokens": 61703549.0, + "step": 51330 + }, + { + "entropy": 1.9035085812211037, + "epoch": 0.15914950800513655, + "grad_norm": 8.978363037109375, + "learning_rate": 6.341533032063384e-06, + "loss": 0.5406, + "mean_token_accuracy": 0.8332462772727013, + "num_tokens": 61715518.0, + "step": 51340 + }, + { + "entropy": 1.9238849043846131, + "epoch": 0.15918050713018625, + "grad_norm": 10.23372745513916, + "learning_rate": 6.3409155086638244e-06, + "loss": 0.5827, + "mean_token_accuracy": 0.8245359167456627, + "num_tokens": 61726468.0, + "step": 51350 + }, + { + "entropy": 1.912158764898777, + "epoch": 0.15921150625523595, + "grad_norm": 8.236334800720215, + "learning_rate": 6.3402981656280174e-06, + "loss": 0.5213, + "mean_token_accuracy": 0.8354252070188523, + "num_tokens": 61738604.0, + "step": 51360 + }, + { + "entropy": 1.9760198339819908, + "epoch": 0.15924250538028564, + "grad_norm": 9.30893611907959, + "learning_rate": 6.339681002868179e-06, + "loss": 0.561, + "mean_token_accuracy": 0.8204899609088898, + "num_tokens": 61749706.0, + "step": 51370 + }, + { + "entropy": 1.9051903694868089, + "epoch": 0.15927350450533534, + "grad_norm": 9.52764892578125, + "learning_rate": 6.3390640202965856e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.8380590423941612, + "num_tokens": 61761965.0, + "step": 51380 + }, + { + "entropy": 1.9080309465527534, + "epoch": 0.15930450363038504, + "grad_norm": 3.284393072128296, + "learning_rate": 6.338447217825577e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.8422815844416618, + "num_tokens": 61773196.0, + "step": 51390 + }, + { + "entropy": 1.848653295636177, + "epoch": 0.15933550275543473, + "grad_norm": 9.541685104370117, + "learning_rate": 6.337830595367548e-06, + "loss": 0.52, + "mean_token_accuracy": 0.835855670273304, + "num_tokens": 61785726.0, + "step": 51400 + }, + { + "entropy": 1.8705054201185702, + "epoch": 0.15936650188048443, + "grad_norm": 8.726213455200195, + "learning_rate": 6.337214152834954e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.838579186797142, + "num_tokens": 61798381.0, + "step": 51410 + }, + { + "entropy": 1.9128465965390204, + "epoch": 0.15939750100553413, + "grad_norm": 4.461379528045654, + "learning_rate": 6.336597890140311e-06, + "loss": 0.5283, + "mean_token_accuracy": 0.8308456495404244, + "num_tokens": 61809810.0, + "step": 51420 + }, + { + "entropy": 1.8830090552568435, + "epoch": 0.15942850013058382, + "grad_norm": 7.6069135665893555, + "learning_rate": 6.335981807196195e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8455763354897499, + "num_tokens": 61821564.0, + "step": 51430 + }, + { + "entropy": 1.879130232334137, + "epoch": 0.15945949925563352, + "grad_norm": 8.862757682800293, + "learning_rate": 6.335365903915241e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.841680321097374, + "num_tokens": 61833721.0, + "step": 51440 + }, + { + "entropy": 1.8420701980590821, + "epoch": 0.15949049838068322, + "grad_norm": 9.544261932373047, + "learning_rate": 6.334750180210142e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8434771358966827, + "num_tokens": 61845911.0, + "step": 51450 + }, + { + "entropy": 1.8399369686841964, + "epoch": 0.1595214975057329, + "grad_norm": 8.646150588989258, + "learning_rate": 6.334134635993651e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8362537592649459, + "num_tokens": 61858452.0, + "step": 51460 + }, + { + "entropy": 1.8282588481903077, + "epoch": 0.1595524966307826, + "grad_norm": 8.844307899475098, + "learning_rate": 6.333519271178583e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8485338285565376, + "num_tokens": 61871207.0, + "step": 51470 + }, + { + "entropy": 1.9945002496242523, + "epoch": 0.1595834957558323, + "grad_norm": 9.600845336914062, + "learning_rate": 6.332904085677809e-06, + "loss": 0.6231, + "mean_token_accuracy": 0.8192642524838447, + "num_tokens": 61881849.0, + "step": 51480 + }, + { + "entropy": 1.866794577240944, + "epoch": 0.15961449488088197, + "grad_norm": 8.577037811279297, + "learning_rate": 6.33228907940426e-06, + "loss": 0.5177, + "mean_token_accuracy": 0.8294419586658478, + "num_tokens": 61894411.0, + "step": 51490 + }, + { + "entropy": 1.9821971401572227, + "epoch": 0.15964549400593167, + "grad_norm": 7.966568470001221, + "learning_rate": 6.3316742522709295e-06, + "loss": 0.5735, + "mean_token_accuracy": 0.8234156042337417, + "num_tokens": 61905974.0, + "step": 51500 + }, + { + "entropy": 1.928943158686161, + "epoch": 0.15967649313098137, + "grad_norm": 8.466790199279785, + "learning_rate": 6.331059604190863e-06, + "loss": 0.5206, + "mean_token_accuracy": 0.83587566614151, + "num_tokens": 61918230.0, + "step": 51510 + }, + { + "entropy": 1.8466455608606338, + "epoch": 0.15970749225603106, + "grad_norm": 8.579726219177246, + "learning_rate": 6.330445135077171e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8351440250873565, + "num_tokens": 61930533.0, + "step": 51520 + }, + { + "entropy": 1.9407551258802413, + "epoch": 0.15973849138108076, + "grad_norm": 9.855902671813965, + "learning_rate": 6.329830844843021e-06, + "loss": 0.5384, + "mean_token_accuracy": 0.8276364892721176, + "num_tokens": 61941838.0, + "step": 51530 + }, + { + "entropy": 1.8404507592320443, + "epoch": 0.15976949050613046, + "grad_norm": 7.8672261238098145, + "learning_rate": 6.329216733401641e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8361625537276268, + "num_tokens": 61954790.0, + "step": 51540 + }, + { + "entropy": 1.8961133405566215, + "epoch": 0.15980048963118015, + "grad_norm": 9.657519340515137, + "learning_rate": 6.328602800666316e-06, + "loss": 0.538, + "mean_token_accuracy": 0.8254346460103988, + "num_tokens": 61967113.0, + "step": 51550 + }, + { + "entropy": 1.887896779179573, + "epoch": 0.15983148875622985, + "grad_norm": 7.927542209625244, + "learning_rate": 6.32798904655039e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8375117152929306, + "num_tokens": 61979299.0, + "step": 51560 + }, + { + "entropy": 1.8948641210794448, + "epoch": 0.15986248788127955, + "grad_norm": 7.149417400360107, + "learning_rate": 6.327375470967267e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.8403451159596443, + "num_tokens": 61990608.0, + "step": 51570 + }, + { + "entropy": 1.8972643151879311, + "epoch": 0.15989348700632924, + "grad_norm": 10.748580932617188, + "learning_rate": 6.326762073830408e-06, + "loss": 0.5441, + "mean_token_accuracy": 0.8346004873514176, + "num_tokens": 62002236.0, + "step": 51580 + }, + { + "entropy": 1.9104709938168525, + "epoch": 0.15992448613137894, + "grad_norm": 9.303431510925293, + "learning_rate": 6.326148855053335e-06, + "loss": 0.542, + "mean_token_accuracy": 0.8291504085063934, + "num_tokens": 62013316.0, + "step": 51590 + }, + { + "entropy": 1.9200339168310165, + "epoch": 0.15995548525642864, + "grad_norm": 8.650131225585938, + "learning_rate": 6.325535814549628e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.8344766482710838, + "num_tokens": 62024979.0, + "step": 51600 + }, + { + "entropy": 1.9062774524092674, + "epoch": 0.15998648438147833, + "grad_norm": 9.20338249206543, + "learning_rate": 6.324922952232924e-06, + "loss": 0.5443, + "mean_token_accuracy": 0.8257553368806839, + "num_tokens": 62037183.0, + "step": 51610 + }, + { + "entropy": 1.8092211425304412, + "epoch": 0.16001748350652803, + "grad_norm": 8.952025413513184, + "learning_rate": 6.32431026801692e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8433082342147827, + "num_tokens": 62050479.0, + "step": 51620 + }, + { + "entropy": 1.9418270736932755, + "epoch": 0.16004848263157773, + "grad_norm": 9.070456504821777, + "learning_rate": 6.3236977618153725e-06, + "loss": 0.5269, + "mean_token_accuracy": 0.8335362508893013, + "num_tokens": 62062292.0, + "step": 51630 + }, + { + "entropy": 1.8717383340001106, + "epoch": 0.16007948175662742, + "grad_norm": 4.403903961181641, + "learning_rate": 6.323085433542092e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8389642179012299, + "num_tokens": 62074628.0, + "step": 51640 + }, + { + "entropy": 1.9106849119067193, + "epoch": 0.16011048088167712, + "grad_norm": 9.37865924835205, + "learning_rate": 6.3224732831109535e-06, + "loss": 0.5259, + "mean_token_accuracy": 0.8324133858084679, + "num_tokens": 62086818.0, + "step": 51650 + }, + { + "entropy": 1.885637989640236, + "epoch": 0.16014148000672682, + "grad_norm": 4.711241722106934, + "learning_rate": 6.321861310435887e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8371865153312683, + "num_tokens": 62098896.0, + "step": 51660 + }, + { + "entropy": 1.9405640929937362, + "epoch": 0.1601724791317765, + "grad_norm": 8.641385078430176, + "learning_rate": 6.3212495154308804e-06, + "loss": 0.5958, + "mean_token_accuracy": 0.8278973504900933, + "num_tokens": 62109979.0, + "step": 51670 + }, + { + "entropy": 1.934703852236271, + "epoch": 0.1602034782568262, + "grad_norm": 9.276317596435547, + "learning_rate": 6.32063789800998e-06, + "loss": 0.5185, + "mean_token_accuracy": 0.8384066224098206, + "num_tokens": 62121358.0, + "step": 51680 + }, + { + "entropy": 1.885528053343296, + "epoch": 0.1602344773818759, + "grad_norm": 8.702717781066895, + "learning_rate": 6.320026458087292e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.8399693235754967, + "num_tokens": 62132820.0, + "step": 51690 + }, + { + "entropy": 1.955546210706234, + "epoch": 0.1602654765069256, + "grad_norm": 9.13160228729248, + "learning_rate": 6.319415195576981e-06, + "loss": 0.5602, + "mean_token_accuracy": 0.8254757478833199, + "num_tokens": 62143735.0, + "step": 51700 + }, + { + "entropy": 1.9146698236465454, + "epoch": 0.1602964756319753, + "grad_norm": 8.36129379272461, + "learning_rate": 6.318804110393267e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8527568742632866, + "num_tokens": 62155133.0, + "step": 51710 + }, + { + "entropy": 1.859630098938942, + "epoch": 0.160327474757025, + "grad_norm": 4.852166175842285, + "learning_rate": 6.318193202450428e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8412252008914948, + "num_tokens": 62167946.0, + "step": 51720 + }, + { + "entropy": 1.9424054473638535, + "epoch": 0.16035847388207466, + "grad_norm": 10.02141284942627, + "learning_rate": 6.317582471662803e-06, + "loss": 0.5314, + "mean_token_accuracy": 0.839588788151741, + "num_tokens": 62179109.0, + "step": 51730 + }, + { + "entropy": 1.9220136627554893, + "epoch": 0.16038947300712436, + "grad_norm": 12.059248924255371, + "learning_rate": 6.3169719179447885e-06, + "loss": 0.5252, + "mean_token_accuracy": 0.8367831781506538, + "num_tokens": 62190468.0, + "step": 51740 + }, + { + "entropy": 1.85854529440403, + "epoch": 0.16042047213217406, + "grad_norm": 9.116106986999512, + "learning_rate": 6.316361541210837e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8309743300080299, + "num_tokens": 62202630.0, + "step": 51750 + }, + { + "entropy": 1.9260679721832275, + "epoch": 0.16045147125722375, + "grad_norm": 9.218046188354492, + "learning_rate": 6.315751341375458e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.8390769004821778, + "num_tokens": 62214408.0, + "step": 51760 + }, + { + "entropy": 1.9168349742889403, + "epoch": 0.16048247038227345, + "grad_norm": 8.362269401550293, + "learning_rate": 6.315141318353224e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8344539046287537, + "num_tokens": 62226824.0, + "step": 51770 + }, + { + "entropy": 1.847986987233162, + "epoch": 0.16051346950732315, + "grad_norm": 8.942157745361328, + "learning_rate": 6.314531472058758e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.8420057892799377, + "num_tokens": 62239194.0, + "step": 51780 + }, + { + "entropy": 1.890682005882263, + "epoch": 0.16054446863237284, + "grad_norm": 8.703591346740723, + "learning_rate": 6.313921802406747e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8423530831933022, + "num_tokens": 62251150.0, + "step": 51790 + }, + { + "entropy": 1.8866778627038001, + "epoch": 0.16057546775742254, + "grad_norm": 9.444439888000488, + "learning_rate": 6.313312309311932e-06, + "loss": 0.5286, + "mean_token_accuracy": 0.8364355772733688, + "num_tokens": 62262711.0, + "step": 51800 + }, + { + "entropy": 1.8455047011375427, + "epoch": 0.16060646688247224, + "grad_norm": 8.43193244934082, + "learning_rate": 6.312702992689113e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8457055032253266, + "num_tokens": 62274824.0, + "step": 51810 + }, + { + "entropy": 1.850141017138958, + "epoch": 0.16063746600752193, + "grad_norm": 10.177520751953125, + "learning_rate": 6.312093852453148e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.832466046512127, + "num_tokens": 62286953.0, + "step": 51820 + }, + { + "entropy": 1.8169025957584382, + "epoch": 0.16066846513257163, + "grad_norm": 2.502871036529541, + "learning_rate": 6.31148488851895e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8574206903576851, + "num_tokens": 62300292.0, + "step": 51830 + }, + { + "entropy": 1.8187232732772827, + "epoch": 0.16069946425762133, + "grad_norm": 7.720560550689697, + "learning_rate": 6.3108761008014915e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8429305925965309, + "num_tokens": 62312236.0, + "step": 51840 + }, + { + "entropy": 1.879136349260807, + "epoch": 0.16073046338267102, + "grad_norm": 7.850799083709717, + "learning_rate": 6.310267489215804e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8317036911845207, + "num_tokens": 62324178.0, + "step": 51850 + }, + { + "entropy": 1.7745792135596274, + "epoch": 0.16076146250772072, + "grad_norm": 4.761654853820801, + "learning_rate": 6.309659053676972e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8426973283290863, + "num_tokens": 62338322.0, + "step": 51860 + }, + { + "entropy": 1.8939239963889123, + "epoch": 0.16079246163277042, + "grad_norm": 3.76540470123291, + "learning_rate": 6.309050794100141e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.8374624997377396, + "num_tokens": 62349756.0, + "step": 51870 + }, + { + "entropy": 1.9361850887537002, + "epoch": 0.1608234607578201, + "grad_norm": 7.414719581604004, + "learning_rate": 6.308442710400513e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8424745500087738, + "num_tokens": 62360166.0, + "step": 51880 + }, + { + "entropy": 1.816766545176506, + "epoch": 0.1608544598828698, + "grad_norm": 9.440898895263672, + "learning_rate": 6.3078348024933465e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8390632942318916, + "num_tokens": 62373447.0, + "step": 51890 + }, + { + "entropy": 1.9127420842647553, + "epoch": 0.1608854590079195, + "grad_norm": 8.341848373413086, + "learning_rate": 6.307227070293956e-06, + "loss": 0.5394, + "mean_token_accuracy": 0.8356220990419387, + "num_tokens": 62384460.0, + "step": 51900 + }, + { + "entropy": 1.8038682281970977, + "epoch": 0.1609164581329692, + "grad_norm": 3.9414637088775635, + "learning_rate": 6.3066195137177146e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.841181357204914, + "num_tokens": 62397300.0, + "step": 51910 + }, + { + "entropy": 1.9235049843788148, + "epoch": 0.1609474572580189, + "grad_norm": 4.492739677429199, + "learning_rate": 6.306012132680054e-06, + "loss": 0.5551, + "mean_token_accuracy": 0.8201109856367111, + "num_tokens": 62409356.0, + "step": 51920 + }, + { + "entropy": 1.8290775090456008, + "epoch": 0.1609784563830686, + "grad_norm": 3.958709955215454, + "learning_rate": 6.3054049270964605e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.83324736058712, + "num_tokens": 62423295.0, + "step": 51930 + }, + { + "entropy": 1.933160249888897, + "epoch": 0.1610094555081183, + "grad_norm": 7.670217037200928, + "learning_rate": 6.304797896882477e-06, + "loss": 0.5262, + "mean_token_accuracy": 0.8253769144415856, + "num_tokens": 62434328.0, + "step": 51940 + }, + { + "entropy": 1.8668658897280692, + "epoch": 0.161040454633168, + "grad_norm": 4.041083812713623, + "learning_rate": 6.3041910419537055e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8454795464873314, + "num_tokens": 62446093.0, + "step": 51950 + }, + { + "entropy": 1.9969589874148368, + "epoch": 0.1610714537582177, + "grad_norm": 10.19937801361084, + "learning_rate": 6.3035843622258045e-06, + "loss": 0.5644, + "mean_token_accuracy": 0.8266312971711158, + "num_tokens": 62457402.0, + "step": 51960 + }, + { + "entropy": 1.911833395063877, + "epoch": 0.16110245288326738, + "grad_norm": 9.627182006835938, + "learning_rate": 6.302977857614485e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.8356106966733933, + "num_tokens": 62470041.0, + "step": 51970 + }, + { + "entropy": 1.9067854672670363, + "epoch": 0.16113345200831705, + "grad_norm": 3.0671586990356445, + "learning_rate": 6.302371528035522e-06, + "loss": 0.52, + "mean_token_accuracy": 0.8428362473845482, + "num_tokens": 62482431.0, + "step": 51980 + }, + { + "entropy": 1.9820441797375679, + "epoch": 0.16116445113336675, + "grad_norm": 11.670134544372559, + "learning_rate": 6.301765373404741e-06, + "loss": 0.5354, + "mean_token_accuracy": 0.8310918644070625, + "num_tokens": 62494113.0, + "step": 51990 + }, + { + "entropy": 1.9489408016204834, + "epoch": 0.16119545025841645, + "grad_norm": 9.345502853393555, + "learning_rate": 6.301159393638029e-06, + "loss": 0.5212, + "mean_token_accuracy": 0.8365255728363991, + "num_tokens": 62505895.0, + "step": 52000 + }, + { + "entropy": 1.9901140004396438, + "epoch": 0.16122644938346614, + "grad_norm": 8.158758163452148, + "learning_rate": 6.300553588651326e-06, + "loss": 0.5582, + "mean_token_accuracy": 0.8245610594749451, + "num_tokens": 62516556.0, + "step": 52010 + }, + { + "entropy": 1.8812553852796554, + "epoch": 0.16125744850851584, + "grad_norm": 4.510929584503174, + "learning_rate": 6.2999479583606295e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8425406232476235, + "num_tokens": 62529781.0, + "step": 52020 + }, + { + "entropy": 1.9245860621333122, + "epoch": 0.16128844763356553, + "grad_norm": 5.81223726272583, + "learning_rate": 6.299342502681993e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8385741889476777, + "num_tokens": 62541343.0, + "step": 52030 + }, + { + "entropy": 1.8486530616879464, + "epoch": 0.16131944675861523, + "grad_norm": 9.269059181213379, + "learning_rate": 6.298737221531529e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8453450709581375, + "num_tokens": 62554053.0, + "step": 52040 + }, + { + "entropy": 1.9573129430413245, + "epoch": 0.16135044588366493, + "grad_norm": 4.898494720458984, + "learning_rate": 6.298132114825405e-06, + "loss": 0.5759, + "mean_token_accuracy": 0.8290790885686874, + "num_tokens": 62565444.0, + "step": 52050 + }, + { + "entropy": 1.9822146385908126, + "epoch": 0.16138144500871462, + "grad_norm": 9.721142768859863, + "learning_rate": 6.2975271824798425e-06, + "loss": 0.5361, + "mean_token_accuracy": 0.8361834734678268, + "num_tokens": 62576680.0, + "step": 52060 + }, + { + "entropy": 1.9362813726067543, + "epoch": 0.16141244413376432, + "grad_norm": 9.152141571044922, + "learning_rate": 6.2969224244111225e-06, + "loss": 0.524, + "mean_token_accuracy": 0.8337451457977295, + "num_tokens": 62588501.0, + "step": 52070 + }, + { + "entropy": 1.9137926280498505, + "epoch": 0.16144344325881402, + "grad_norm": 7.9217047691345215, + "learning_rate": 6.296317840535582e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8420775726437568, + "num_tokens": 62601151.0, + "step": 52080 + }, + { + "entropy": 2.0264596685767176, + "epoch": 0.16147444238386371, + "grad_norm": 9.242731094360352, + "learning_rate": 6.295713430769611e-06, + "loss": 0.5702, + "mean_token_accuracy": 0.8302224919199943, + "num_tokens": 62612685.0, + "step": 52090 + }, + { + "entropy": 1.9726679027080536, + "epoch": 0.1615054415089134, + "grad_norm": 3.5792272090911865, + "learning_rate": 6.29510919502966e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.8441024616360664, + "num_tokens": 62624794.0, + "step": 52100 + }, + { + "entropy": 2.029518485069275, + "epoch": 0.1615364406339631, + "grad_norm": 9.071422576904297, + "learning_rate": 6.294505133232234e-06, + "loss": 0.6116, + "mean_token_accuracy": 0.815070490539074, + "num_tokens": 62636376.0, + "step": 52110 + }, + { + "entropy": 1.9669608414173125, + "epoch": 0.1615674397590128, + "grad_norm": 8.340578079223633, + "learning_rate": 6.293901245293893e-06, + "loss": 0.5875, + "mean_token_accuracy": 0.827280244231224, + "num_tokens": 62648432.0, + "step": 52120 + }, + { + "entropy": 1.9860535800457, + "epoch": 0.1615984388840625, + "grad_norm": 8.199106216430664, + "learning_rate": 6.293297531131253e-06, + "loss": 0.5733, + "mean_token_accuracy": 0.83171975761652, + "num_tokens": 62660966.0, + "step": 52130 + }, + { + "entropy": 2.0739176899194716, + "epoch": 0.1616294380091122, + "grad_norm": 10.330364227294922, + "learning_rate": 6.292693990660986e-06, + "loss": 0.5954, + "mean_token_accuracy": 0.8177470460534095, + "num_tokens": 62671756.0, + "step": 52140 + }, + { + "entropy": 1.9170552849769593, + "epoch": 0.1616604371341619, + "grad_norm": 8.208378791809082, + "learning_rate": 6.292090623799823e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8416977405548096, + "num_tokens": 62683638.0, + "step": 52150 + }, + { + "entropy": 1.8894746892154217, + "epoch": 0.1616914362592116, + "grad_norm": 6.65895414352417, + "learning_rate": 6.291487430464548e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.845618699491024, + "num_tokens": 62696586.0, + "step": 52160 + }, + { + "entropy": 1.8414909780025481, + "epoch": 0.1617224353842613, + "grad_norm": 9.130861282348633, + "learning_rate": 6.290884410572e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8525190591812134, + "num_tokens": 62709495.0, + "step": 52170 + }, + { + "entropy": 1.9621912389993668, + "epoch": 0.16175343450931098, + "grad_norm": 9.641398429870605, + "learning_rate": 6.290281564039078e-06, + "loss": 0.551, + "mean_token_accuracy": 0.8189822494983673, + "num_tokens": 62721137.0, + "step": 52180 + }, + { + "entropy": 1.869258552789688, + "epoch": 0.16178443363436068, + "grad_norm": 5.695123672485352, + "learning_rate": 6.28967889078273e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.8320344671607017, + "num_tokens": 62734333.0, + "step": 52190 + }, + { + "entropy": 1.860531361401081, + "epoch": 0.16181543275941038, + "grad_norm": 7.216394424438477, + "learning_rate": 6.289076390719966e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8472234353423118, + "num_tokens": 62746055.0, + "step": 52200 + }, + { + "entropy": 1.9746491819620133, + "epoch": 0.16184643188446007, + "grad_norm": 9.281171798706055, + "learning_rate": 6.2884740637678486e-06, + "loss": 0.5562, + "mean_token_accuracy": 0.8372055932879447, + "num_tokens": 62757053.0, + "step": 52210 + }, + { + "entropy": 1.9339981719851493, + "epoch": 0.16187743100950977, + "grad_norm": 10.073769569396973, + "learning_rate": 6.2878719098434975e-06, + "loss": 0.577, + "mean_token_accuracy": 0.8267990037798881, + "num_tokens": 62768818.0, + "step": 52220 + }, + { + "entropy": 1.75873264670372, + "epoch": 0.16190843013455944, + "grad_norm": 3.918210744857788, + "learning_rate": 6.287269928864085e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8522544384002686, + "num_tokens": 62782705.0, + "step": 52230 + }, + { + "entropy": 1.9759118229150772, + "epoch": 0.16193942925960914, + "grad_norm": 8.20750904083252, + "learning_rate": 6.286668120746842e-06, + "loss": 0.5464, + "mean_token_accuracy": 0.8221010401844978, + "num_tokens": 62794524.0, + "step": 52240 + }, + { + "entropy": 1.8256248250603675, + "epoch": 0.16197042838465883, + "grad_norm": 3.0523853302001953, + "learning_rate": 6.286066485409056e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.849830961227417, + "num_tokens": 62807370.0, + "step": 52250 + }, + { + "entropy": 1.864981435239315, + "epoch": 0.16200142750970853, + "grad_norm": 8.436219215393066, + "learning_rate": 6.285465022768064e-06, + "loss": 0.524, + "mean_token_accuracy": 0.8273715361952781, + "num_tokens": 62820130.0, + "step": 52260 + }, + { + "entropy": 1.9281801611185074, + "epoch": 0.16203242663475823, + "grad_norm": 8.644983291625977, + "learning_rate": 6.284863732741263e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8462273836135864, + "num_tokens": 62832096.0, + "step": 52270 + }, + { + "entropy": 1.9527894288301468, + "epoch": 0.16206342575980792, + "grad_norm": 9.219314575195312, + "learning_rate": 6.284262615246107e-06, + "loss": 0.5651, + "mean_token_accuracy": 0.8327635273337364, + "num_tokens": 62843243.0, + "step": 52280 + }, + { + "entropy": 1.823768149316311, + "epoch": 0.16209442488485762, + "grad_norm": 3.6631810665130615, + "learning_rate": 6.283661670200099e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8509679839015007, + "num_tokens": 62856797.0, + "step": 52290 + }, + { + "entropy": 1.9193030267953872, + "epoch": 0.16212542400990732, + "grad_norm": 19.060598373413086, + "learning_rate": 6.283060897520804e-06, + "loss": 0.5283, + "mean_token_accuracy": 0.8333917066454888, + "num_tokens": 62869635.0, + "step": 52300 + }, + { + "entropy": 1.9955947354435921, + "epoch": 0.162156423134957, + "grad_norm": 4.73981237411499, + "learning_rate": 6.282460297125835e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.8443202391266823, + "num_tokens": 62881179.0, + "step": 52310 + }, + { + "entropy": 1.9485764712095262, + "epoch": 0.1621874222600067, + "grad_norm": 8.990504264831543, + "learning_rate": 6.281859868932869e-06, + "loss": 0.5494, + "mean_token_accuracy": 0.8265508517622948, + "num_tokens": 62893144.0, + "step": 52320 + }, + { + "entropy": 1.9622452184557915, + "epoch": 0.1622184213850564, + "grad_norm": 8.875653266906738, + "learning_rate": 6.281259612859629e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8386439487338067, + "num_tokens": 62904857.0, + "step": 52330 + }, + { + "entropy": 1.889234210550785, + "epoch": 0.1622494205101061, + "grad_norm": 10.868480682373047, + "learning_rate": 6.2806595288239e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8372628569602967, + "num_tokens": 62917145.0, + "step": 52340 + }, + { + "entropy": 1.8264701277017594, + "epoch": 0.1622804196351558, + "grad_norm": 9.613685607910156, + "learning_rate": 6.2800596167435165e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.850235678255558, + "num_tokens": 62930488.0, + "step": 52350 + }, + { + "entropy": 1.854723860323429, + "epoch": 0.1623114187602055, + "grad_norm": 9.742049217224121, + "learning_rate": 6.279459876536374e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8460089027881622, + "num_tokens": 62942658.0, + "step": 52360 + }, + { + "entropy": 1.9286055207252502, + "epoch": 0.1623424178852552, + "grad_norm": 4.1667304039001465, + "learning_rate": 6.278860308120416e-06, + "loss": 0.5199, + "mean_token_accuracy": 0.8262563273310661, + "num_tokens": 62953922.0, + "step": 52370 + }, + { + "entropy": 1.7510716319084167, + "epoch": 0.1623734170103049, + "grad_norm": 8.469949722290039, + "learning_rate": 6.278260911413646e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8467779159545898, + "num_tokens": 62967483.0, + "step": 52380 + }, + { + "entropy": 1.9073357090353966, + "epoch": 0.16240441613535458, + "grad_norm": 8.063495635986328, + "learning_rate": 6.27766168633412e-06, + "loss": 0.5298, + "mean_token_accuracy": 0.8321097016334533, + "num_tokens": 62979720.0, + "step": 52390 + }, + { + "entropy": 1.9408669352531434, + "epoch": 0.16243541526040428, + "grad_norm": 10.30274772644043, + "learning_rate": 6.277062632799949e-06, + "loss": 0.515, + "mean_token_accuracy": 0.8371255546808243, + "num_tokens": 62991264.0, + "step": 52400 + }, + { + "entropy": 1.9547599002718925, + "epoch": 0.16246641438545398, + "grad_norm": 8.85878849029541, + "learning_rate": 6.276463750729301e-06, + "loss": 0.5293, + "mean_token_accuracy": 0.8315024048089981, + "num_tokens": 63003018.0, + "step": 52410 + }, + { + "entropy": 1.9163083508610725, + "epoch": 0.16249741351050367, + "grad_norm": 9.163949966430664, + "learning_rate": 6.2758650400403964e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8458400592207909, + "num_tokens": 63014741.0, + "step": 52420 + }, + { + "entropy": 1.9846520096063613, + "epoch": 0.16252841263555337, + "grad_norm": 7.502941131591797, + "learning_rate": 6.275266500651508e-06, + "loss": 0.5835, + "mean_token_accuracy": 0.8262620836496353, + "num_tokens": 63025808.0, + "step": 52430 + }, + { + "entropy": 1.8606478095054626, + "epoch": 0.16255941176060307, + "grad_norm": 4.25178861618042, + "learning_rate": 6.274668132480967e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8410593420267105, + "num_tokens": 63038607.0, + "step": 52440 + }, + { + "entropy": 1.8719539806246757, + "epoch": 0.16259041088565276, + "grad_norm": 4.510512828826904, + "learning_rate": 6.274069935447157e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8395673424005509, + "num_tokens": 63050950.0, + "step": 52450 + }, + { + "entropy": 1.8893551722168922, + "epoch": 0.16262141001070246, + "grad_norm": 3.8029367923736572, + "learning_rate": 6.273471909468518e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8418749555945396, + "num_tokens": 63063475.0, + "step": 52460 + }, + { + "entropy": 1.8528440594673157, + "epoch": 0.16265240913575213, + "grad_norm": 8.126982688903809, + "learning_rate": 6.272874054463543e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8420227542519569, + "num_tokens": 63076738.0, + "step": 52470 + }, + { + "entropy": 1.9618181601166724, + "epoch": 0.16268340826080183, + "grad_norm": 11.542217254638672, + "learning_rate": 6.272276370350776e-06, + "loss": 0.5376, + "mean_token_accuracy": 0.8369494050741195, + "num_tokens": 63087700.0, + "step": 52480 + }, + { + "entropy": 1.9238998390734197, + "epoch": 0.16271440738585152, + "grad_norm": 7.640765190124512, + "learning_rate": 6.271678857048824e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.8411808148026466, + "num_tokens": 63099944.0, + "step": 52490 + }, + { + "entropy": 1.888495209813118, + "epoch": 0.16274540651090122, + "grad_norm": 7.534304141998291, + "learning_rate": 6.271081514476341e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8466553881764411, + "num_tokens": 63112697.0, + "step": 52500 + }, + { + "entropy": 1.9347805023193358, + "epoch": 0.16277640563595092, + "grad_norm": 7.860371112823486, + "learning_rate": 6.270484342552038e-06, + "loss": 0.535, + "mean_token_accuracy": 0.8298186644911766, + "num_tokens": 63124515.0, + "step": 52510 + }, + { + "entropy": 1.8718351736664771, + "epoch": 0.1628074047610006, + "grad_norm": 11.632791519165039, + "learning_rate": 6.269887341194678e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8416498139500618, + "num_tokens": 63137607.0, + "step": 52520 + }, + { + "entropy": 1.8613640531897544, + "epoch": 0.1628384038860503, + "grad_norm": 5.130488872528076, + "learning_rate": 6.269290510323079e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.8313415363430977, + "num_tokens": 63150710.0, + "step": 52530 + }, + { + "entropy": 1.8332793086767196, + "epoch": 0.1628694030111, + "grad_norm": 8.796858787536621, + "learning_rate": 6.2686938498561155e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8430872440338135, + "num_tokens": 63163425.0, + "step": 52540 + }, + { + "entropy": 1.8761805295944214, + "epoch": 0.1629004021361497, + "grad_norm": 4.210749626159668, + "learning_rate": 6.268097359712715e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8424938842654228, + "num_tokens": 63175765.0, + "step": 52550 + }, + { + "entropy": 1.9241129815578462, + "epoch": 0.1629314012611994, + "grad_norm": 7.9560465812683105, + "learning_rate": 6.267501039811856e-06, + "loss": 0.5341, + "mean_token_accuracy": 0.836355759203434, + "num_tokens": 63187161.0, + "step": 52560 + }, + { + "entropy": 1.9873510360717774, + "epoch": 0.1629624003862491, + "grad_norm": 8.924347877502441, + "learning_rate": 6.2669048900725745e-06, + "loss": 0.6066, + "mean_token_accuracy": 0.8214996859431267, + "num_tokens": 63198468.0, + "step": 52570 + }, + { + "entropy": 1.9314937353134156, + "epoch": 0.1629933995112988, + "grad_norm": 5.659384727478027, + "learning_rate": 6.266308910413959e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.8333692610263824, + "num_tokens": 63210134.0, + "step": 52580 + }, + { + "entropy": 2.028110182285309, + "epoch": 0.1630243986363485, + "grad_norm": 8.73976993560791, + "learning_rate": 6.2657131007551516e-06, + "loss": 0.6023, + "mean_token_accuracy": 0.8240557476878166, + "num_tokens": 63221318.0, + "step": 52590 + }, + { + "entropy": 1.8795430675148963, + "epoch": 0.16305539776139819, + "grad_norm": 7.525078296661377, + "learning_rate": 6.265117461015348e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8401163816452026, + "num_tokens": 63234281.0, + "step": 52600 + }, + { + "entropy": 1.9891944900155067, + "epoch": 0.16308639688644788, + "grad_norm": 8.702574729919434, + "learning_rate": 6.2645219911138e-06, + "loss": 0.5248, + "mean_token_accuracy": 0.8414581835269928, + "num_tokens": 63245528.0, + "step": 52610 + }, + { + "entropy": 1.9499772146344185, + "epoch": 0.16311739601149758, + "grad_norm": 8.397680282592773, + "learning_rate": 6.263926690969809e-06, + "loss": 0.5396, + "mean_token_accuracy": 0.8338237330317497, + "num_tokens": 63257825.0, + "step": 52620 + }, + { + "entropy": 1.9680059745907783, + "epoch": 0.16314839513654728, + "grad_norm": 8.269797325134277, + "learning_rate": 6.263331560502734e-06, + "loss": 0.5671, + "mean_token_accuracy": 0.8288321003317833, + "num_tokens": 63269246.0, + "step": 52630 + }, + { + "entropy": 1.9180641785264014, + "epoch": 0.16317939426159697, + "grad_norm": 7.361063003540039, + "learning_rate": 6.262736599631985e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8423007354140282, + "num_tokens": 63281078.0, + "step": 52640 + }, + { + "entropy": 1.9701405078172685, + "epoch": 0.16321039338664667, + "grad_norm": 9.259462356567383, + "learning_rate": 6.262141808277028e-06, + "loss": 0.5578, + "mean_token_accuracy": 0.8284604266285897, + "num_tokens": 63292492.0, + "step": 52650 + }, + { + "entropy": 1.8732146829366685, + "epoch": 0.16324139251169636, + "grad_norm": 4.93553352355957, + "learning_rate": 6.261547186357378e-06, + "loss": 0.5442, + "mean_token_accuracy": 0.8330485403537751, + "num_tokens": 63305365.0, + "step": 52660 + }, + { + "entropy": 1.8874418213963509, + "epoch": 0.16327239163674606, + "grad_norm": 12.51389217376709, + "learning_rate": 6.260952733792611e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8417252108454705, + "num_tokens": 63317381.0, + "step": 52670 + }, + { + "entropy": 1.9065624982118607, + "epoch": 0.16330339076179576, + "grad_norm": 4.349983215332031, + "learning_rate": 6.26035845050235e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.83489740639925, + "num_tokens": 63329554.0, + "step": 52680 + }, + { + "entropy": 1.9560609757900238, + "epoch": 0.16333438988684545, + "grad_norm": 8.065526008605957, + "learning_rate": 6.259764336406272e-06, + "loss": 0.5565, + "mean_token_accuracy": 0.8307097449898719, + "num_tokens": 63341290.0, + "step": 52690 + }, + { + "entropy": 1.9741875648498535, + "epoch": 0.16336538901189515, + "grad_norm": 4.957176685333252, + "learning_rate": 6.259170391424109e-06, + "loss": 0.5308, + "mean_token_accuracy": 0.8235329478979111, + "num_tokens": 63352756.0, + "step": 52700 + }, + { + "entropy": 1.897114197909832, + "epoch": 0.16339638813694485, + "grad_norm": 7.078670024871826, + "learning_rate": 6.25857661547565e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8442688778042793, + "num_tokens": 63364598.0, + "step": 52710 + }, + { + "entropy": 1.9124154165387153, + "epoch": 0.16342738726199452, + "grad_norm": 8.284811973571777, + "learning_rate": 6.257983008480728e-06, + "loss": 0.5272, + "mean_token_accuracy": 0.8364136472344399, + "num_tokens": 63376703.0, + "step": 52720 + }, + { + "entropy": 1.9177636936306954, + "epoch": 0.1634583863870442, + "grad_norm": 6.819045066833496, + "learning_rate": 6.257389570359238e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.846918910741806, + "num_tokens": 63389432.0, + "step": 52730 + }, + { + "entropy": 1.9418374314904212, + "epoch": 0.1634893855120939, + "grad_norm": 9.605525970458984, + "learning_rate": 6.256796301031124e-06, + "loss": 0.5572, + "mean_token_accuracy": 0.8311137303709983, + "num_tokens": 63401131.0, + "step": 52740 + }, + { + "entropy": 1.8737993687391281, + "epoch": 0.1635203846371436, + "grad_norm": 8.54281234741211, + "learning_rate": 6.256203200416383e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.8313992112874985, + "num_tokens": 63413232.0, + "step": 52750 + }, + { + "entropy": 1.8898098900914193, + "epoch": 0.1635513837621933, + "grad_norm": 8.377137184143066, + "learning_rate": 6.255610268435066e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8360078081488609, + "num_tokens": 63425498.0, + "step": 52760 + }, + { + "entropy": 1.9383550241589547, + "epoch": 0.163582382887243, + "grad_norm": 8.232552528381348, + "learning_rate": 6.255017505007278e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.8423770934343338, + "num_tokens": 63436970.0, + "step": 52770 + }, + { + "entropy": 1.9232452899217605, + "epoch": 0.1636133820122927, + "grad_norm": 7.204452991485596, + "learning_rate": 6.254424910053175e-06, + "loss": 0.5308, + "mean_token_accuracy": 0.8377453580498695, + "num_tokens": 63448237.0, + "step": 52780 + }, + { + "entropy": 1.9503621995449065, + "epoch": 0.1636443811373424, + "grad_norm": 7.743916034698486, + "learning_rate": 6.253832483492968e-06, + "loss": 0.604, + "mean_token_accuracy": 0.818188302218914, + "num_tokens": 63459542.0, + "step": 52790 + }, + { + "entropy": 1.9227148860692977, + "epoch": 0.1636753802623921, + "grad_norm": 8.97805404663086, + "learning_rate": 6.253240225246917e-06, + "loss": 0.5535, + "mean_token_accuracy": 0.835084454715252, + "num_tokens": 63470819.0, + "step": 52800 + }, + { + "entropy": 1.8931695908308028, + "epoch": 0.16370637938744179, + "grad_norm": 8.5215482711792, + "learning_rate": 6.25264813523534e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.841637410223484, + "num_tokens": 63483021.0, + "step": 52810 + }, + { + "entropy": 1.9346808150410653, + "epoch": 0.16373737851249148, + "grad_norm": 3.57350754737854, + "learning_rate": 6.252056213378607e-06, + "loss": 0.5261, + "mean_token_accuracy": 0.8383998513221741, + "num_tokens": 63495186.0, + "step": 52820 + }, + { + "entropy": 1.8804344907402992, + "epoch": 0.16376837763754118, + "grad_norm": 8.589835166931152, + "learning_rate": 6.251464459597134e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8427335307002067, + "num_tokens": 63507869.0, + "step": 52830 + }, + { + "entropy": 1.8359813764691353, + "epoch": 0.16379937676259088, + "grad_norm": 9.559521675109863, + "learning_rate": 6.2508728738114e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8516914382576942, + "num_tokens": 63519894.0, + "step": 52840 + }, + { + "entropy": 1.9041066259145736, + "epoch": 0.16383037588764057, + "grad_norm": 4.24696683883667, + "learning_rate": 6.250281455941929e-06, + "loss": 0.5371, + "mean_token_accuracy": 0.8393527343869209, + "num_tokens": 63532566.0, + "step": 52850 + }, + { + "entropy": 1.857696245610714, + "epoch": 0.16386137501269027, + "grad_norm": 13.18340015411377, + "learning_rate": 6.249690205909301e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8314845860004425, + "num_tokens": 63545311.0, + "step": 52860 + }, + { + "entropy": 1.9533265694975852, + "epoch": 0.16389237413773997, + "grad_norm": 8.314949035644531, + "learning_rate": 6.249099123634147e-06, + "loss": 0.5793, + "mean_token_accuracy": 0.8232409819960594, + "num_tokens": 63556591.0, + "step": 52870 + }, + { + "entropy": 1.9042376592755317, + "epoch": 0.16392337326278966, + "grad_norm": 7.390749454498291, + "learning_rate": 6.248508209037151e-06, + "loss": 0.5547, + "mean_token_accuracy": 0.827096089720726, + "num_tokens": 63568839.0, + "step": 52880 + }, + { + "entropy": 1.8934383913874626, + "epoch": 0.16395437238783936, + "grad_norm": 8.171467781066895, + "learning_rate": 6.2479174620390516e-06, + "loss": 0.5367, + "mean_token_accuracy": 0.8317389547824859, + "num_tokens": 63580636.0, + "step": 52890 + }, + { + "entropy": 1.8028965070843697, + "epoch": 0.16398537151288906, + "grad_norm": 9.49335765838623, + "learning_rate": 6.247326882560637e-06, + "loss": 0.5213, + "mean_token_accuracy": 0.8371429294347763, + "num_tokens": 63593629.0, + "step": 52900 + }, + { + "entropy": 1.848221817612648, + "epoch": 0.16401637063793875, + "grad_norm": 8.622106552124023, + "learning_rate": 6.246736470522748e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8468390449881553, + "num_tokens": 63606139.0, + "step": 52910 + }, + { + "entropy": 1.8176067188382148, + "epoch": 0.16404736976298845, + "grad_norm": 9.547903060913086, + "learning_rate": 6.2461462258462804e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.854963929951191, + "num_tokens": 63619469.0, + "step": 52920 + }, + { + "entropy": 1.9598242774605752, + "epoch": 0.16407836888803815, + "grad_norm": 8.696121215820312, + "learning_rate": 6.245556148452177e-06, + "loss": 0.5457, + "mean_token_accuracy": 0.8299388602375984, + "num_tokens": 63630770.0, + "step": 52930 + }, + { + "entropy": 1.9861217468976975, + "epoch": 0.16410936801308784, + "grad_norm": 7.65268611907959, + "learning_rate": 6.244966238261442e-06, + "loss": 0.5992, + "mean_token_accuracy": 0.8203268766403198, + "num_tokens": 63641633.0, + "step": 52940 + }, + { + "entropy": 1.933871328830719, + "epoch": 0.16414036713813754, + "grad_norm": 4.0689263343811035, + "learning_rate": 6.2443764951951215e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8379737511277199, + "num_tokens": 63653263.0, + "step": 52950 + }, + { + "entropy": 1.924587444961071, + "epoch": 0.16417136626318724, + "grad_norm": 9.592199325561523, + "learning_rate": 6.24378691917432e-06, + "loss": 0.5423, + "mean_token_accuracy": 0.8331802487373352, + "num_tokens": 63664170.0, + "step": 52960 + }, + { + "entropy": 1.922967004776001, + "epoch": 0.1642023653882369, + "grad_norm": 3.9042298793792725, + "learning_rate": 6.2431975101201926e-06, + "loss": 0.534, + "mean_token_accuracy": 0.8360741794109344, + "num_tokens": 63675465.0, + "step": 52970 + }, + { + "entropy": 1.8445443019270897, + "epoch": 0.1642333645132866, + "grad_norm": 7.786768913269043, + "learning_rate": 6.242608267953947e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8440708309412003, + "num_tokens": 63688438.0, + "step": 52980 + }, + { + "entropy": 1.9318128436803819, + "epoch": 0.1642643636383363, + "grad_norm": 7.483189582824707, + "learning_rate": 6.242019192596842e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8481845185160637, + "num_tokens": 63700364.0, + "step": 52990 + }, + { + "entropy": 1.9713671594858169, + "epoch": 0.164295362763386, + "grad_norm": 7.716215133666992, + "learning_rate": 6.241430283970189e-06, + "loss": 0.5784, + "mean_token_accuracy": 0.8340044632554054, + "num_tokens": 63711655.0, + "step": 53000 + }, + { + "entropy": 1.916495531797409, + "epoch": 0.1643263618884357, + "grad_norm": 7.8675971031188965, + "learning_rate": 6.24084154199535e-06, + "loss": 0.5383, + "mean_token_accuracy": 0.8242413744330406, + "num_tokens": 63723415.0, + "step": 53010 + }, + { + "entropy": 1.8746812611818313, + "epoch": 0.1643573610134854, + "grad_norm": 4.13045072555542, + "learning_rate": 6.240252966593741e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8390706807374955, + "num_tokens": 63735381.0, + "step": 53020 + }, + { + "entropy": 1.9430684581398965, + "epoch": 0.16438836013853508, + "grad_norm": 8.874407768249512, + "learning_rate": 6.23966455768683e-06, + "loss": 0.5869, + "mean_token_accuracy": 0.8245584949851036, + "num_tokens": 63746701.0, + "step": 53030 + }, + { + "entropy": 1.9074441373348237, + "epoch": 0.16441935926358478, + "grad_norm": 3.734499454498291, + "learning_rate": 6.239076315196135e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.8352471068501472, + "num_tokens": 63758681.0, + "step": 53040 + }, + { + "entropy": 1.9198872432112695, + "epoch": 0.16445035838863448, + "grad_norm": 7.779430866241455, + "learning_rate": 6.2384882390432265e-06, + "loss": 0.5311, + "mean_token_accuracy": 0.8286612689495086, + "num_tokens": 63770142.0, + "step": 53050 + }, + { + "entropy": 1.9856867283582686, + "epoch": 0.16448135751368417, + "grad_norm": 8.502123832702637, + "learning_rate": 6.2379003291497265e-06, + "loss": 0.5602, + "mean_token_accuracy": 0.8293860018253326, + "num_tokens": 63781495.0, + "step": 53060 + }, + { + "entropy": 1.8787162870168685, + "epoch": 0.16451235663873387, + "grad_norm": 3.1227447986602783, + "learning_rate": 6.237312585437309e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.8432197883725167, + "num_tokens": 63793956.0, + "step": 53070 + }, + { + "entropy": 1.8861326739192008, + "epoch": 0.16454335576378357, + "grad_norm": 9.353623390197754, + "learning_rate": 6.236725007827702e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8440233051776886, + "num_tokens": 63806818.0, + "step": 53080 + }, + { + "entropy": 1.7985064759850502, + "epoch": 0.16457435488883326, + "grad_norm": 10.218011856079102, + "learning_rate": 6.23613759624268e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8478423848748207, + "num_tokens": 63820679.0, + "step": 53090 + }, + { + "entropy": 1.8236029118299484, + "epoch": 0.16460535401388296, + "grad_norm": 4.523934841156006, + "learning_rate": 6.235550350604071e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8463995724916458, + "num_tokens": 63833871.0, + "step": 53100 + }, + { + "entropy": 1.8776141807436943, + "epoch": 0.16463635313893266, + "grad_norm": 4.4371161460876465, + "learning_rate": 6.234963270833758e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8449958503246308, + "num_tokens": 63845568.0, + "step": 53110 + }, + { + "entropy": 1.8257340744137764, + "epoch": 0.16466735226398235, + "grad_norm": 9.007696151733398, + "learning_rate": 6.234376356853673e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8533379226922989, + "num_tokens": 63858077.0, + "step": 53120 + }, + { + "entropy": 1.8883861318230628, + "epoch": 0.16469835138903205, + "grad_norm": 8.40960693359375, + "learning_rate": 6.233789608585796e-06, + "loss": 0.5674, + "mean_token_accuracy": 0.8298670023679733, + "num_tokens": 63870245.0, + "step": 53130 + }, + { + "entropy": 1.993936476111412, + "epoch": 0.16472935051408175, + "grad_norm": 9.627812385559082, + "learning_rate": 6.233203025952166e-06, + "loss": 0.6206, + "mean_token_accuracy": 0.8264122769236565, + "num_tokens": 63881175.0, + "step": 53140 + }, + { + "entropy": 1.8338624104857444, + "epoch": 0.16476034963913144, + "grad_norm": 10.100017547607422, + "learning_rate": 6.232616608874865e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8356154605746269, + "num_tokens": 63894063.0, + "step": 53150 + }, + { + "entropy": 1.8793131604790687, + "epoch": 0.16479134876418114, + "grad_norm": 8.098730087280273, + "learning_rate": 6.232030357276034e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8461975052952766, + "num_tokens": 63905919.0, + "step": 53160 + }, + { + "entropy": 1.9642018005251884, + "epoch": 0.16482234788923084, + "grad_norm": 7.914994716644287, + "learning_rate": 6.231444271077859e-06, + "loss": 0.555, + "mean_token_accuracy": 0.829823549091816, + "num_tokens": 63916952.0, + "step": 53170 + }, + { + "entropy": 1.8831199273467063, + "epoch": 0.16485334701428053, + "grad_norm": 10.928045272827148, + "learning_rate": 6.23085835020258e-06, + "loss": 0.5219, + "mean_token_accuracy": 0.8363705515861511, + "num_tokens": 63928969.0, + "step": 53180 + }, + { + "entropy": 1.9851090759038925, + "epoch": 0.16488434613933023, + "grad_norm": 9.08737564086914, + "learning_rate": 6.230272594572488e-06, + "loss": 0.5901, + "mean_token_accuracy": 0.8261380925774574, + "num_tokens": 63939337.0, + "step": 53190 + }, + { + "entropy": 1.908937330543995, + "epoch": 0.16491534526437993, + "grad_norm": 11.058307647705078, + "learning_rate": 6.229687004109927e-06, + "loss": 0.5673, + "mean_token_accuracy": 0.8325806826353073, + "num_tokens": 63951300.0, + "step": 53200 + }, + { + "entropy": 1.9652216613292695, + "epoch": 0.16494634438942962, + "grad_norm": 9.384000778198242, + "learning_rate": 6.229101578737288e-06, + "loss": 0.5616, + "mean_token_accuracy": 0.8316969543695449, + "num_tokens": 63961965.0, + "step": 53210 + }, + { + "entropy": 1.9054602891206742, + "epoch": 0.1649773435144793, + "grad_norm": 4.693671226501465, + "learning_rate": 6.228516318377016e-06, + "loss": 0.5676, + "mean_token_accuracy": 0.8300312593579292, + "num_tokens": 63974012.0, + "step": 53220 + }, + { + "entropy": 1.87213394343853, + "epoch": 0.165008342639529, + "grad_norm": 8.082072257995605, + "learning_rate": 6.227931222951605e-06, + "loss": 0.522, + "mean_token_accuracy": 0.836493344604969, + "num_tokens": 63985663.0, + "step": 53230 + }, + { + "entropy": 1.9317936196923255, + "epoch": 0.16503934176457868, + "grad_norm": 8.324505805969238, + "learning_rate": 6.227346292383604e-06, + "loss": 0.5724, + "mean_token_accuracy": 0.8266930311918259, + "num_tokens": 63996929.0, + "step": 53240 + }, + { + "entropy": 1.8709998071193694, + "epoch": 0.16507034088962838, + "grad_norm": 8.387805938720703, + "learning_rate": 6.226761526595607e-06, + "loss": 0.557, + "mean_token_accuracy": 0.8292562425136566, + "num_tokens": 64008847.0, + "step": 53250 + }, + { + "entropy": 1.8826777443289757, + "epoch": 0.16510134001467808, + "grad_norm": 7.2748541831970215, + "learning_rate": 6.2261769255102635e-06, + "loss": 0.513, + "mean_token_accuracy": 0.8443334132432938, + "num_tokens": 64020608.0, + "step": 53260 + }, + { + "entropy": 1.8901048377156258, + "epoch": 0.16513233913972777, + "grad_norm": 4.1918625831604, + "learning_rate": 6.225592489050273e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8356076464056968, + "num_tokens": 64032582.0, + "step": 53270 + }, + { + "entropy": 1.8856966137886046, + "epoch": 0.16516333826477747, + "grad_norm": 9.814857482910156, + "learning_rate": 6.225008217138383e-06, + "loss": 0.5243, + "mean_token_accuracy": 0.8268331870436668, + "num_tokens": 64044568.0, + "step": 53280 + }, + { + "entropy": 1.9235829666256905, + "epoch": 0.16519433738982717, + "grad_norm": 8.297536849975586, + "learning_rate": 6.224424109697395e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8338031157851219, + "num_tokens": 64055682.0, + "step": 53290 + }, + { + "entropy": 1.8362517848610878, + "epoch": 0.16522533651487686, + "grad_norm": 4.987396717071533, + "learning_rate": 6.2238401666501594e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.842101874947548, + "num_tokens": 64068519.0, + "step": 53300 + }, + { + "entropy": 1.8466239094734191, + "epoch": 0.16525633563992656, + "grad_norm": 7.831912517547607, + "learning_rate": 6.2232563879195784e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8441867157816887, + "num_tokens": 64081113.0, + "step": 53310 + }, + { + "entropy": 1.901265199482441, + "epoch": 0.16528733476497626, + "grad_norm": 9.943422317504883, + "learning_rate": 6.222672773428604e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8303415447473526, + "num_tokens": 64093471.0, + "step": 53320 + }, + { + "entropy": 1.8734540060162543, + "epoch": 0.16531833389002595, + "grad_norm": 9.80716609954834, + "learning_rate": 6.222089323100241e-06, + "loss": 0.5508, + "mean_token_accuracy": 0.8340218141674995, + "num_tokens": 64106551.0, + "step": 53330 + }, + { + "entropy": 1.8805134430527688, + "epoch": 0.16534933301507565, + "grad_norm": 8.605008125305176, + "learning_rate": 6.221506036857539e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8431431293487549, + "num_tokens": 64119113.0, + "step": 53340 + }, + { + "entropy": 1.8234007403254509, + "epoch": 0.16538033214012535, + "grad_norm": 4.009090423583984, + "learning_rate": 6.220922914623604e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8466459035873413, + "num_tokens": 64132616.0, + "step": 53350 + }, + { + "entropy": 1.8845936864614488, + "epoch": 0.16541133126517504, + "grad_norm": 10.148909568786621, + "learning_rate": 6.2203399563215905e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8377220943570137, + "num_tokens": 64145357.0, + "step": 53360 + }, + { + "entropy": 1.959778368473053, + "epoch": 0.16544233039022474, + "grad_norm": 8.213615417480469, + "learning_rate": 6.219757161874702e-06, + "loss": 0.5736, + "mean_token_accuracy": 0.8356760248541832, + "num_tokens": 64156979.0, + "step": 53370 + }, + { + "entropy": 1.9542056560516357, + "epoch": 0.16547332951527444, + "grad_norm": 7.512001991271973, + "learning_rate": 6.219174531206195e-06, + "loss": 0.5212, + "mean_token_accuracy": 0.8426282212138176, + "num_tokens": 64167936.0, + "step": 53380 + }, + { + "entropy": 1.8867290809750557, + "epoch": 0.16550432864032413, + "grad_norm": 11.55746078491211, + "learning_rate": 6.2185920642393724e-06, + "loss": 0.538, + "mean_token_accuracy": 0.8335390016436577, + "num_tokens": 64178732.0, + "step": 53390 + }, + { + "entropy": 1.9342323437333107, + "epoch": 0.16553532776537383, + "grad_norm": 8.260815620422363, + "learning_rate": 6.218009760897592e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8423149228096009, + "num_tokens": 64189523.0, + "step": 53400 + }, + { + "entropy": 1.8842595234513282, + "epoch": 0.16556632689042353, + "grad_norm": 8.171215057373047, + "learning_rate": 6.21742762110426e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.852245531976223, + "num_tokens": 64200851.0, + "step": 53410 + }, + { + "entropy": 1.9514251738786696, + "epoch": 0.16559732601547322, + "grad_norm": 11.585762977600098, + "learning_rate": 6.216845644782831e-06, + "loss": 0.6304, + "mean_token_accuracy": 0.8217004343867302, + "num_tokens": 64212043.0, + "step": 53420 + }, + { + "entropy": 1.9424443751573564, + "epoch": 0.16562832514052292, + "grad_norm": 7.723189353942871, + "learning_rate": 6.216263831856811e-06, + "loss": 0.5982, + "mean_token_accuracy": 0.830877748131752, + "num_tokens": 64223504.0, + "step": 53430 + }, + { + "entropy": 1.9373986154794693, + "epoch": 0.16565932426557262, + "grad_norm": 8.541019439697266, + "learning_rate": 6.215682182249758e-06, + "loss": 0.5521, + "mean_token_accuracy": 0.8272872015833854, + "num_tokens": 64235303.0, + "step": 53440 + }, + { + "entropy": 1.926292023062706, + "epoch": 0.1656903233906223, + "grad_norm": 9.712891578674316, + "learning_rate": 6.215100695885277e-06, + "loss": 0.5625, + "mean_token_accuracy": 0.8317703247070313, + "num_tokens": 64246713.0, + "step": 53450 + }, + { + "entropy": 1.8299464777112007, + "epoch": 0.16572132251567198, + "grad_norm": 8.690601348876953, + "learning_rate": 6.214519372687023e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8461026042699814, + "num_tokens": 64259907.0, + "step": 53460 + }, + { + "entropy": 1.9531344637274741, + "epoch": 0.16575232164072168, + "grad_norm": 8.977042198181152, + "learning_rate": 6.2139382125787065e-06, + "loss": 0.5531, + "mean_token_accuracy": 0.8346742272377015, + "num_tokens": 64271538.0, + "step": 53470 + }, + { + "entropy": 1.870873971283436, + "epoch": 0.16578332076577137, + "grad_norm": 10.686627388000488, + "learning_rate": 6.213357215484079e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.8391006410121917, + "num_tokens": 64283477.0, + "step": 53480 + }, + { + "entropy": 1.884416152536869, + "epoch": 0.16581431989082107, + "grad_norm": 9.072136878967285, + "learning_rate": 6.2127763813269494e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.8330440178513527, + "num_tokens": 64295950.0, + "step": 53490 + }, + { + "entropy": 1.8264600470662118, + "epoch": 0.16584531901587077, + "grad_norm": 3.7949202060699463, + "learning_rate": 6.212195710031174e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8554823577404023, + "num_tokens": 64308519.0, + "step": 53500 + }, + { + "entropy": 1.8256630197167396, + "epoch": 0.16587631814092046, + "grad_norm": 8.502163887023926, + "learning_rate": 6.211615201520656e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8437755182385445, + "num_tokens": 64321187.0, + "step": 53510 + }, + { + "entropy": 1.7614564910531043, + "epoch": 0.16590731726597016, + "grad_norm": 9.997268676757812, + "learning_rate": 6.211034855719356e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8389209792017936, + "num_tokens": 64334994.0, + "step": 53520 + }, + { + "entropy": 1.934888207912445, + "epoch": 0.16593831639101986, + "grad_norm": 7.5674519538879395, + "learning_rate": 6.210454672551274e-06, + "loss": 0.5638, + "mean_token_accuracy": 0.8264615863561631, + "num_tokens": 64345597.0, + "step": 53530 + }, + { + "entropy": 1.8772856667637825, + "epoch": 0.16596931551606955, + "grad_norm": 10.031050682067871, + "learning_rate": 6.209874651940466e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8403290301561356, + "num_tokens": 64357564.0, + "step": 53540 + }, + { + "entropy": 1.8533400312066077, + "epoch": 0.16600031464111925, + "grad_norm": 10.334144592285156, + "learning_rate": 6.209294793811038e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8353090777993202, + "num_tokens": 64369611.0, + "step": 53550 + }, + { + "entropy": 1.940661406517029, + "epoch": 0.16603131376616895, + "grad_norm": 11.576028823852539, + "learning_rate": 6.208715098087144e-06, + "loss": 0.5575, + "mean_token_accuracy": 0.8241597592830658, + "num_tokens": 64380934.0, + "step": 53560 + }, + { + "entropy": 1.9238759770989418, + "epoch": 0.16606231289121864, + "grad_norm": 6.117959022521973, + "learning_rate": 6.208135564692989e-06, + "loss": 0.581, + "mean_token_accuracy": 0.824966461956501, + "num_tokens": 64392121.0, + "step": 53570 + }, + { + "entropy": 1.9680738180875779, + "epoch": 0.16609331201626834, + "grad_norm": 6.631012916564941, + "learning_rate": 6.207556193552824e-06, + "loss": 0.5566, + "mean_token_accuracy": 0.8322376728057861, + "num_tokens": 64403048.0, + "step": 53580 + }, + { + "entropy": 1.895513379573822, + "epoch": 0.16612431114131804, + "grad_norm": 9.87430191040039, + "learning_rate": 6.206976984590952e-06, + "loss": 0.6129, + "mean_token_accuracy": 0.8248256593942642, + "num_tokens": 64414999.0, + "step": 53590 + }, + { + "entropy": 1.8739793948829173, + "epoch": 0.16615531026636773, + "grad_norm": 10.234819412231445, + "learning_rate": 6.206397937731726e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8436161160469056, + "num_tokens": 64427895.0, + "step": 53600 + }, + { + "entropy": 1.885368101298809, + "epoch": 0.16618630939141743, + "grad_norm": 7.840350151062012, + "learning_rate": 6.205819052899549e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.8369289055466652, + "num_tokens": 64440003.0, + "step": 53610 + }, + { + "entropy": 1.9216457679867744, + "epoch": 0.16621730851646713, + "grad_norm": 9.213172912597656, + "learning_rate": 6.205240330018869e-06, + "loss": 0.5225, + "mean_token_accuracy": 0.840261846780777, + "num_tokens": 64451450.0, + "step": 53620 + }, + { + "entropy": 1.9085844457149506, + "epoch": 0.16624830764151682, + "grad_norm": 8.908021926879883, + "learning_rate": 6.2046617690141876e-06, + "loss": 0.5254, + "mean_token_accuracy": 0.8355207294225693, + "num_tokens": 64463262.0, + "step": 53630 + }, + { + "entropy": 1.932268613576889, + "epoch": 0.16627930676656652, + "grad_norm": 8.519464492797852, + "learning_rate": 6.204083369810055e-06, + "loss": 0.5445, + "mean_token_accuracy": 0.8326381966471672, + "num_tokens": 64474641.0, + "step": 53640 + }, + { + "entropy": 1.8569614686071874, + "epoch": 0.16631030589161622, + "grad_norm": 4.804983139038086, + "learning_rate": 6.203505132331069e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8352816164493561, + "num_tokens": 64487602.0, + "step": 53650 + }, + { + "entropy": 1.858240360021591, + "epoch": 0.1663413050166659, + "grad_norm": 10.345724105834961, + "learning_rate": 6.202927056501878e-06, + "loss": 0.5177, + "mean_token_accuracy": 0.8289370179176331, + "num_tokens": 64500395.0, + "step": 53660 + }, + { + "entropy": 1.807939064502716, + "epoch": 0.1663723041417156, + "grad_norm": 7.798347473144531, + "learning_rate": 6.202349142247179e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8403982803225517, + "num_tokens": 64513854.0, + "step": 53670 + }, + { + "entropy": 1.972070437669754, + "epoch": 0.1664033032667653, + "grad_norm": 7.832572937011719, + "learning_rate": 6.201771389491718e-06, + "loss": 0.5794, + "mean_token_accuracy": 0.8308521166443825, + "num_tokens": 64524527.0, + "step": 53680 + }, + { + "entropy": 1.8211177855730056, + "epoch": 0.166434302391815, + "grad_norm": 7.727509498596191, + "learning_rate": 6.20119379816029e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8501279547810554, + "num_tokens": 64536806.0, + "step": 53690 + }, + { + "entropy": 1.9093163907527924, + "epoch": 0.1664653015168647, + "grad_norm": 8.725934982299805, + "learning_rate": 6.20061636817774e-06, + "loss": 0.5555, + "mean_token_accuracy": 0.8283161029219628, + "num_tokens": 64547580.0, + "step": 53700 + }, + { + "entropy": 1.8534967467188834, + "epoch": 0.16649630064191437, + "grad_norm": 9.739397048950195, + "learning_rate": 6.200039099468959e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8432888746261596, + "num_tokens": 64559322.0, + "step": 53710 + }, + { + "entropy": 1.8829628586769105, + "epoch": 0.16652729976696407, + "grad_norm": 9.243392944335938, + "learning_rate": 6.1994619919588925e-06, + "loss": 0.5197, + "mean_token_accuracy": 0.8410766318440437, + "num_tokens": 64571411.0, + "step": 53720 + }, + { + "entropy": 1.850360631942749, + "epoch": 0.16655829889201376, + "grad_norm": 8.87844467163086, + "learning_rate": 6.19888504557253e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8418926939368248, + "num_tokens": 64583534.0, + "step": 53730 + }, + { + "entropy": 1.7772665143013, + "epoch": 0.16658929801706346, + "grad_norm": 10.492433547973633, + "learning_rate": 6.198308260234912e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8469656750559806, + "num_tokens": 64596994.0, + "step": 53740 + }, + { + "entropy": 1.8638063728809358, + "epoch": 0.16662029714211316, + "grad_norm": 4.035283088684082, + "learning_rate": 6.197731635871126e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.8372615680098534, + "num_tokens": 64608787.0, + "step": 53750 + }, + { + "entropy": 1.8116811349987985, + "epoch": 0.16665129626716285, + "grad_norm": 3.7760744094848633, + "learning_rate": 6.197155172406311e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8445352002978325, + "num_tokens": 64620766.0, + "step": 53760 + }, + { + "entropy": 1.764721181988716, + "epoch": 0.16668229539221255, + "grad_norm": 9.610373497009277, + "learning_rate": 6.19657886976565e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8372749388217926, + "num_tokens": 64634866.0, + "step": 53770 + }, + { + "entropy": 1.9135527536273003, + "epoch": 0.16671329451726224, + "grad_norm": 7.860201358795166, + "learning_rate": 6.196002727874382e-06, + "loss": 0.528, + "mean_token_accuracy": 0.8299287378787994, + "num_tokens": 64646883.0, + "step": 53780 + }, + { + "entropy": 1.9007146388292313, + "epoch": 0.16674429364231194, + "grad_norm": 3.8875370025634766, + "learning_rate": 6.195426746657789e-06, + "loss": 0.5254, + "mean_token_accuracy": 0.8316178724169732, + "num_tokens": 64658973.0, + "step": 53790 + }, + { + "entropy": 1.826033054292202, + "epoch": 0.16677529276736164, + "grad_norm": 8.549474716186523, + "learning_rate": 6.194850926041201e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.848523524403572, + "num_tokens": 64671112.0, + "step": 53800 + }, + { + "entropy": 1.9412501826882362, + "epoch": 0.16680629189241133, + "grad_norm": 8.226346015930176, + "learning_rate": 6.194275265950003e-06, + "loss": 0.5854, + "mean_token_accuracy": 0.817940565943718, + "num_tokens": 64682585.0, + "step": 53810 + }, + { + "entropy": 1.9826242715120315, + "epoch": 0.16683729101746103, + "grad_norm": 7.8780622482299805, + "learning_rate": 6.193699766309622e-06, + "loss": 0.5808, + "mean_token_accuracy": 0.8211043074727058, + "num_tokens": 64693614.0, + "step": 53820 + }, + { + "entropy": 1.9336616814136505, + "epoch": 0.16686829014251073, + "grad_norm": 10.512943267822266, + "learning_rate": 6.193124427045535e-06, + "loss": 0.576, + "mean_token_accuracy": 0.8250611051917076, + "num_tokens": 64705081.0, + "step": 53830 + }, + { + "entropy": 1.8315666154026986, + "epoch": 0.16689928926756042, + "grad_norm": 9.55489730834961, + "learning_rate": 6.1925492480832705e-06, + "loss": 0.5247, + "mean_token_accuracy": 0.8315412878990174, + "num_tokens": 64717880.0, + "step": 53840 + }, + { + "entropy": 1.878394363820553, + "epoch": 0.16693028839261012, + "grad_norm": 7.533677101135254, + "learning_rate": 6.1919742293484e-06, + "loss": 0.5741, + "mean_token_accuracy": 0.8287183776497841, + "num_tokens": 64728991.0, + "step": 53850 + }, + { + "entropy": 1.9339822947978973, + "epoch": 0.16696128751765982, + "grad_norm": 4.745457172393799, + "learning_rate": 6.1913993707665485e-06, + "loss": 0.5554, + "mean_token_accuracy": 0.8289982140064239, + "num_tokens": 64740494.0, + "step": 53860 + }, + { + "entropy": 1.8400120854377746, + "epoch": 0.16699228664270951, + "grad_norm": 8.681490898132324, + "learning_rate": 6.190824672263388e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8391780480742455, + "num_tokens": 64753203.0, + "step": 53870 + }, + { + "entropy": 1.9312397927045821, + "epoch": 0.1670232857677592, + "grad_norm": 9.676896095275879, + "learning_rate": 6.190250133764637e-06, + "loss": 0.5628, + "mean_token_accuracy": 0.832855261862278, + "num_tokens": 64764277.0, + "step": 53880 + }, + { + "entropy": 1.8353455513715744, + "epoch": 0.1670542848928089, + "grad_norm": 7.824866771697998, + "learning_rate": 6.189675755196064e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.854548205435276, + "num_tokens": 64776606.0, + "step": 53890 + }, + { + "entropy": 1.9409961223602294, + "epoch": 0.1670852840178586, + "grad_norm": 9.087993621826172, + "learning_rate": 6.189101536483484e-06, + "loss": 0.5796, + "mean_token_accuracy": 0.8235799849033356, + "num_tokens": 64788237.0, + "step": 53900 + }, + { + "entropy": 1.8919391304254531, + "epoch": 0.1671162831429083, + "grad_norm": 7.690661430358887, + "learning_rate": 6.188527477552761e-06, + "loss": 0.564, + "mean_token_accuracy": 0.832922050356865, + "num_tokens": 64800456.0, + "step": 53910 + }, + { + "entropy": 1.8101703599095345, + "epoch": 0.167147282267958, + "grad_norm": 8.9353609085083, + "learning_rate": 6.18795357832981e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8428716316819191, + "num_tokens": 64813777.0, + "step": 53920 + }, + { + "entropy": 1.8997833237051964, + "epoch": 0.1671782813930077, + "grad_norm": 5.414516448974609, + "learning_rate": 6.187379838740587e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.8327810063958168, + "num_tokens": 64826025.0, + "step": 53930 + }, + { + "entropy": 1.9362305417656898, + "epoch": 0.1672092805180574, + "grad_norm": 7.484461784362793, + "learning_rate": 6.186806258711105e-06, + "loss": 0.5841, + "mean_token_accuracy": 0.8236838817596436, + "num_tokens": 64837897.0, + "step": 53940 + }, + { + "entropy": 1.7711832091212272, + "epoch": 0.1672402796431071, + "grad_norm": 3.344655990600586, + "learning_rate": 6.186232838167419e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.847083292901516, + "num_tokens": 64852441.0, + "step": 53950 + }, + { + "entropy": 1.8284263715147973, + "epoch": 0.16727127876815676, + "grad_norm": 7.023416042327881, + "learning_rate": 6.185659577035632e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8410525426268578, + "num_tokens": 64865349.0, + "step": 53960 + }, + { + "entropy": 1.9532892093062402, + "epoch": 0.16730227789320645, + "grad_norm": 7.449609756469727, + "learning_rate": 6.185086475241898e-06, + "loss": 0.5544, + "mean_token_accuracy": 0.8271322563290596, + "num_tokens": 64876306.0, + "step": 53970 + }, + { + "entropy": 1.943205328285694, + "epoch": 0.16733327701825615, + "grad_norm": 9.61139965057373, + "learning_rate": 6.184513532712416e-06, + "loss": 0.5391, + "mean_token_accuracy": 0.8366443976759911, + "num_tokens": 64888292.0, + "step": 53980 + }, + { + "entropy": 1.9256486117839813, + "epoch": 0.16736427614330585, + "grad_norm": 5.424153804779053, + "learning_rate": 6.183940749373436e-06, + "loss": 0.5322, + "mean_token_accuracy": 0.836666439473629, + "num_tokens": 64899609.0, + "step": 53990 + }, + { + "entropy": 1.9205065920948983, + "epoch": 0.16739527526835554, + "grad_norm": 8.4097900390625, + "learning_rate": 6.1833681251512516e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8369838654994964, + "num_tokens": 64911133.0, + "step": 54000 + }, + { + "entropy": 1.837955741584301, + "epoch": 0.16742627439340524, + "grad_norm": 4.225302696228027, + "learning_rate": 6.182795659972208e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8357640966773033, + "num_tokens": 64923539.0, + "step": 54010 + }, + { + "entropy": 1.87107597514987, + "epoch": 0.16745727351845494, + "grad_norm": 9.220149993896484, + "learning_rate": 6.182223353762697e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8416076585650444, + "num_tokens": 64935263.0, + "step": 54020 + }, + { + "entropy": 1.8615114361047744, + "epoch": 0.16748827264350463, + "grad_norm": 8.91309928894043, + "learning_rate": 6.181651206449155e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8517223030328751, + "num_tokens": 64947376.0, + "step": 54030 + }, + { + "entropy": 1.8877154156565665, + "epoch": 0.16751927176855433, + "grad_norm": 4.553346633911133, + "learning_rate": 6.181079217958073e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8457445085048676, + "num_tokens": 64959541.0, + "step": 54040 + }, + { + "entropy": 1.8997020781040193, + "epoch": 0.16755027089360403, + "grad_norm": 9.368854522705078, + "learning_rate": 6.180507388215983e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8364048600196838, + "num_tokens": 64972259.0, + "step": 54050 + }, + { + "entropy": 1.8942409604787827, + "epoch": 0.16758127001865372, + "grad_norm": 8.198491096496582, + "learning_rate": 6.1799357171494655e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8378923922777176, + "num_tokens": 64983781.0, + "step": 54060 + }, + { + "entropy": 1.8540250420570374, + "epoch": 0.16761226914370342, + "grad_norm": 10.47086238861084, + "learning_rate": 6.179364204685151e-06, + "loss": 0.5252, + "mean_token_accuracy": 0.8316616043448448, + "num_tokens": 64996187.0, + "step": 54070 + }, + { + "entropy": 1.9425106346607208, + "epoch": 0.16764326826875312, + "grad_norm": 9.815423965454102, + "learning_rate": 6.1787928507497175e-06, + "loss": 0.5643, + "mean_token_accuracy": 0.8328223183751107, + "num_tokens": 65007642.0, + "step": 54080 + }, + { + "entropy": 1.7625815600156785, + "epoch": 0.1676742673938028, + "grad_norm": 4.722161769866943, + "learning_rate": 6.178221655269889e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.851328319311142, + "num_tokens": 65021921.0, + "step": 54090 + }, + { + "entropy": 1.877830518782139, + "epoch": 0.1677052665188525, + "grad_norm": 9.134954452514648, + "learning_rate": 6.177650618172437e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8438004121184349, + "num_tokens": 65033368.0, + "step": 54100 + }, + { + "entropy": 1.8599343091249465, + "epoch": 0.1677362656439022, + "grad_norm": 10.030871391296387, + "learning_rate": 6.177079739384181e-06, + "loss": 0.5236, + "mean_token_accuracy": 0.8409288361668587, + "num_tokens": 65045878.0, + "step": 54110 + }, + { + "entropy": 1.7290209725499153, + "epoch": 0.1677672647689519, + "grad_norm": 4.038937568664551, + "learning_rate": 6.176509018831986e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8499686598777771, + "num_tokens": 65060347.0, + "step": 54120 + }, + { + "entropy": 1.8978177532553673, + "epoch": 0.1677982638940016, + "grad_norm": 8.696316719055176, + "learning_rate": 6.175938456442767e-06, + "loss": 0.5338, + "mean_token_accuracy": 0.8306406125426292, + "num_tokens": 65071304.0, + "step": 54130 + }, + { + "entropy": 1.8237817287445068, + "epoch": 0.1678292630190513, + "grad_norm": 5.508317947387695, + "learning_rate": 6.175368052143486e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.84134771078825, + "num_tokens": 65083895.0, + "step": 54140 + }, + { + "entropy": 1.8195566333830357, + "epoch": 0.167860262144101, + "grad_norm": 3.8358309268951416, + "learning_rate": 6.174797805861148e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8476643204689026, + "num_tokens": 65096233.0, + "step": 54150 + }, + { + "entropy": 1.8712603464722632, + "epoch": 0.1678912612691507, + "grad_norm": 8.841750144958496, + "learning_rate": 6.174227717522813e-06, + "loss": 0.5239, + "mean_token_accuracy": 0.834831403195858, + "num_tokens": 65108051.0, + "step": 54160 + }, + { + "entropy": 1.8733776569366456, + "epoch": 0.16792226039420038, + "grad_norm": 7.8258280754089355, + "learning_rate": 6.173657787055579e-06, + "loss": 0.5251, + "mean_token_accuracy": 0.8338091254234314, + "num_tokens": 65120148.0, + "step": 54170 + }, + { + "entropy": 1.7442950963974, + "epoch": 0.16795325951925008, + "grad_norm": 9.016060829162598, + "learning_rate": 6.173088014386599e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.85818080753088, + "num_tokens": 65133990.0, + "step": 54180 + }, + { + "entropy": 1.8510777726769447, + "epoch": 0.16798425864429978, + "grad_norm": 4.038900852203369, + "learning_rate": 6.1725183994430695e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8405930832028389, + "num_tokens": 65145866.0, + "step": 54190 + }, + { + "entropy": 1.874160850048065, + "epoch": 0.16801525776934945, + "grad_norm": 9.50784969329834, + "learning_rate": 6.1719489421522305e-06, + "loss": 0.5411, + "mean_token_accuracy": 0.8350860238075256, + "num_tokens": 65157550.0, + "step": 54200 + }, + { + "entropy": 1.8412716314196587, + "epoch": 0.16804625689439914, + "grad_norm": 4.003512859344482, + "learning_rate": 6.1713796424413765e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8513872250914574, + "num_tokens": 65169597.0, + "step": 54210 + }, + { + "entropy": 1.9005790546536445, + "epoch": 0.16807725601944884, + "grad_norm": 9.995362281799316, + "learning_rate": 6.170810500237844e-06, + "loss": 0.5528, + "mean_token_accuracy": 0.8309820532798767, + "num_tokens": 65180854.0, + "step": 54220 + }, + { + "entropy": 1.8263572067022324, + "epoch": 0.16810825514449854, + "grad_norm": 7.676513195037842, + "learning_rate": 6.170241515469018e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8502917841076851, + "num_tokens": 65192720.0, + "step": 54230 + }, + { + "entropy": 1.832256343960762, + "epoch": 0.16813925426954823, + "grad_norm": 4.75320291519165, + "learning_rate": 6.1696726880623285e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8385294854640961, + "num_tokens": 65204655.0, + "step": 54240 + }, + { + "entropy": 1.8810152530670166, + "epoch": 0.16817025339459793, + "grad_norm": 9.239448547363281, + "learning_rate": 6.1691040179452545e-06, + "loss": 0.5374, + "mean_token_accuracy": 0.832660236954689, + "num_tokens": 65216115.0, + "step": 54250 + }, + { + "entropy": 1.7937642320990563, + "epoch": 0.16820125251964763, + "grad_norm": 8.329822540283203, + "learning_rate": 6.168535505045321e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8271018177270889, + "num_tokens": 65229542.0, + "step": 54260 + }, + { + "entropy": 1.8577524468302726, + "epoch": 0.16823225164469732, + "grad_norm": 3.8464255332946777, + "learning_rate": 6.1679671492901005e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8379319593310356, + "num_tokens": 65241000.0, + "step": 54270 + }, + { + "entropy": 1.8444037914276123, + "epoch": 0.16826325076974702, + "grad_norm": 9.02126407623291, + "learning_rate": 6.167398950607211e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8464455410838128, + "num_tokens": 65253095.0, + "step": 54280 + }, + { + "entropy": 1.8382619485259055, + "epoch": 0.16829424989479672, + "grad_norm": 8.693649291992188, + "learning_rate": 6.166830908924317e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.8406226679682731, + "num_tokens": 65265949.0, + "step": 54290 + }, + { + "entropy": 1.9115242898464202, + "epoch": 0.1683252490198464, + "grad_norm": 3.702934741973877, + "learning_rate": 6.16626302416913e-06, + "loss": 0.5333, + "mean_token_accuracy": 0.833556392788887, + "num_tokens": 65277257.0, + "step": 54300 + }, + { + "entropy": 1.8722372770309448, + "epoch": 0.1683562481448961, + "grad_norm": 9.893916130065918, + "learning_rate": 6.16569529626941e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8460499599575997, + "num_tokens": 65288847.0, + "step": 54310 + }, + { + "entropy": 1.8180507212877273, + "epoch": 0.1683872472699458, + "grad_norm": 9.089763641357422, + "learning_rate": 6.165127725152958e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8375856161117554, + "num_tokens": 65301059.0, + "step": 54320 + }, + { + "entropy": 1.8814466312527656, + "epoch": 0.1684182463949955, + "grad_norm": 8.537693977355957, + "learning_rate": 6.1645603107476316e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8440789937973022, + "num_tokens": 65313029.0, + "step": 54330 + }, + { + "entropy": 1.8159365728497505, + "epoch": 0.1684492455200452, + "grad_norm": 3.978240489959717, + "learning_rate": 6.163993052981323e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8402403250336647, + "num_tokens": 65325995.0, + "step": 54340 + }, + { + "entropy": 1.8512892082333565, + "epoch": 0.1684802446450949, + "grad_norm": 3.7935879230499268, + "learning_rate": 6.163425951781979e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.8360736206173897, + "num_tokens": 65338118.0, + "step": 54350 + }, + { + "entropy": 1.802795946598053, + "epoch": 0.1685112437701446, + "grad_norm": 8.774079322814941, + "learning_rate": 6.162859007077591e-06, + "loss": 0.5186, + "mean_token_accuracy": 0.8323176607489586, + "num_tokens": 65350124.0, + "step": 54360 + }, + { + "entropy": 1.9113412827253342, + "epoch": 0.1685422428951943, + "grad_norm": 9.643198013305664, + "learning_rate": 6.162292218796194e-06, + "loss": 0.6128, + "mean_token_accuracy": 0.821797750890255, + "num_tokens": 65361216.0, + "step": 54370 + }, + { + "entropy": 1.880385261774063, + "epoch": 0.16857324202024399, + "grad_norm": 4.427012920379639, + "learning_rate": 6.161725586865874e-06, + "loss": 0.514, + "mean_token_accuracy": 0.8347649067640305, + "num_tokens": 65372951.0, + "step": 54380 + }, + { + "entropy": 1.8761203557252883, + "epoch": 0.16860424114529368, + "grad_norm": 7.965682506561279, + "learning_rate": 6.1611591112147576e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.8341512382030487, + "num_tokens": 65384596.0, + "step": 54390 + }, + { + "entropy": 1.9145608723163605, + "epoch": 0.16863524027034338, + "grad_norm": 9.253949165344238, + "learning_rate": 6.160592791771022e-06, + "loss": 0.5824, + "mean_token_accuracy": 0.8162755355238914, + "num_tokens": 65396034.0, + "step": 54400 + }, + { + "entropy": 1.831618282198906, + "epoch": 0.16866623939539307, + "grad_norm": 8.794281005859375, + "learning_rate": 6.16002662846289e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8415708675980568, + "num_tokens": 65408333.0, + "step": 54410 + }, + { + "entropy": 1.8698890820145606, + "epoch": 0.16869723852044277, + "grad_norm": 9.203094482421875, + "learning_rate": 6.159460621218628e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.8259828209877014, + "num_tokens": 65420329.0, + "step": 54420 + }, + { + "entropy": 1.8370443254709243, + "epoch": 0.16872823764549247, + "grad_norm": 4.42335319519043, + "learning_rate": 6.158894769966554e-06, + "loss": 0.51, + "mean_token_accuracy": 0.8294874534010888, + "num_tokens": 65432554.0, + "step": 54430 + }, + { + "entropy": 1.929424050450325, + "epoch": 0.16875923677054216, + "grad_norm": 8.103418350219727, + "learning_rate": 6.158329074635024e-06, + "loss": 0.6025, + "mean_token_accuracy": 0.8246657729148865, + "num_tokens": 65443420.0, + "step": 54440 + }, + { + "entropy": 1.9033832669258117, + "epoch": 0.16879023589559183, + "grad_norm": 11.029186248779297, + "learning_rate": 6.157763535152448e-06, + "loss": 0.5328, + "mean_token_accuracy": 0.8389760866761208, + "num_tokens": 65455561.0, + "step": 54450 + }, + { + "entropy": 1.8955723971128464, + "epoch": 0.16882123502064153, + "grad_norm": 10.687283515930176, + "learning_rate": 6.1571981514472765e-06, + "loss": 0.5405, + "mean_token_accuracy": 0.8430651158094407, + "num_tokens": 65467045.0, + "step": 54460 + }, + { + "entropy": 1.9213042184710503, + "epoch": 0.16885223414569123, + "grad_norm": 8.316679000854492, + "learning_rate": 6.156632923448008e-06, + "loss": 0.5787, + "mean_token_accuracy": 0.8268252417445183, + "num_tokens": 65479119.0, + "step": 54470 + }, + { + "entropy": 1.8560591742396355, + "epoch": 0.16888323327074092, + "grad_norm": 4.520045757293701, + "learning_rate": 6.156067851083189e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8373791992664337, + "num_tokens": 65491055.0, + "step": 54480 + }, + { + "entropy": 1.8120755463838578, + "epoch": 0.16891423239579062, + "grad_norm": 9.259931564331055, + "learning_rate": 6.1555029342814085e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8388746708631516, + "num_tokens": 65503547.0, + "step": 54490 + }, + { + "entropy": 1.8446748018264771, + "epoch": 0.16894523152084032, + "grad_norm": 5.609341144561768, + "learning_rate": 6.154938172971303e-06, + "loss": 0.517, + "mean_token_accuracy": 0.8386319100856781, + "num_tokens": 65516410.0, + "step": 54500 + }, + { + "entropy": 1.7979810684919357, + "epoch": 0.16897623064589, + "grad_norm": 8.738170623779297, + "learning_rate": 6.154373567081555e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8370955526828766, + "num_tokens": 65529389.0, + "step": 54510 + }, + { + "entropy": 1.9188962578773499, + "epoch": 0.1690072297709397, + "grad_norm": 8.479401588439941, + "learning_rate": 6.15380911654089e-06, + "loss": 0.536, + "mean_token_accuracy": 0.8272537529468537, + "num_tokens": 65541159.0, + "step": 54520 + }, + { + "entropy": 1.8414244174957275, + "epoch": 0.1690382288959894, + "grad_norm": 9.82351016998291, + "learning_rate": 6.153244821278084e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8392342895269393, + "num_tokens": 65553128.0, + "step": 54530 + }, + { + "entropy": 1.9202020570635796, + "epoch": 0.1690692280210391, + "grad_norm": 7.951665878295898, + "learning_rate": 6.152680681221957e-06, + "loss": 0.5564, + "mean_token_accuracy": 0.8311819225549698, + "num_tokens": 65564486.0, + "step": 54540 + }, + { + "entropy": 1.7439219117164613, + "epoch": 0.1691002271460888, + "grad_norm": 7.696526527404785, + "learning_rate": 6.1521166963013704e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8509344890713691, + "num_tokens": 65578178.0, + "step": 54550 + }, + { + "entropy": 1.8539911389350892, + "epoch": 0.1691312262711385, + "grad_norm": 8.005914688110352, + "learning_rate": 6.151552866445237e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8328571185469628, + "num_tokens": 65590436.0, + "step": 54560 + }, + { + "entropy": 1.7372994154691697, + "epoch": 0.1691622253961882, + "grad_norm": 8.57450008392334, + "learning_rate": 6.150989191582515e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8435987919569016, + "num_tokens": 65604247.0, + "step": 54570 + }, + { + "entropy": 1.8149765402078628, + "epoch": 0.1691932245212379, + "grad_norm": 9.524360656738281, + "learning_rate": 6.150425671642202e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8407787919044495, + "num_tokens": 65616338.0, + "step": 54580 + }, + { + "entropy": 1.7159005388617516, + "epoch": 0.16922422364628759, + "grad_norm": 8.246034622192383, + "learning_rate": 6.1498623065533485e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8674893081188202, + "num_tokens": 65629499.0, + "step": 54590 + }, + { + "entropy": 1.7766979977488517, + "epoch": 0.16925522277133728, + "grad_norm": 3.937098741531372, + "learning_rate": 6.1492990962450465e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8527488321065902, + "num_tokens": 65641287.0, + "step": 54600 + }, + { + "entropy": 1.8135869204998016, + "epoch": 0.16928622189638698, + "grad_norm": 9.094282150268555, + "learning_rate": 6.148736040646432e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.8407742142677307, + "num_tokens": 65653614.0, + "step": 54610 + }, + { + "entropy": 1.7666838884353637, + "epoch": 0.16931722102143668, + "grad_norm": 8.157979011535645, + "learning_rate": 6.148173139686692e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.845091213285923, + "num_tokens": 65666332.0, + "step": 54620 + }, + { + "entropy": 1.7945738911628724, + "epoch": 0.16934822014648637, + "grad_norm": 4.526710510253906, + "learning_rate": 6.147610393295055e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8315363183617592, + "num_tokens": 65679667.0, + "step": 54630 + }, + { + "entropy": 1.8847228914499283, + "epoch": 0.16937921927153607, + "grad_norm": 8.596298217773438, + "learning_rate": 6.147047801400793e-06, + "loss": 0.569, + "mean_token_accuracy": 0.8253517493605613, + "num_tokens": 65690829.0, + "step": 54640 + }, + { + "entropy": 1.9206166476011277, + "epoch": 0.16941021839658577, + "grad_norm": 7.514388561248779, + "learning_rate": 6.146485363933227e-06, + "loss": 0.5221, + "mean_token_accuracy": 0.8408697739243507, + "num_tokens": 65702308.0, + "step": 54650 + }, + { + "entropy": 1.8065496653318405, + "epoch": 0.16944121752163546, + "grad_norm": 8.206125259399414, + "learning_rate": 6.145923080821722e-06, + "loss": 0.5337, + "mean_token_accuracy": 0.8288014903664589, + "num_tokens": 65715353.0, + "step": 54660 + }, + { + "entropy": 1.8215434283018113, + "epoch": 0.16947221664668516, + "grad_norm": 10.296819686889648, + "learning_rate": 6.145360951995688e-06, + "loss": 0.5272, + "mean_token_accuracy": 0.8371994882822037, + "num_tokens": 65727578.0, + "step": 54670 + }, + { + "entropy": 1.8798654437065125, + "epoch": 0.16950321577173486, + "grad_norm": 8.519831657409668, + "learning_rate": 6.144798977384581e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8545126497745514, + "num_tokens": 65739521.0, + "step": 54680 + }, + { + "entropy": 1.9068218991160393, + "epoch": 0.16953421489678455, + "grad_norm": 7.611881256103516, + "learning_rate": 6.144237156917899e-06, + "loss": 0.5547, + "mean_token_accuracy": 0.8298189043998718, + "num_tokens": 65751188.0, + "step": 54690 + }, + { + "entropy": 1.8502942398190498, + "epoch": 0.16956521402183422, + "grad_norm": 8.908893585205078, + "learning_rate": 6.143675490525191e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8421571254730225, + "num_tokens": 65763519.0, + "step": 54700 + }, + { + "entropy": 1.926678617298603, + "epoch": 0.16959621314688392, + "grad_norm": 8.78310489654541, + "learning_rate": 6.143113978136046e-06, + "loss": 0.5346, + "mean_token_accuracy": 0.8322173491120338, + "num_tokens": 65775010.0, + "step": 54710 + }, + { + "entropy": 1.9700055122375488, + "epoch": 0.16962721227193361, + "grad_norm": 9.067830085754395, + "learning_rate": 6.1425526196801e-06, + "loss": 0.5819, + "mean_token_accuracy": 0.824844454228878, + "num_tokens": 65785503.0, + "step": 54720 + }, + { + "entropy": 1.8520893216133119, + "epoch": 0.1696582113969833, + "grad_norm": 4.218345642089844, + "learning_rate": 6.1419914150870315e-06, + "loss": 0.5206, + "mean_token_accuracy": 0.8238402858376503, + "num_tokens": 65799211.0, + "step": 54730 + }, + { + "entropy": 1.9024053156375884, + "epoch": 0.169689210522033, + "grad_norm": 5.589046001434326, + "learning_rate": 6.141430364286569e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8389298126101494, + "num_tokens": 65811181.0, + "step": 54740 + }, + { + "entropy": 1.8264511436223985, + "epoch": 0.1697202096470827, + "grad_norm": 9.439501762390137, + "learning_rate": 6.140869467208483e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.839516569674015, + "num_tokens": 65824036.0, + "step": 54750 + }, + { + "entropy": 1.9636918157339096, + "epoch": 0.1697512087721324, + "grad_norm": 8.578853607177734, + "learning_rate": 6.140308723782587e-06, + "loss": 0.5827, + "mean_token_accuracy": 0.8268523201346397, + "num_tokens": 65834646.0, + "step": 54760 + }, + { + "entropy": 1.8637161239981652, + "epoch": 0.1697822078971821, + "grad_norm": 8.782843589782715, + "learning_rate": 6.1397481339387444e-06, + "loss": 0.5552, + "mean_token_accuracy": 0.8350477933883667, + "num_tokens": 65846256.0, + "step": 54770 + }, + { + "entropy": 1.8629076793789863, + "epoch": 0.1698132070222318, + "grad_norm": 8.186110496520996, + "learning_rate": 6.139187697606855e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.84441829174757, + "num_tokens": 65858192.0, + "step": 54780 + }, + { + "entropy": 1.8901616916060449, + "epoch": 0.1698442061472815, + "grad_norm": 9.074200630187988, + "learning_rate": 6.138627414716874e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.8320909798145294, + "num_tokens": 65869497.0, + "step": 54790 + }, + { + "entropy": 1.8830202847719193, + "epoch": 0.1698752052723312, + "grad_norm": 8.882548332214355, + "learning_rate": 6.138067285198796e-06, + "loss": 0.5194, + "mean_token_accuracy": 0.8349261358380318, + "num_tokens": 65881627.0, + "step": 54800 + }, + { + "entropy": 1.8568723633885384, + "epoch": 0.16990620439738088, + "grad_norm": 8.651144981384277, + "learning_rate": 6.1375073089826556e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8417845577001571, + "num_tokens": 65894013.0, + "step": 54810 + }, + { + "entropy": 1.6487878575921058, + "epoch": 0.16993720352243058, + "grad_norm": 2.7366909980773926, + "learning_rate": 6.13694748599854e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.862517063319683, + "num_tokens": 65909433.0, + "step": 54820 + }, + { + "entropy": 1.9101332992315292, + "epoch": 0.16996820264748028, + "grad_norm": 9.56188678741455, + "learning_rate": 6.136387816176578e-06, + "loss": 0.596, + "mean_token_accuracy": 0.8215406507253646, + "num_tokens": 65921625.0, + "step": 54830 + }, + { + "entropy": 1.7827743321657181, + "epoch": 0.16999920177252997, + "grad_norm": 8.454669952392578, + "learning_rate": 6.135828299446942e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8514051124453544, + "num_tokens": 65934681.0, + "step": 54840 + }, + { + "entropy": 1.8705067232251167, + "epoch": 0.17003020089757967, + "grad_norm": 8.229545593261719, + "learning_rate": 6.135268935739851e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.847279503941536, + "num_tokens": 65946087.0, + "step": 54850 + }, + { + "entropy": 1.7427136436104775, + "epoch": 0.17006120002262937, + "grad_norm": 10.23863697052002, + "learning_rate": 6.134709724985567e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8420625299215316, + "num_tokens": 65959753.0, + "step": 54860 + }, + { + "entropy": 1.8604324340820313, + "epoch": 0.17009219914767906, + "grad_norm": 9.759727478027344, + "learning_rate": 6.134150667114395e-06, + "loss": 0.5228, + "mean_token_accuracy": 0.8294590935111046, + "num_tokens": 65971696.0, + "step": 54870 + }, + { + "entropy": 1.8827021196484566, + "epoch": 0.17012319827272876, + "grad_norm": 8.212512969970703, + "learning_rate": 6.133591762056689e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8433489888906479, + "num_tokens": 65983076.0, + "step": 54880 + }, + { + "entropy": 1.8750552967190743, + "epoch": 0.17015419739777846, + "grad_norm": 9.474791526794434, + "learning_rate": 6.133033009742842e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8393171012401581, + "num_tokens": 65994772.0, + "step": 54890 + }, + { + "entropy": 1.7940944850444793, + "epoch": 0.17018519652282815, + "grad_norm": 4.888198375701904, + "learning_rate": 6.132474410103298e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8466850072145462, + "num_tokens": 66007645.0, + "step": 54900 + }, + { + "entropy": 1.8010751977562904, + "epoch": 0.17021619564787785, + "grad_norm": 8.661337852478027, + "learning_rate": 6.131915963068537e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8513104304671287, + "num_tokens": 66020326.0, + "step": 54910 + }, + { + "entropy": 1.7909870207309724, + "epoch": 0.17024719477292755, + "grad_norm": 7.634841442108154, + "learning_rate": 6.13135766856909e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8531160518527031, + "num_tokens": 66033075.0, + "step": 54920 + }, + { + "entropy": 1.8581923857331275, + "epoch": 0.17027819389797724, + "grad_norm": 8.779280662536621, + "learning_rate": 6.130799526535529e-06, + "loss": 0.5948, + "mean_token_accuracy": 0.8206439360976219, + "num_tokens": 66046620.0, + "step": 54930 + }, + { + "entropy": 1.8611151441931724, + "epoch": 0.1703091930230269, + "grad_norm": 4.603653430938721, + "learning_rate": 6.1302415368984725e-06, + "loss": 0.5261, + "mean_token_accuracy": 0.8282294601202012, + "num_tokens": 66059390.0, + "step": 54940 + }, + { + "entropy": 1.9167177721858024, + "epoch": 0.1703401921480766, + "grad_norm": 3.782064199447632, + "learning_rate": 6.129683699588581e-06, + "loss": 0.5186, + "mean_token_accuracy": 0.8340412959456444, + "num_tokens": 66071324.0, + "step": 54950 + }, + { + "entropy": 1.7558236002922059, + "epoch": 0.1703711912731263, + "grad_norm": 3.8991594314575195, + "learning_rate": 6.129126014536561e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8545200228691101, + "num_tokens": 66084375.0, + "step": 54960 + }, + { + "entropy": 1.9062550529837607, + "epoch": 0.170402190398176, + "grad_norm": 3.855738639831543, + "learning_rate": 6.12856848167316e-06, + "loss": 0.5194, + "mean_token_accuracy": 0.8293624669313431, + "num_tokens": 66096293.0, + "step": 54970 + }, + { + "entropy": 1.9391276210546493, + "epoch": 0.1704331895232257, + "grad_norm": 8.4966402053833, + "learning_rate": 6.1280111009291744e-06, + "loss": 0.5505, + "mean_token_accuracy": 0.8367387875914574, + "num_tokens": 66106920.0, + "step": 54980 + }, + { + "entropy": 1.8991558268666267, + "epoch": 0.1704641886482754, + "grad_norm": 5.249369144439697, + "learning_rate": 6.127453872235442e-06, + "loss": 0.5751, + "mean_token_accuracy": 0.8212674275040627, + "num_tokens": 66118337.0, + "step": 54990 + }, + { + "entropy": 1.8485107704997064, + "epoch": 0.1704951877733251, + "grad_norm": 9.159661293029785, + "learning_rate": 6.1268967955228405e-06, + "loss": 0.5451, + "mean_token_accuracy": 0.8363189935684204, + "num_tokens": 66130596.0, + "step": 55000 + }, + { + "entropy": 1.851409375667572, + "epoch": 0.1705261868983748, + "grad_norm": 8.441533088684082, + "learning_rate": 6.126339870722301e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8494085937738418, + "num_tokens": 66142188.0, + "step": 55010 + }, + { + "entropy": 1.9319586962461472, + "epoch": 0.17055718602342448, + "grad_norm": 9.650931358337402, + "learning_rate": 6.12578309776479e-06, + "loss": 0.5737, + "mean_token_accuracy": 0.8261142611503601, + "num_tokens": 66153013.0, + "step": 55020 + }, + { + "entropy": 1.8354326650500297, + "epoch": 0.17058818514847418, + "grad_norm": 3.9305970668792725, + "learning_rate": 6.125226476581324e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8330473214387893, + "num_tokens": 66165874.0, + "step": 55030 + }, + { + "entropy": 1.9324169605970383, + "epoch": 0.17061918427352388, + "grad_norm": 4.871352672576904, + "learning_rate": 6.124670007102958e-06, + "loss": 0.5163, + "mean_token_accuracy": 0.8405327409505844, + "num_tokens": 66176898.0, + "step": 55040 + }, + { + "entropy": 1.8468404799699782, + "epoch": 0.17065018339857357, + "grad_norm": 9.20655632019043, + "learning_rate": 6.124113689260793e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8357664123177528, + "num_tokens": 66188431.0, + "step": 55050 + }, + { + "entropy": 1.7879943639039992, + "epoch": 0.17068118252362327, + "grad_norm": 3.959418296813965, + "learning_rate": 6.123557522985977e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8417293280363083, + "num_tokens": 66201593.0, + "step": 55060 + }, + { + "entropy": 1.924604320526123, + "epoch": 0.17071218164867297, + "grad_norm": 8.743783950805664, + "learning_rate": 6.123001508209696e-06, + "loss": 0.6023, + "mean_token_accuracy": 0.8270179316401481, + "num_tokens": 66212878.0, + "step": 55070 + }, + { + "entropy": 1.9165123641490935, + "epoch": 0.17074318077372266, + "grad_norm": 7.5204949378967285, + "learning_rate": 6.122445644863187e-06, + "loss": 0.5624, + "mean_token_accuracy": 0.8256003141403199, + "num_tokens": 66224720.0, + "step": 55080 + }, + { + "entropy": 1.889313419163227, + "epoch": 0.17077417989877236, + "grad_norm": 8.590926170349121, + "learning_rate": 6.121889932877719e-06, + "loss": 0.5542, + "mean_token_accuracy": 0.8257635533809662, + "num_tokens": 66236384.0, + "step": 55090 + }, + { + "entropy": 1.870141127705574, + "epoch": 0.17080517902382206, + "grad_norm": 3.6843600273132324, + "learning_rate": 6.121334372184618e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8345419481396675, + "num_tokens": 66248946.0, + "step": 55100 + }, + { + "entropy": 1.8741691306233406, + "epoch": 0.17083617814887175, + "grad_norm": 9.643596649169922, + "learning_rate": 6.120778962715248e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8437133207917213, + "num_tokens": 66260837.0, + "step": 55110 + }, + { + "entropy": 1.8679534777998925, + "epoch": 0.17086717727392145, + "grad_norm": 4.9696149826049805, + "learning_rate": 6.120223704401012e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8462657079100608, + "num_tokens": 66272827.0, + "step": 55120 + }, + { + "entropy": 1.8148210868239403, + "epoch": 0.17089817639897115, + "grad_norm": 8.890388488769531, + "learning_rate": 6.119668597173365e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8454957276582717, + "num_tokens": 66285297.0, + "step": 55130 + }, + { + "entropy": 1.8707555949687957, + "epoch": 0.17092917552402084, + "grad_norm": 8.708013534545898, + "learning_rate": 6.119113640963797e-06, + "loss": 0.5601, + "mean_token_accuracy": 0.8221472725272179, + "num_tokens": 66296618.0, + "step": 55140 + }, + { + "entropy": 1.8355353966355323, + "epoch": 0.17096017464907054, + "grad_norm": 9.65738296508789, + "learning_rate": 6.11855883570385e-06, + "loss": 0.5414, + "mean_token_accuracy": 0.8310455739498138, + "num_tokens": 66309122.0, + "step": 55150 + }, + { + "entropy": 1.8585526466369628, + "epoch": 0.17099117377412024, + "grad_norm": 9.257162094116211, + "learning_rate": 6.118004181325103e-06, + "loss": 0.5515, + "mean_token_accuracy": 0.8375745385885238, + "num_tokens": 66321433.0, + "step": 55160 + }, + { + "entropy": 1.8798829302191735, + "epoch": 0.17102217289916993, + "grad_norm": 8.34717845916748, + "learning_rate": 6.117449677759181e-06, + "loss": 0.5329, + "mean_token_accuracy": 0.8309325784444809, + "num_tokens": 66333576.0, + "step": 55170 + }, + { + "entropy": 1.9052462548017501, + "epoch": 0.17105317202421963, + "grad_norm": 9.19088363647461, + "learning_rate": 6.11689532493775e-06, + "loss": 0.5508, + "mean_token_accuracy": 0.8245049402117729, + "num_tokens": 66345424.0, + "step": 55180 + }, + { + "entropy": 1.8513251379132272, + "epoch": 0.1710841711492693, + "grad_norm": 4.772027969360352, + "learning_rate": 6.1163411227925265e-06, + "loss": 0.5349, + "mean_token_accuracy": 0.8387829244136811, + "num_tokens": 66357556.0, + "step": 55190 + }, + { + "entropy": 1.8194545328617096, + "epoch": 0.171115170274319, + "grad_norm": 8.34814453125, + "learning_rate": 6.11578707125526e-06, + "loss": 0.5304, + "mean_token_accuracy": 0.8422711014747619, + "num_tokens": 66370273.0, + "step": 55200 + }, + { + "entropy": 1.8819702237844467, + "epoch": 0.1711461693993687, + "grad_norm": 8.056477546691895, + "learning_rate": 6.1152331702577514e-06, + "loss": 0.5244, + "mean_token_accuracy": 0.8342429891228675, + "num_tokens": 66382365.0, + "step": 55210 + }, + { + "entropy": 1.898891557753086, + "epoch": 0.1711771685244184, + "grad_norm": 4.703845977783203, + "learning_rate": 6.114679419731841e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8460740655660629, + "num_tokens": 66393344.0, + "step": 55220 + }, + { + "entropy": 1.957570144534111, + "epoch": 0.17120816764946808, + "grad_norm": 7.900186061859131, + "learning_rate": 6.114125819609411e-06, + "loss": 0.5556, + "mean_token_accuracy": 0.8310376718640328, + "num_tokens": 66404439.0, + "step": 55230 + }, + { + "entropy": 1.8322395712137223, + "epoch": 0.17123916677451778, + "grad_norm": 3.6799228191375732, + "learning_rate": 6.113572369822391e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.8300682485103608, + "num_tokens": 66417136.0, + "step": 55240 + }, + { + "entropy": 1.8394572094082833, + "epoch": 0.17127016589956748, + "grad_norm": 4.595379829406738, + "learning_rate": 6.113019070302754e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8420197859406471, + "num_tokens": 66429039.0, + "step": 55250 + }, + { + "entropy": 1.8247755616903305, + "epoch": 0.17130116502461717, + "grad_norm": 7.622194290161133, + "learning_rate": 6.11246592098251e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8401867300271988, + "num_tokens": 66441004.0, + "step": 55260 + }, + { + "entropy": 1.858515314757824, + "epoch": 0.17133216414966687, + "grad_norm": 9.903068542480469, + "learning_rate": 6.111912921793715e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8486079826951027, + "num_tokens": 66452827.0, + "step": 55270 + }, + { + "entropy": 1.8595426589250565, + "epoch": 0.17136316327471657, + "grad_norm": 6.444674968719482, + "learning_rate": 6.111360072668473e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8341106563806534, + "num_tokens": 66464688.0, + "step": 55280 + }, + { + "entropy": 1.7985023483633995, + "epoch": 0.17139416239976626, + "grad_norm": 9.139451026916504, + "learning_rate": 6.110807373538924e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8438055410981178, + "num_tokens": 66478770.0, + "step": 55290 + }, + { + "entropy": 1.8379919603466988, + "epoch": 0.17142516152481596, + "grad_norm": 2.0652432441711426, + "learning_rate": 6.110254824337254e-06, + "loss": 0.5378, + "mean_token_accuracy": 0.8349084377288818, + "num_tokens": 66491604.0, + "step": 55300 + }, + { + "entropy": 1.8246984764933587, + "epoch": 0.17145616064986566, + "grad_norm": 4.141010284423828, + "learning_rate": 6.109702424995692e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8502539679408073, + "num_tokens": 66503887.0, + "step": 55310 + }, + { + "entropy": 1.7455498218536376, + "epoch": 0.17148715977491535, + "grad_norm": 7.454645156860352, + "learning_rate": 6.1091501754465084e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8398867219686508, + "num_tokens": 66516981.0, + "step": 55320 + }, + { + "entropy": 1.8381555840373038, + "epoch": 0.17151815889996505, + "grad_norm": 8.063959121704102, + "learning_rate": 6.10859807562202e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.8499990120530129, + "num_tokens": 66528982.0, + "step": 55330 + }, + { + "entropy": 1.9049826204776763, + "epoch": 0.17154915802501475, + "grad_norm": 8.288491249084473, + "learning_rate": 6.108046125454582e-06, + "loss": 0.5242, + "mean_token_accuracy": 0.8419345527887344, + "num_tokens": 66540768.0, + "step": 55340 + }, + { + "entropy": 1.8578782469034194, + "epoch": 0.17158015715006444, + "grad_norm": 7.979318618774414, + "learning_rate": 6.107494324876594e-06, + "loss": 0.5466, + "mean_token_accuracy": 0.8317851856350899, + "num_tokens": 66552308.0, + "step": 55350 + }, + { + "entropy": 1.8647135615348815, + "epoch": 0.17161115627511414, + "grad_norm": 4.911669731140137, + "learning_rate": 6.1069426738205e-06, + "loss": 0.5339, + "mean_token_accuracy": 0.8424048766493797, + "num_tokens": 66564209.0, + "step": 55360 + }, + { + "entropy": 1.9092617228627204, + "epoch": 0.17164215540016384, + "grad_norm": 9.604966163635254, + "learning_rate": 6.106391172218784e-06, + "loss": 0.5279, + "mean_token_accuracy": 0.8417677640914917, + "num_tokens": 66575424.0, + "step": 55370 + }, + { + "entropy": 1.8798686414957047, + "epoch": 0.17167315452521353, + "grad_norm": 9.60474681854248, + "learning_rate": 6.105839820003976e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8326731339097023, + "num_tokens": 66588461.0, + "step": 55380 + }, + { + "entropy": 1.787381762266159, + "epoch": 0.17170415365026323, + "grad_norm": 8.320876121520996, + "learning_rate": 6.105288617108646e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8443598464131356, + "num_tokens": 66601689.0, + "step": 55390 + }, + { + "entropy": 1.8882825881242753, + "epoch": 0.17173515277531293, + "grad_norm": 8.86194896697998, + "learning_rate": 6.104737563465406e-06, + "loss": 0.5774, + "mean_token_accuracy": 0.8265114605426789, + "num_tokens": 66613156.0, + "step": 55400 + }, + { + "entropy": 1.9447656571865082, + "epoch": 0.17176615190036262, + "grad_norm": 10.87563419342041, + "learning_rate": 6.104186659006913e-06, + "loss": 0.5485, + "mean_token_accuracy": 0.8329192861914635, + "num_tokens": 66623750.0, + "step": 55410 + }, + { + "entropy": 1.862581080198288, + "epoch": 0.17179715102541232, + "grad_norm": 9.020846366882324, + "learning_rate": 6.103635903665865e-06, + "loss": 0.5222, + "mean_token_accuracy": 0.8274956732988358, + "num_tokens": 66636578.0, + "step": 55420 + }, + { + "entropy": 1.8903913646936417, + "epoch": 0.17182815015046202, + "grad_norm": 8.09145736694336, + "learning_rate": 6.103085297375004e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.8357395201921463, + "num_tokens": 66648330.0, + "step": 55430 + }, + { + "entropy": 1.9172933578491211, + "epoch": 0.17185914927551169, + "grad_norm": 4.03477144241333, + "learning_rate": 6.102534840067114e-06, + "loss": 0.5521, + "mean_token_accuracy": 0.8339840278029442, + "num_tokens": 66660056.0, + "step": 55440 + }, + { + "entropy": 1.9062219202518462, + "epoch": 0.17189014840056138, + "grad_norm": 9.107460021972656, + "learning_rate": 6.101984531675016e-06, + "loss": 0.565, + "mean_token_accuracy": 0.833404429256916, + "num_tokens": 66670678.0, + "step": 55450 + }, + { + "entropy": 1.9296542406082153, + "epoch": 0.17192114752561108, + "grad_norm": 8.541874885559082, + "learning_rate": 6.1014343721315835e-06, + "loss": 0.517, + "mean_token_accuracy": 0.8412581771612168, + "num_tokens": 66680863.0, + "step": 55460 + }, + { + "entropy": 1.9104670375585555, + "epoch": 0.17195214665066078, + "grad_norm": 7.40362548828125, + "learning_rate": 6.1008843613697255e-06, + "loss": 0.5266, + "mean_token_accuracy": 0.8393293395638466, + "num_tokens": 66692889.0, + "step": 55470 + }, + { + "entropy": 1.8890238150954246, + "epoch": 0.17198314577571047, + "grad_norm": 9.017800331115723, + "learning_rate": 6.100334499322393e-06, + "loss": 0.5257, + "mean_token_accuracy": 0.834419809281826, + "num_tokens": 66705358.0, + "step": 55480 + }, + { + "entropy": 1.8500746071338654, + "epoch": 0.17201414490076017, + "grad_norm": 8.356595993041992, + "learning_rate": 6.099784785922585e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8483300238847733, + "num_tokens": 66717381.0, + "step": 55490 + }, + { + "entropy": 1.9085285305976867, + "epoch": 0.17204514402580987, + "grad_norm": 9.017643928527832, + "learning_rate": 6.0992352211033335e-06, + "loss": 0.5308, + "mean_token_accuracy": 0.8379713967442513, + "num_tokens": 66729410.0, + "step": 55500 + }, + { + "entropy": 1.914148934185505, + "epoch": 0.17207614315085956, + "grad_norm": 8.142106056213379, + "learning_rate": 6.098685804797724e-06, + "loss": 0.5254, + "mean_token_accuracy": 0.8404997661709785, + "num_tokens": 66740551.0, + "step": 55510 + }, + { + "entropy": 1.9194943860173226, + "epoch": 0.17210714227590926, + "grad_norm": 8.375265121459961, + "learning_rate": 6.098136536938873e-06, + "loss": 0.5537, + "mean_token_accuracy": 0.8302345380187035, + "num_tokens": 66751861.0, + "step": 55520 + }, + { + "entropy": 1.8494600921869278, + "epoch": 0.17213814140095895, + "grad_norm": 7.672314643859863, + "learning_rate": 6.097587417459949e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8483859553933144, + "num_tokens": 66764180.0, + "step": 55530 + }, + { + "entropy": 1.876507543027401, + "epoch": 0.17216914052600865, + "grad_norm": 8.333982467651367, + "learning_rate": 6.097038446294156e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8330820754170418, + "num_tokens": 66776554.0, + "step": 55540 + }, + { + "entropy": 1.8608839854598045, + "epoch": 0.17220013965105835, + "grad_norm": 8.950138092041016, + "learning_rate": 6.096489623374742e-06, + "loss": 0.5448, + "mean_token_accuracy": 0.8301695078611374, + "num_tokens": 66789000.0, + "step": 55550 + }, + { + "entropy": 1.871629747748375, + "epoch": 0.17223113877610804, + "grad_norm": 4.220654487609863, + "learning_rate": 6.095940948634997e-06, + "loss": 0.578, + "mean_token_accuracy": 0.8214164420962333, + "num_tokens": 66802167.0, + "step": 55560 + }, + { + "entropy": 1.8488945305347442, + "epoch": 0.17226213790115774, + "grad_norm": 4.729830265045166, + "learning_rate": 6.095392422008255e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8359545990824699, + "num_tokens": 66813727.0, + "step": 55570 + }, + { + "entropy": 1.8354024216532707, + "epoch": 0.17229313702620744, + "grad_norm": 8.174845695495605, + "learning_rate": 6.094844043427889e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8510124906897545, + "num_tokens": 66825754.0, + "step": 55580 + }, + { + "entropy": 1.8678600177168847, + "epoch": 0.17232413615125713, + "grad_norm": 11.34805679321289, + "learning_rate": 6.094295812827316e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8442059218883514, + "num_tokens": 66838613.0, + "step": 55590 + }, + { + "entropy": 1.8801333606243134, + "epoch": 0.17235513527630683, + "grad_norm": 8.798630714416504, + "learning_rate": 6.0937477301399924e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.8383255407214165, + "num_tokens": 66850523.0, + "step": 55600 + }, + { + "entropy": 1.863646823167801, + "epoch": 0.17238613440135653, + "grad_norm": 8.633447647094727, + "learning_rate": 6.093199795299421e-06, + "loss": 0.5343, + "mean_token_accuracy": 0.8378429308533668, + "num_tokens": 66863104.0, + "step": 55610 + }, + { + "entropy": 1.9260516792535782, + "epoch": 0.17241713352640622, + "grad_norm": 6.970504283905029, + "learning_rate": 6.092652008239141e-06, + "loss": 0.5621, + "mean_token_accuracy": 0.8314559891819954, + "num_tokens": 66874671.0, + "step": 55620 + }, + { + "entropy": 1.9149534597992897, + "epoch": 0.17244813265145592, + "grad_norm": 5.021518230438232, + "learning_rate": 6.0921043688927366e-06, + "loss": 0.53, + "mean_token_accuracy": 0.8325030341744423, + "num_tokens": 66886294.0, + "step": 55630 + }, + { + "entropy": 1.9480685591697693, + "epoch": 0.17247913177650562, + "grad_norm": 8.716386795043945, + "learning_rate": 6.091556877193834e-06, + "loss": 0.5884, + "mean_token_accuracy": 0.8272656366229058, + "num_tokens": 66897341.0, + "step": 55640 + }, + { + "entropy": 1.8431043431162835, + "epoch": 0.17251013090155531, + "grad_norm": 8.646214485168457, + "learning_rate": 6.091009533076101e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8376592069864273, + "num_tokens": 66909553.0, + "step": 55650 + }, + { + "entropy": 1.9310444086790084, + "epoch": 0.172541130026605, + "grad_norm": 7.217070579528809, + "learning_rate": 6.090462336473245e-06, + "loss": 0.6076, + "mean_token_accuracy": 0.8188234835863113, + "num_tokens": 66920698.0, + "step": 55660 + }, + { + "entropy": 1.7853051990270614, + "epoch": 0.1725721291516547, + "grad_norm": 11.338160514831543, + "learning_rate": 6.089915287319018e-06, + "loss": 0.5264, + "mean_token_accuracy": 0.8323128372430801, + "num_tokens": 66933403.0, + "step": 55670 + }, + { + "entropy": 1.8359568476676942, + "epoch": 0.17260312827670438, + "grad_norm": 4.960665702819824, + "learning_rate": 6.089368385547212e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.837063156068325, + "num_tokens": 66945928.0, + "step": 55680 + }, + { + "entropy": 1.7752636238932609, + "epoch": 0.17263412740175407, + "grad_norm": 9.67094612121582, + "learning_rate": 6.088821631091659e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8366348952054977, + "num_tokens": 66959462.0, + "step": 55690 + }, + { + "entropy": 1.7888485848903657, + "epoch": 0.17266512652680377, + "grad_norm": 9.076850891113281, + "learning_rate": 6.088275023886237e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8395483061671257, + "num_tokens": 66972656.0, + "step": 55700 + }, + { + "entropy": 1.8987372756004333, + "epoch": 0.17269612565185347, + "grad_norm": 7.72656774520874, + "learning_rate": 6.087728563864862e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8445681780576706, + "num_tokens": 66983938.0, + "step": 55710 + }, + { + "entropy": 1.894448073208332, + "epoch": 0.17272712477690316, + "grad_norm": 8.844822883605957, + "learning_rate": 6.087182250961492e-06, + "loss": 0.5572, + "mean_token_accuracy": 0.8229103952646255, + "num_tokens": 66996286.0, + "step": 55720 + }, + { + "entropy": 1.7768873780965806, + "epoch": 0.17275812390195286, + "grad_norm": 4.438620567321777, + "learning_rate": 6.086636085110128e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8516925528645516, + "num_tokens": 67010213.0, + "step": 55730 + }, + { + "entropy": 1.8323894530534743, + "epoch": 0.17278912302700256, + "grad_norm": 9.754355430603027, + "learning_rate": 6.08609006624481e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8407779783010483, + "num_tokens": 67022907.0, + "step": 55740 + }, + { + "entropy": 1.8603756889700889, + "epoch": 0.17282012215205225, + "grad_norm": 10.162839889526367, + "learning_rate": 6.085544194299622e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8378032267093658, + "num_tokens": 67035045.0, + "step": 55750 + }, + { + "entropy": 1.7585888996720314, + "epoch": 0.17285112127710195, + "grad_norm": 9.135080337524414, + "learning_rate": 6.084998469208687e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8512668639421463, + "num_tokens": 67048645.0, + "step": 55760 + }, + { + "entropy": 1.8613043814897536, + "epoch": 0.17288212040215165, + "grad_norm": 9.348379135131836, + "learning_rate": 6.084452890906173e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8385450929403305, + "num_tokens": 67060973.0, + "step": 55770 + }, + { + "entropy": 1.9134966135025024, + "epoch": 0.17291311952720134, + "grad_norm": 9.092744827270508, + "learning_rate": 6.083907459326285e-06, + "loss": 0.5414, + "mean_token_accuracy": 0.838169914484024, + "num_tokens": 67072327.0, + "step": 55780 + }, + { + "entropy": 1.8521252527832985, + "epoch": 0.17294411865225104, + "grad_norm": 8.718194007873535, + "learning_rate": 6.08336217440327e-06, + "loss": 0.5484, + "mean_token_accuracy": 0.8312934651970864, + "num_tokens": 67084319.0, + "step": 55790 + }, + { + "entropy": 1.8963706970214844, + "epoch": 0.17297511777730074, + "grad_norm": 8.831055641174316, + "learning_rate": 6.08281703607142e-06, + "loss": 0.5294, + "mean_token_accuracy": 0.8281921342015266, + "num_tokens": 67096177.0, + "step": 55800 + }, + { + "entropy": 1.8676214978098868, + "epoch": 0.17300611690235043, + "grad_norm": 9.61502742767334, + "learning_rate": 6.082272044265064e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8491611614823341, + "num_tokens": 67108106.0, + "step": 55810 + }, + { + "entropy": 1.8385474801063537, + "epoch": 0.17303711602740013, + "grad_norm": 3.662489414215088, + "learning_rate": 6.0817271989185745e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8533884555101394, + "num_tokens": 67120164.0, + "step": 55820 + }, + { + "entropy": 1.7729324698448181, + "epoch": 0.17306811515244983, + "grad_norm": 4.637348175048828, + "learning_rate": 6.081182499966365e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8501248374581337, + "num_tokens": 67134497.0, + "step": 55830 + }, + { + "entropy": 1.9585619807243346, + "epoch": 0.17309911427749952, + "grad_norm": 7.857696056365967, + "learning_rate": 6.080637947342887e-06, + "loss": 0.5646, + "mean_token_accuracy": 0.8382192403078079, + "num_tokens": 67145397.0, + "step": 55840 + }, + { + "entropy": 1.809555734694004, + "epoch": 0.17313011340254922, + "grad_norm": 5.004009246826172, + "learning_rate": 6.080093540982638e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8505311787128449, + "num_tokens": 67157963.0, + "step": 55850 + }, + { + "entropy": 1.9263376638293266, + "epoch": 0.17316111252759891, + "grad_norm": 7.850517272949219, + "learning_rate": 6.079549280820153e-06, + "loss": 0.5709, + "mean_token_accuracy": 0.8317690268158913, + "num_tokens": 67169537.0, + "step": 55860 + }, + { + "entropy": 1.9509922355413436, + "epoch": 0.1731921116526486, + "grad_norm": 9.430130958557129, + "learning_rate": 6.079005166790011e-06, + "loss": 0.56, + "mean_token_accuracy": 0.8258754447102546, + "num_tokens": 67180681.0, + "step": 55870 + }, + { + "entropy": 1.9066618397831916, + "epoch": 0.1732231107776983, + "grad_norm": 8.74619197845459, + "learning_rate": 6.078461198826828e-06, + "loss": 0.5422, + "mean_token_accuracy": 0.8300882831215859, + "num_tokens": 67191836.0, + "step": 55880 + }, + { + "entropy": 1.7900028765201568, + "epoch": 0.173254109902748, + "grad_norm": 6.856855869293213, + "learning_rate": 6.077917376865262e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8394763827323913, + "num_tokens": 67206034.0, + "step": 55890 + }, + { + "entropy": 1.9227029278874397, + "epoch": 0.1732851090277977, + "grad_norm": 8.958833694458008, + "learning_rate": 6.077373700840018e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8387436643242836, + "num_tokens": 67217234.0, + "step": 55900 + }, + { + "entropy": 1.9202365294098853, + "epoch": 0.1733161081528474, + "grad_norm": 4.27255392074585, + "learning_rate": 6.076830170685832e-06, + "loss": 0.5532, + "mean_token_accuracy": 0.827874468266964, + "num_tokens": 67228530.0, + "step": 55910 + }, + { + "entropy": 1.908749097585678, + "epoch": 0.1733471072778971, + "grad_norm": 8.770615577697754, + "learning_rate": 6.07628678633749e-06, + "loss": 0.5257, + "mean_token_accuracy": 0.8317655339837075, + "num_tokens": 67240085.0, + "step": 55920 + }, + { + "entropy": 1.8192409992218017, + "epoch": 0.17337810640294676, + "grad_norm": 8.269194602966309, + "learning_rate": 6.0757435477298085e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8376537606120109, + "num_tokens": 67252721.0, + "step": 55930 + }, + { + "entropy": 1.835218572616577, + "epoch": 0.17340910552799646, + "grad_norm": 8.810011863708496, + "learning_rate": 6.075200454797657e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8403037443757058, + "num_tokens": 67265339.0, + "step": 55940 + }, + { + "entropy": 1.9089889451861382, + "epoch": 0.17344010465304616, + "grad_norm": 5.602406978607178, + "learning_rate": 6.0746575074759365e-06, + "loss": 0.5341, + "mean_token_accuracy": 0.8314460068941116, + "num_tokens": 67278228.0, + "step": 55950 + }, + { + "entropy": 1.8503844618797303, + "epoch": 0.17347110377809585, + "grad_norm": 4.604806423187256, + "learning_rate": 6.074114705699592e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8360475450754166, + "num_tokens": 67290208.0, + "step": 55960 + }, + { + "entropy": 1.8351913020014763, + "epoch": 0.17350210290314555, + "grad_norm": 10.080077171325684, + "learning_rate": 6.073572049403609e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.8317245200276375, + "num_tokens": 67303011.0, + "step": 55970 + }, + { + "entropy": 1.7987764164805413, + "epoch": 0.17353310202819525, + "grad_norm": 8.698798179626465, + "learning_rate": 6.073029538523015e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8366831094026566, + "num_tokens": 67316802.0, + "step": 55980 + }, + { + "entropy": 1.8339210391044616, + "epoch": 0.17356410115324494, + "grad_norm": 3.881253957748413, + "learning_rate": 6.072487172992875e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8414662271738053, + "num_tokens": 67329858.0, + "step": 55990 + }, + { + "entropy": 1.8059268981218337, + "epoch": 0.17359510027829464, + "grad_norm": 4.120025157928467, + "learning_rate": 6.0719449527482976e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8513453081250191, + "num_tokens": 67342567.0, + "step": 56000 + }, + { + "entropy": 1.9046938508749007, + "epoch": 0.17362609940334434, + "grad_norm": 4.1900787353515625, + "learning_rate": 6.07140287772443e-06, + "loss": 0.5465, + "mean_token_accuracy": 0.8171088561415673, + "num_tokens": 67354240.0, + "step": 56010 + }, + { + "entropy": 1.8154946908354759, + "epoch": 0.17365709852839403, + "grad_norm": 7.720127582550049, + "learning_rate": 6.070860947856461e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.841141340136528, + "num_tokens": 67366512.0, + "step": 56020 + }, + { + "entropy": 1.7826613813638688, + "epoch": 0.17368809765344373, + "grad_norm": 8.429262161254883, + "learning_rate": 6.07031916307962e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8355521574616432, + "num_tokens": 67379705.0, + "step": 56030 + }, + { + "entropy": 1.8282306790351868, + "epoch": 0.17371909677849343, + "grad_norm": 8.058755874633789, + "learning_rate": 6.0697775233291746e-06, + "loss": 0.5237, + "mean_token_accuracy": 0.8397795766592026, + "num_tokens": 67392350.0, + "step": 56040 + }, + { + "entropy": 1.7306653708219528, + "epoch": 0.17375009590354312, + "grad_norm": 8.275370597839355, + "learning_rate": 6.069236028540436e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8475747913122177, + "num_tokens": 67406980.0, + "step": 56050 + }, + { + "entropy": 1.8713230773806573, + "epoch": 0.17378109502859282, + "grad_norm": 8.6502103805542, + "learning_rate": 6.068694678648755e-06, + "loss": 0.5318, + "mean_token_accuracy": 0.8327876642346382, + "num_tokens": 67419032.0, + "step": 56060 + }, + { + "entropy": 1.9657189399003983, + "epoch": 0.17381209415364252, + "grad_norm": 8.886541366577148, + "learning_rate": 6.068153473589519e-06, + "loss": 0.5763, + "mean_token_accuracy": 0.8254697665572166, + "num_tokens": 67429412.0, + "step": 56070 + }, + { + "entropy": 1.9012456104159354, + "epoch": 0.1738430932786922, + "grad_norm": 7.720726013183594, + "learning_rate": 6.0676124132981626e-06, + "loss": 0.5181, + "mean_token_accuracy": 0.8330578565597534, + "num_tokens": 67441415.0, + "step": 56080 + }, + { + "entropy": 1.8742031693458556, + "epoch": 0.1738740924037419, + "grad_norm": 3.867934465408325, + "learning_rate": 6.067071497710155e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.8421854302287102, + "num_tokens": 67453389.0, + "step": 56090 + }, + { + "entropy": 1.9605601608753205, + "epoch": 0.1739050915287916, + "grad_norm": 8.339366912841797, + "learning_rate": 6.066530726761009e-06, + "loss": 0.5707, + "mean_token_accuracy": 0.8267245456576348, + "num_tokens": 67464579.0, + "step": 56100 + }, + { + "entropy": 1.9092912346124649, + "epoch": 0.1739360906538413, + "grad_norm": 8.375153541564941, + "learning_rate": 6.065990100386274e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.8393413156270981, + "num_tokens": 67476222.0, + "step": 56110 + }, + { + "entropy": 1.805423805117607, + "epoch": 0.173967089778891, + "grad_norm": 7.936673641204834, + "learning_rate": 6.065449618521544e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8524986699223518, + "num_tokens": 67488739.0, + "step": 56120 + }, + { + "entropy": 1.8608886793255806, + "epoch": 0.1739980889039407, + "grad_norm": 9.917237281799316, + "learning_rate": 6.0649092811024514e-06, + "loss": 0.51, + "mean_token_accuracy": 0.8281596288084984, + "num_tokens": 67500883.0, + "step": 56130 + }, + { + "entropy": 1.9484786361455917, + "epoch": 0.1740290880289904, + "grad_norm": 11.2413969039917, + "learning_rate": 6.064369088064665e-06, + "loss": 0.5384, + "mean_token_accuracy": 0.8397915035486221, + "num_tokens": 67512036.0, + "step": 56140 + }, + { + "entropy": 1.896202652156353, + "epoch": 0.1740600871540401, + "grad_norm": 10.009648323059082, + "learning_rate": 6.063829039343899e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8400693356990814, + "num_tokens": 67523562.0, + "step": 56150 + }, + { + "entropy": 1.8820205710828304, + "epoch": 0.17409108627908979, + "grad_norm": 4.450737953186035, + "learning_rate": 6.063289134875907e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8278883457183838, + "num_tokens": 67536330.0, + "step": 56160 + }, + { + "entropy": 1.8365131139755249, + "epoch": 0.17412208540413948, + "grad_norm": 4.729415416717529, + "learning_rate": 6.062749374596479e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8442412883043289, + "num_tokens": 67549775.0, + "step": 56170 + }, + { + "entropy": 1.9157144859433175, + "epoch": 0.17415308452918915, + "grad_norm": 8.306681632995605, + "learning_rate": 6.062209758441451e-06, + "loss": 0.5175, + "mean_token_accuracy": 0.8310726657509804, + "num_tokens": 67561670.0, + "step": 56180 + }, + { + "entropy": 1.8954879730939864, + "epoch": 0.17418408365423885, + "grad_norm": 9.215730667114258, + "learning_rate": 6.0616702863466905e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.8425735384225845, + "num_tokens": 67573051.0, + "step": 56190 + }, + { + "entropy": 1.9187992975115775, + "epoch": 0.17421508277928854, + "grad_norm": 7.861081600189209, + "learning_rate": 6.061130958248112e-06, + "loss": 0.601, + "mean_token_accuracy": 0.8312497869133949, + "num_tokens": 67585205.0, + "step": 56200 + }, + { + "entropy": 1.954244077205658, + "epoch": 0.17424608190433824, + "grad_norm": 9.027844429016113, + "learning_rate": 6.060591774081669e-06, + "loss": 0.5498, + "mean_token_accuracy": 0.8323284685611725, + "num_tokens": 67596130.0, + "step": 56210 + }, + { + "entropy": 1.917762076854706, + "epoch": 0.17427708102938794, + "grad_norm": 8.658714294433594, + "learning_rate": 6.060052733783352e-06, + "loss": 0.5318, + "mean_token_accuracy": 0.8378539264202118, + "num_tokens": 67607764.0, + "step": 56220 + }, + { + "entropy": 1.9130154103040695, + "epoch": 0.17430808015443763, + "grad_norm": 10.506196022033691, + "learning_rate": 6.0595138372891934e-06, + "loss": 0.5378, + "mean_token_accuracy": 0.8319619536399842, + "num_tokens": 67619152.0, + "step": 56230 + }, + { + "entropy": 1.8991611540317535, + "epoch": 0.17433907927948733, + "grad_norm": 7.824307918548584, + "learning_rate": 6.0589750845352644e-06, + "loss": 0.5565, + "mean_token_accuracy": 0.8250974491238594, + "num_tokens": 67630997.0, + "step": 56240 + }, + { + "entropy": 1.8883090645074845, + "epoch": 0.17437007840453703, + "grad_norm": 8.599298477172852, + "learning_rate": 6.058436475457677e-06, + "loss": 0.5425, + "mean_token_accuracy": 0.831093080341816, + "num_tokens": 67642786.0, + "step": 56250 + }, + { + "entropy": 1.8896732196211814, + "epoch": 0.17440107752958672, + "grad_norm": 4.400026798248291, + "learning_rate": 6.057898009992582e-06, + "loss": 0.53, + "mean_token_accuracy": 0.8287911862134933, + "num_tokens": 67654432.0, + "step": 56260 + }, + { + "entropy": 1.941761639714241, + "epoch": 0.17443207665463642, + "grad_norm": 7.403299808502197, + "learning_rate": 6.057359688076171e-06, + "loss": 0.5551, + "mean_token_accuracy": 0.8296359524130821, + "num_tokens": 67665128.0, + "step": 56270 + }, + { + "entropy": 1.8876193895936013, + "epoch": 0.17446307577968612, + "grad_norm": 8.507538795471191, + "learning_rate": 6.0568215096446736e-06, + "loss": 0.5421, + "mean_token_accuracy": 0.8257350817322731, + "num_tokens": 67676670.0, + "step": 56280 + }, + { + "entropy": 1.9427810519933701, + "epoch": 0.1744940749047358, + "grad_norm": 8.400871276855469, + "learning_rate": 6.0562834746343615e-06, + "loss": 0.5415, + "mean_token_accuracy": 0.8306556642055511, + "num_tokens": 67688070.0, + "step": 56290 + }, + { + "entropy": 1.8497335493564606, + "epoch": 0.1745250740297855, + "grad_norm": 8.613706588745117, + "learning_rate": 6.0557455829815425e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8401633113622665, + "num_tokens": 67699880.0, + "step": 56300 + }, + { + "entropy": 1.8545069240033627, + "epoch": 0.1745560731548352, + "grad_norm": 8.968703269958496, + "learning_rate": 6.055207834622569e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.83926263153553, + "num_tokens": 67712700.0, + "step": 56310 + }, + { + "entropy": 1.878834429383278, + "epoch": 0.1745870722798849, + "grad_norm": 3.72434663772583, + "learning_rate": 6.054670229493826e-06, + "loss": 0.5328, + "mean_token_accuracy": 0.837174066901207, + "num_tokens": 67724809.0, + "step": 56320 + }, + { + "entropy": 1.891117848455906, + "epoch": 0.1746180714049346, + "grad_norm": 7.3367180824279785, + "learning_rate": 6.054132767531746e-06, + "loss": 0.5233, + "mean_token_accuracy": 0.8528507500886917, + "num_tokens": 67735990.0, + "step": 56330 + }, + { + "entropy": 1.818806654214859, + "epoch": 0.1746490705299843, + "grad_norm": 4.021301746368408, + "learning_rate": 6.053595448672795e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.848919802904129, + "num_tokens": 67748381.0, + "step": 56340 + }, + { + "entropy": 1.9311149969697, + "epoch": 0.174680069655034, + "grad_norm": 9.249166488647461, + "learning_rate": 6.053058272853482e-06, + "loss": 0.5285, + "mean_token_accuracy": 0.8428058341145516, + "num_tokens": 67759883.0, + "step": 56350 + }, + { + "entropy": 1.8628065332770347, + "epoch": 0.1747110687800837, + "grad_norm": 5.258411884307861, + "learning_rate": 6.0525212400103525e-06, + "loss": 0.6164, + "mean_token_accuracy": 0.8357118725776672, + "num_tokens": 67772424.0, + "step": 56360 + }, + { + "entropy": 1.892784410715103, + "epoch": 0.17474206790513339, + "grad_norm": 3.5523910522460938, + "learning_rate": 6.051984350079994e-06, + "loss": 0.5414, + "mean_token_accuracy": 0.83593248128891, + "num_tokens": 67783804.0, + "step": 56370 + }, + { + "entropy": 1.8566182047128676, + "epoch": 0.17477306703018308, + "grad_norm": 10.872590065002441, + "learning_rate": 6.051447602999031e-06, + "loss": 0.5343, + "mean_token_accuracy": 0.8340441033244133, + "num_tokens": 67795593.0, + "step": 56380 + }, + { + "entropy": 1.8216275811195373, + "epoch": 0.17480406615523278, + "grad_norm": 8.305917739868164, + "learning_rate": 6.050910998704129e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8441584646701813, + "num_tokens": 67807848.0, + "step": 56390 + }, + { + "entropy": 1.8245621785521506, + "epoch": 0.17483506528028248, + "grad_norm": 4.366250514984131, + "learning_rate": 6.050374537131993e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8390408426523208, + "num_tokens": 67820873.0, + "step": 56400 + }, + { + "entropy": 1.8166996166110039, + "epoch": 0.17486606440533217, + "grad_norm": 8.542943954467773, + "learning_rate": 6.049838218219366e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8468366071581841, + "num_tokens": 67833571.0, + "step": 56410 + }, + { + "entropy": 1.858149343729019, + "epoch": 0.17489706353038187, + "grad_norm": 8.90046501159668, + "learning_rate": 6.049302041903031e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8406156003475189, + "num_tokens": 67845941.0, + "step": 56420 + }, + { + "entropy": 1.965251961350441, + "epoch": 0.17492806265543154, + "grad_norm": 10.929553985595703, + "learning_rate": 6.048766008119811e-06, + "loss": 0.5884, + "mean_token_accuracy": 0.8183304697275162, + "num_tokens": 67856898.0, + "step": 56430 + }, + { + "entropy": 1.9665222853422164, + "epoch": 0.17495906178048123, + "grad_norm": 9.303450584411621, + "learning_rate": 6.048230116806566e-06, + "loss": 0.5542, + "mean_token_accuracy": 0.8342076927423477, + "num_tokens": 67868059.0, + "step": 56440 + }, + { + "entropy": 1.916182427108288, + "epoch": 0.17499006090553093, + "grad_norm": 7.816822052001953, + "learning_rate": 6.047694367900196e-06, + "loss": 0.5245, + "mean_token_accuracy": 0.8353784546256066, + "num_tokens": 67880386.0, + "step": 56450 + }, + { + "entropy": 1.9688440665602684, + "epoch": 0.17502106003058063, + "grad_norm": 8.755724906921387, + "learning_rate": 6.047158761337643e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.8388349682092666, + "num_tokens": 67892083.0, + "step": 56460 + }, + { + "entropy": 1.8669242531061172, + "epoch": 0.17505205915563032, + "grad_norm": 7.993595600128174, + "learning_rate": 6.046623297055885e-06, + "loss": 0.5223, + "mean_token_accuracy": 0.8375104799866676, + "num_tokens": 67904144.0, + "step": 56470 + }, + { + "entropy": 1.816992525756359, + "epoch": 0.17508305828068002, + "grad_norm": 4.964598655700684, + "learning_rate": 6.046087974991937e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8406194254755974, + "num_tokens": 67917537.0, + "step": 56480 + }, + { + "entropy": 1.8135570034384727, + "epoch": 0.17511405740572972, + "grad_norm": 8.609065055847168, + "learning_rate": 6.045552795082859e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8388777539134026, + "num_tokens": 67931322.0, + "step": 56490 + }, + { + "entropy": 1.9724404200911523, + "epoch": 0.1751450565307794, + "grad_norm": 9.910813331604004, + "learning_rate": 6.0450177572657435e-06, + "loss": 0.5724, + "mean_token_accuracy": 0.829445195198059, + "num_tokens": 67942971.0, + "step": 56500 + }, + { + "entropy": 1.8769863620400429, + "epoch": 0.1751760556558291, + "grad_norm": 10.879551887512207, + "learning_rate": 6.044482861477728e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.84668777436018, + "num_tokens": 67954835.0, + "step": 56510 + }, + { + "entropy": 1.895396353304386, + "epoch": 0.1752070547808788, + "grad_norm": 7.5075178146362305, + "learning_rate": 6.043948107655985e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.859335508942604, + "num_tokens": 67966585.0, + "step": 56520 + }, + { + "entropy": 1.845558349788189, + "epoch": 0.1752380539059285, + "grad_norm": 5.098723411560059, + "learning_rate": 6.0434134957377275e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8512143403291702, + "num_tokens": 67978885.0, + "step": 56530 + }, + { + "entropy": 1.8857697710394858, + "epoch": 0.1752690530309782, + "grad_norm": 9.507451057434082, + "learning_rate": 6.042879025660207e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.8429853811860084, + "num_tokens": 67990999.0, + "step": 56540 + }, + { + "entropy": 1.8623712986707688, + "epoch": 0.1753000521560279, + "grad_norm": 9.963186264038086, + "learning_rate": 6.042344697360713e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8374731317162514, + "num_tokens": 68003103.0, + "step": 56550 + }, + { + "entropy": 1.8676358297467233, + "epoch": 0.1753310512810776, + "grad_norm": 7.353335857391357, + "learning_rate": 6.041810510776573e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8497109815478325, + "num_tokens": 68015029.0, + "step": 56560 + }, + { + "entropy": 1.9030646055936813, + "epoch": 0.1753620504061273, + "grad_norm": 4.22896671295166, + "learning_rate": 6.041276465845158e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.8298840820789337, + "num_tokens": 68026783.0, + "step": 56570 + }, + { + "entropy": 1.9653840601444243, + "epoch": 0.175393049531177, + "grad_norm": 7.451119899749756, + "learning_rate": 6.040742562503874e-06, + "loss": 0.6031, + "mean_token_accuracy": 0.8305002480745316, + "num_tokens": 68037670.0, + "step": 56580 + }, + { + "entropy": 1.9089856892824173, + "epoch": 0.17542404865622668, + "grad_norm": 9.136287689208984, + "learning_rate": 6.040208800690164e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8452465951442718, + "num_tokens": 68049712.0, + "step": 56590 + }, + { + "entropy": 1.9428860023617744, + "epoch": 0.17545504778127638, + "grad_norm": 4.60407829284668, + "learning_rate": 6.039675180341514e-06, + "loss": 0.5676, + "mean_token_accuracy": 0.8250961035490036, + "num_tokens": 68061772.0, + "step": 56600 + }, + { + "entropy": 1.8673185929656029, + "epoch": 0.17548604690632608, + "grad_norm": 8.91592788696289, + "learning_rate": 6.039141701395445e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.831261222064495, + "num_tokens": 68074176.0, + "step": 56610 + }, + { + "entropy": 1.8924739733338356, + "epoch": 0.17551704603137577, + "grad_norm": 4.301705360412598, + "learning_rate": 6.0386083637895194e-06, + "loss": 0.589, + "mean_token_accuracy": 0.8193011194467544, + "num_tokens": 68087007.0, + "step": 56620 + }, + { + "entropy": 1.9513219073414803, + "epoch": 0.17554804515642547, + "grad_norm": 8.441484451293945, + "learning_rate": 6.038075167461339e-06, + "loss": 0.5498, + "mean_token_accuracy": 0.8327566117048264, + "num_tokens": 68098862.0, + "step": 56630 + }, + { + "entropy": 1.927374078333378, + "epoch": 0.17557904428147517, + "grad_norm": 9.029617309570312, + "learning_rate": 6.037542112348537e-06, + "loss": 0.5306, + "mean_token_accuracy": 0.831793662905693, + "num_tokens": 68111512.0, + "step": 56640 + }, + { + "entropy": 1.9223608300089836, + "epoch": 0.17561004340652486, + "grad_norm": 4.942859172821045, + "learning_rate": 6.0370091983887946e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8408562824130058, + "num_tokens": 68123365.0, + "step": 56650 + }, + { + "entropy": 1.9207605987787246, + "epoch": 0.17564104253157456, + "grad_norm": 8.518644332885742, + "learning_rate": 6.036476425519826e-06, + "loss": 0.5439, + "mean_token_accuracy": 0.8278803005814552, + "num_tokens": 68135831.0, + "step": 56660 + }, + { + "entropy": 1.8496787279844285, + "epoch": 0.17567204165662423, + "grad_norm": 8.98222541809082, + "learning_rate": 6.0359437936793865e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8472736284136773, + "num_tokens": 68147503.0, + "step": 56670 + }, + { + "entropy": 1.9197917222976684, + "epoch": 0.17570304078167392, + "grad_norm": 8.030874252319336, + "learning_rate": 6.0354113028052645e-06, + "loss": 0.5854, + "mean_token_accuracy": 0.8222957968711853, + "num_tokens": 68159296.0, + "step": 56680 + }, + { + "entropy": 1.7842533797025681, + "epoch": 0.17573403990672362, + "grad_norm": 4.326035976409912, + "learning_rate": 6.034878952835293e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8420939773321152, + "num_tokens": 68173065.0, + "step": 56690 + }, + { + "entropy": 1.832868304848671, + "epoch": 0.17576503903177332, + "grad_norm": 7.654922962188721, + "learning_rate": 6.0343467437073435e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8458761259913444, + "num_tokens": 68185736.0, + "step": 56700 + }, + { + "entropy": 1.8796532317996024, + "epoch": 0.17579603815682301, + "grad_norm": 8.131157875061035, + "learning_rate": 6.03381467535932e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.8416437700390815, + "num_tokens": 68197051.0, + "step": 56710 + }, + { + "entropy": 1.8534429788589477, + "epoch": 0.1758270372818727, + "grad_norm": 10.127676963806152, + "learning_rate": 6.03328274772917e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8437806442379951, + "num_tokens": 68209732.0, + "step": 56720 + }, + { + "entropy": 1.9118575572967529, + "epoch": 0.1758580364069224, + "grad_norm": 8.467833518981934, + "learning_rate": 6.0327509607548775e-06, + "loss": 0.5385, + "mean_token_accuracy": 0.8397658735513687, + "num_tokens": 68220339.0, + "step": 56730 + }, + { + "entropy": 1.844566436111927, + "epoch": 0.1758890355319721, + "grad_norm": 9.010001182556152, + "learning_rate": 6.032219314374463e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8425126999616623, + "num_tokens": 68232942.0, + "step": 56740 + }, + { + "entropy": 1.874419206380844, + "epoch": 0.1759200346570218, + "grad_norm": 3.708150625228882, + "learning_rate": 6.03168780852599e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.8311620220541954, + "num_tokens": 68245087.0, + "step": 56750 + }, + { + "entropy": 1.936698080599308, + "epoch": 0.1759510337820715, + "grad_norm": 8.663212776184082, + "learning_rate": 6.0311564431475544e-06, + "loss": 0.5413, + "mean_token_accuracy": 0.8401820287108421, + "num_tokens": 68256104.0, + "step": 56760 + }, + { + "entropy": 1.7919904850423336, + "epoch": 0.1759820329071212, + "grad_norm": 8.531180381774902, + "learning_rate": 6.030625218177295e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8482810765504837, + "num_tokens": 68269355.0, + "step": 56770 + }, + { + "entropy": 1.8165376424789428, + "epoch": 0.1760130320321709, + "grad_norm": 8.21648120880127, + "learning_rate": 6.030094133553386e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8565915554761887, + "num_tokens": 68282068.0, + "step": 56780 + }, + { + "entropy": 1.9260249942541123, + "epoch": 0.1760440311572206, + "grad_norm": 10.875053405761719, + "learning_rate": 6.029563189214042e-06, + "loss": 0.5705, + "mean_token_accuracy": 0.8371611222624779, + "num_tokens": 68293043.0, + "step": 56790 + }, + { + "entropy": 1.894984395802021, + "epoch": 0.17607503028227028, + "grad_norm": 4.284290790557861, + "learning_rate": 6.0290323850975115e-06, + "loss": 0.5323, + "mean_token_accuracy": 0.8329278856515885, + "num_tokens": 68304689.0, + "step": 56800 + }, + { + "entropy": 1.826059702038765, + "epoch": 0.17610602940731998, + "grad_norm": 10.755820274353027, + "learning_rate": 6.028501721142086e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8308580055832863, + "num_tokens": 68317294.0, + "step": 56810 + }, + { + "entropy": 1.8452559620141984, + "epoch": 0.17613702853236968, + "grad_norm": 7.7797040939331055, + "learning_rate": 6.027971197286092e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8381189867854119, + "num_tokens": 68329628.0, + "step": 56820 + }, + { + "entropy": 1.96290685236454, + "epoch": 0.17616802765741937, + "grad_norm": 9.668147087097168, + "learning_rate": 6.027440813467895e-06, + "loss": 0.5797, + "mean_token_accuracy": 0.822065281867981, + "num_tokens": 68341030.0, + "step": 56830 + }, + { + "entropy": 1.869129341840744, + "epoch": 0.17619902678246907, + "grad_norm": 8.411051750183105, + "learning_rate": 6.026910569625899e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8431349366903305, + "num_tokens": 68352755.0, + "step": 56840 + }, + { + "entropy": 1.8781275868415832, + "epoch": 0.17623002590751877, + "grad_norm": 8.966902732849121, + "learning_rate": 6.026380465698544e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8468410953879356, + "num_tokens": 68364276.0, + "step": 56850 + }, + { + "entropy": 1.7951830908656121, + "epoch": 0.17626102503256846, + "grad_norm": 7.949598789215088, + "learning_rate": 6.025850501624308e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8530111536383629, + "num_tokens": 68377483.0, + "step": 56860 + }, + { + "entropy": 1.8381464675068855, + "epoch": 0.17629202415761816, + "grad_norm": 8.990347862243652, + "learning_rate": 6.025320677341711e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8483751729130745, + "num_tokens": 68390308.0, + "step": 56870 + }, + { + "entropy": 1.8240345925092698, + "epoch": 0.17632302328266786, + "grad_norm": 7.0853071212768555, + "learning_rate": 6.024790992789304e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8524978414177895, + "num_tokens": 68402956.0, + "step": 56880 + }, + { + "entropy": 1.8852536737918855, + "epoch": 0.17635402240771755, + "grad_norm": 7.855372905731201, + "learning_rate": 6.024261447905683e-06, + "loss": 0.529, + "mean_token_accuracy": 0.830386458337307, + "num_tokens": 68415648.0, + "step": 56890 + }, + { + "entropy": 1.903229682147503, + "epoch": 0.17638502153276725, + "grad_norm": 9.828873634338379, + "learning_rate": 6.0237320426294755e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8335711434483528, + "num_tokens": 68427530.0, + "step": 56900 + }, + { + "entropy": 1.9013172283768653, + "epoch": 0.17641602065781695, + "grad_norm": 8.85859203338623, + "learning_rate": 6.023202776899353e-06, + "loss": 0.5314, + "mean_token_accuracy": 0.8388579234480857, + "num_tokens": 68439199.0, + "step": 56910 + }, + { + "entropy": 1.7970546633005142, + "epoch": 0.17644701978286662, + "grad_norm": 7.830367088317871, + "learning_rate": 6.0226736506540186e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8480477169156074, + "num_tokens": 68452889.0, + "step": 56920 + }, + { + "entropy": 1.9641900137066841, + "epoch": 0.1764780189079163, + "grad_norm": 9.607945442199707, + "learning_rate": 6.022144663832216e-06, + "loss": 0.5687, + "mean_token_accuracy": 0.8301128730177879, + "num_tokens": 68464241.0, + "step": 56930 + }, + { + "entropy": 1.8105278745293618, + "epoch": 0.176509018032966, + "grad_norm": 8.978128433227539, + "learning_rate": 6.0216158163727265e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8478241801261902, + "num_tokens": 68477760.0, + "step": 56940 + }, + { + "entropy": 1.8664100661873817, + "epoch": 0.1765400171580157, + "grad_norm": 9.166288375854492, + "learning_rate": 6.021087108214369e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8363597363233566, + "num_tokens": 68489199.0, + "step": 56950 + }, + { + "entropy": 1.86256393045187, + "epoch": 0.1765710162830654, + "grad_norm": 11.094559669494629, + "learning_rate": 6.020558539296e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8493609294295311, + "num_tokens": 68500857.0, + "step": 56960 + }, + { + "entropy": 1.9616552650928498, + "epoch": 0.1766020154081151, + "grad_norm": 9.776663780212402, + "learning_rate": 6.020030109556513e-06, + "loss": 0.5729, + "mean_token_accuracy": 0.8265084490180016, + "num_tokens": 68512216.0, + "step": 56970 + }, + { + "entropy": 1.9011280700564384, + "epoch": 0.1766330145331648, + "grad_norm": 10.027840614318848, + "learning_rate": 6.019501818934841e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.83545922935009, + "num_tokens": 68524348.0, + "step": 56980 + }, + { + "entropy": 1.898293286561966, + "epoch": 0.1766640136582145, + "grad_norm": 8.679490089416504, + "learning_rate": 6.018973667369951e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.837830375134945, + "num_tokens": 68536357.0, + "step": 56990 + }, + { + "entropy": 1.8468020305037498, + "epoch": 0.1766950127832642, + "grad_norm": 10.170060157775879, + "learning_rate": 6.01844565480085e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8466181620955467, + "num_tokens": 68548639.0, + "step": 57000 + }, + { + "entropy": 1.8502362087368964, + "epoch": 0.17672601190831388, + "grad_norm": 9.918789863586426, + "learning_rate": 6.017917781166582e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8434646666049957, + "num_tokens": 68560221.0, + "step": 57010 + }, + { + "entropy": 1.847883252799511, + "epoch": 0.17675701103336358, + "grad_norm": 3.5449788570404053, + "learning_rate": 6.017390046406228e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.8291626140475273, + "num_tokens": 68573365.0, + "step": 57020 + }, + { + "entropy": 1.8949782311916352, + "epoch": 0.17678801015841328, + "grad_norm": 8.271769523620605, + "learning_rate": 6.016862450458908e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.8308848381042481, + "num_tokens": 68585450.0, + "step": 57030 + }, + { + "entropy": 1.8946194440126418, + "epoch": 0.17681900928346297, + "grad_norm": 4.019629955291748, + "learning_rate": 6.016334993263777e-06, + "loss": 0.5717, + "mean_token_accuracy": 0.832112868130207, + "num_tokens": 68597707.0, + "step": 57040 + }, + { + "entropy": 1.8897631257772445, + "epoch": 0.17685000840851267, + "grad_norm": 7.272465705871582, + "learning_rate": 6.015807674760029e-06, + "loss": 0.5713, + "mean_token_accuracy": 0.8315581545233727, + "num_tokens": 68610457.0, + "step": 57050 + }, + { + "entropy": 1.925081330537796, + "epoch": 0.17688100753356237, + "grad_norm": 3.892751932144165, + "learning_rate": 6.015280494886894e-06, + "loss": 0.5467, + "mean_token_accuracy": 0.8374591827392578, + "num_tokens": 68621838.0, + "step": 57060 + }, + { + "entropy": 1.9003553301095963, + "epoch": 0.17691200665861206, + "grad_norm": 8.307111740112305, + "learning_rate": 6.01475345358364e-06, + "loss": 0.549, + "mean_token_accuracy": 0.8233644008636475, + "num_tokens": 68633799.0, + "step": 57070 + }, + { + "entropy": 1.8528232917189598, + "epoch": 0.17694300578366176, + "grad_norm": 5.3729071617126465, + "learning_rate": 6.014226550789571e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8408889621496201, + "num_tokens": 68645858.0, + "step": 57080 + }, + { + "entropy": 1.8784978061914444, + "epoch": 0.17697400490871146, + "grad_norm": 9.919316291809082, + "learning_rate": 6.013699786444032e-06, + "loss": 0.5215, + "mean_token_accuracy": 0.8381554082036018, + "num_tokens": 68657900.0, + "step": 57090 + }, + { + "entropy": 1.870173905789852, + "epoch": 0.17700500403376115, + "grad_norm": 8.955236434936523, + "learning_rate": 6.013173160486402e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8371668472886086, + "num_tokens": 68670085.0, + "step": 57100 + }, + { + "entropy": 1.836390271782875, + "epoch": 0.17703600315881085, + "grad_norm": 8.182779312133789, + "learning_rate": 6.012646672856096e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8486859187483787, + "num_tokens": 68682961.0, + "step": 57110 + }, + { + "entropy": 1.8538921266794204, + "epoch": 0.17706700228386055, + "grad_norm": 9.032723426818848, + "learning_rate": 6.012120323492569e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8346615865826607, + "num_tokens": 68695560.0, + "step": 57120 + }, + { + "entropy": 1.7396719381213188, + "epoch": 0.17709800140891024, + "grad_norm": 9.137755393981934, + "learning_rate": 6.0115941123353115e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8600768327713013, + "num_tokens": 68709891.0, + "step": 57130 + }, + { + "entropy": 1.9093570321798325, + "epoch": 0.17712900053395994, + "grad_norm": 8.14976978302002, + "learning_rate": 6.011068039323853e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.845938429236412, + "num_tokens": 68721365.0, + "step": 57140 + }, + { + "entropy": 1.9436588928103447, + "epoch": 0.17715999965900964, + "grad_norm": 7.569816589355469, + "learning_rate": 6.010542104397757e-06, + "loss": 0.5499, + "mean_token_accuracy": 0.8280878692865372, + "num_tokens": 68733349.0, + "step": 57150 + }, + { + "entropy": 1.8442727386951447, + "epoch": 0.17719099878405933, + "grad_norm": 8.786954879760742, + "learning_rate": 6.0100163074966265e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8567957922816276, + "num_tokens": 68746143.0, + "step": 57160 + }, + { + "entropy": 1.9321425527334213, + "epoch": 0.177221997909109, + "grad_norm": 5.256120681762695, + "learning_rate": 6.009490648560099e-06, + "loss": 0.5664, + "mean_token_accuracy": 0.8285882025957108, + "num_tokens": 68757817.0, + "step": 57170 + }, + { + "entropy": 1.8465531781315803, + "epoch": 0.1772529970341587, + "grad_norm": 9.743329048156738, + "learning_rate": 6.008965127527853e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.842889928817749, + "num_tokens": 68771328.0, + "step": 57180 + }, + { + "entropy": 1.9369317084550857, + "epoch": 0.1772839961592084, + "grad_norm": 3.970804452896118, + "learning_rate": 6.008439744339599e-06, + "loss": 0.5458, + "mean_token_accuracy": 0.8342833817005157, + "num_tokens": 68782338.0, + "step": 57190 + }, + { + "entropy": 1.8963096350431443, + "epoch": 0.1773149952842581, + "grad_norm": 10.260050773620605, + "learning_rate": 6.007914498935089e-06, + "loss": 0.5204, + "mean_token_accuracy": 0.8387321591377258, + "num_tokens": 68793787.0, + "step": 57200 + }, + { + "entropy": 1.9125867441296578, + "epoch": 0.1773459944093078, + "grad_norm": 8.963105201721191, + "learning_rate": 6.007389391254107e-06, + "loss": 0.5465, + "mean_token_accuracy": 0.8316601321101189, + "num_tokens": 68805415.0, + "step": 57210 + }, + { + "entropy": 1.9211559012532233, + "epoch": 0.17737699353435749, + "grad_norm": 8.377967834472656, + "learning_rate": 6.006864421236479e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.84478460252285, + "num_tokens": 68816320.0, + "step": 57220 + }, + { + "entropy": 1.9259533405303955, + "epoch": 0.17740799265940718, + "grad_norm": 8.574922561645508, + "learning_rate": 6.0063395888220646e-06, + "loss": 0.6228, + "mean_token_accuracy": 0.8221300706267357, + "num_tokens": 68828212.0, + "step": 57230 + }, + { + "entropy": 1.94825499355793, + "epoch": 0.17743899178445688, + "grad_norm": 7.872236251831055, + "learning_rate": 6.00581489395076e-06, + "loss": 0.5457, + "mean_token_accuracy": 0.8307785525918007, + "num_tokens": 68839479.0, + "step": 57240 + }, + { + "entropy": 1.8726135820150376, + "epoch": 0.17746999090950658, + "grad_norm": 9.208822250366211, + "learning_rate": 6.005290336562501e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8378603532910347, + "num_tokens": 68851565.0, + "step": 57250 + }, + { + "entropy": 1.899706956744194, + "epoch": 0.17750099003455627, + "grad_norm": 8.506078720092773, + "learning_rate": 6.004765916597255e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.842448279261589, + "num_tokens": 68863462.0, + "step": 57260 + }, + { + "entropy": 1.8596750691533088, + "epoch": 0.17753198915960597, + "grad_norm": 9.010077476501465, + "learning_rate": 6.004241633995031e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8384414002299309, + "num_tokens": 68875441.0, + "step": 57270 + }, + { + "entropy": 1.9451793283224106, + "epoch": 0.17756298828465567, + "grad_norm": 14.787067413330078, + "learning_rate": 6.0037174886958745e-06, + "loss": 0.5643, + "mean_token_accuracy": 0.8292500406503678, + "num_tokens": 68886706.0, + "step": 57280 + }, + { + "entropy": 1.9556769192218781, + "epoch": 0.17759398740970536, + "grad_norm": 9.24488353729248, + "learning_rate": 6.003193480639865e-06, + "loss": 0.5579, + "mean_token_accuracy": 0.8255641996860504, + "num_tokens": 68899129.0, + "step": 57290 + }, + { + "entropy": 1.9216732487082482, + "epoch": 0.17762498653475506, + "grad_norm": 9.25411605834961, + "learning_rate": 6.0026696097671166e-06, + "loss": 0.5175, + "mean_token_accuracy": 0.8260634735226631, + "num_tokens": 68911411.0, + "step": 57300 + }, + { + "entropy": 1.8534406289458274, + "epoch": 0.17765598565980475, + "grad_norm": 9.491771697998047, + "learning_rate": 6.002145876017787e-06, + "loss": 0.539, + "mean_token_accuracy": 0.8245239853858948, + "num_tokens": 68923378.0, + "step": 57310 + }, + { + "entropy": 1.8846858441829681, + "epoch": 0.17768698478485445, + "grad_norm": 8.265665054321289, + "learning_rate": 6.001622279332065e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8445610210299492, + "num_tokens": 68934573.0, + "step": 57320 + }, + { + "entropy": 1.88294820189476, + "epoch": 0.17771798390990415, + "grad_norm": 6.753918170928955, + "learning_rate": 6.001098819650178e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.846784770488739, + "num_tokens": 68946288.0, + "step": 57330 + }, + { + "entropy": 1.8599652536213398, + "epoch": 0.17774898303495384, + "grad_norm": 3.2606992721557617, + "learning_rate": 6.000575496912389e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8316014409065247, + "num_tokens": 68958131.0, + "step": 57340 + }, + { + "entropy": 1.8438660085201264, + "epoch": 0.17777998216000354, + "grad_norm": 4.111084461212158, + "learning_rate": 6.000052311058995e-06, + "loss": 0.5221, + "mean_token_accuracy": 0.8313129052519799, + "num_tokens": 68970796.0, + "step": 57350 + }, + { + "entropy": 1.95847550034523, + "epoch": 0.17781098128505324, + "grad_norm": 8.4618501663208, + "learning_rate": 5.999529262030336e-06, + "loss": 0.5624, + "mean_token_accuracy": 0.8282546654343605, + "num_tokens": 68982324.0, + "step": 57360 + }, + { + "entropy": 1.8982812404632567, + "epoch": 0.17784198041010293, + "grad_norm": 9.305404663085938, + "learning_rate": 5.999006349766783e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8398912832140922, + "num_tokens": 68993770.0, + "step": 57370 + }, + { + "entropy": 1.9295596539974214, + "epoch": 0.17787297953515263, + "grad_norm": 8.27092456817627, + "learning_rate": 5.998483574208745e-06, + "loss": 0.521, + "mean_token_accuracy": 0.8346618101000786, + "num_tokens": 69005186.0, + "step": 57380 + }, + { + "entropy": 1.8877547219395638, + "epoch": 0.17790397866020233, + "grad_norm": 9.524073600769043, + "learning_rate": 5.997960935296666e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8391301646828652, + "num_tokens": 69016767.0, + "step": 57390 + }, + { + "entropy": 1.8745584458112716, + "epoch": 0.17793497778525202, + "grad_norm": 9.270411491394043, + "learning_rate": 5.99743843297103e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8369218707084656, + "num_tokens": 69028704.0, + "step": 57400 + }, + { + "entropy": 1.910376462340355, + "epoch": 0.1779659769103017, + "grad_norm": 9.455318450927734, + "learning_rate": 5.9969160671723535e-06, + "loss": 0.5464, + "mean_token_accuracy": 0.8310431107878685, + "num_tokens": 69039809.0, + "step": 57410 + }, + { + "entropy": 1.8097993165254593, + "epoch": 0.1779969760353514, + "grad_norm": 6.114592552185059, + "learning_rate": 5.996393837841191e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8315378293395043, + "num_tokens": 69053275.0, + "step": 57420 + }, + { + "entropy": 1.8377384558320045, + "epoch": 0.1780279751604011, + "grad_norm": 10.482206344604492, + "learning_rate": 5.995871744918132e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8455873817205429, + "num_tokens": 69065797.0, + "step": 57430 + }, + { + "entropy": 1.8282571971416473, + "epoch": 0.17805897428545078, + "grad_norm": 8.590948104858398, + "learning_rate": 5.995349788343804e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8498968616127968, + "num_tokens": 69078035.0, + "step": 57440 + }, + { + "entropy": 1.8669711872935295, + "epoch": 0.17808997341050048, + "grad_norm": 8.990492820739746, + "learning_rate": 5.994827968058869e-06, + "loss": 0.532, + "mean_token_accuracy": 0.8290546640753746, + "num_tokens": 69090734.0, + "step": 57450 + }, + { + "entropy": 1.8619839981198312, + "epoch": 0.17812097253555018, + "grad_norm": 3.9954440593719482, + "learning_rate": 5.9943062840040275e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8474828630685807, + "num_tokens": 69103255.0, + "step": 57460 + }, + { + "entropy": 1.7928624168038367, + "epoch": 0.17815197166059987, + "grad_norm": 9.835958480834961, + "learning_rate": 5.993784736120013e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8395032018423081, + "num_tokens": 69115875.0, + "step": 57470 + }, + { + "entropy": 1.8707625821232796, + "epoch": 0.17818297078564957, + "grad_norm": 8.842391014099121, + "learning_rate": 5.9932633243475954e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.842163647711277, + "num_tokens": 69127843.0, + "step": 57480 + }, + { + "entropy": 1.7798241943120956, + "epoch": 0.17821396991069927, + "grad_norm": 9.086782455444336, + "learning_rate": 5.992742048627585e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8449828371405601, + "num_tokens": 69140178.0, + "step": 57490 + }, + { + "entropy": 1.885751624405384, + "epoch": 0.17824496903574896, + "grad_norm": 7.488559722900391, + "learning_rate": 5.992220908900822e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8432123690843583, + "num_tokens": 69151411.0, + "step": 57500 + }, + { + "entropy": 1.9364232003688813, + "epoch": 0.17827596816079866, + "grad_norm": 7.98928165435791, + "learning_rate": 5.991699905108188e-06, + "loss": 0.5405, + "mean_token_accuracy": 0.8352708846330643, + "num_tokens": 69162326.0, + "step": 57510 + }, + { + "entropy": 1.7974600687623024, + "epoch": 0.17830696728584836, + "grad_norm": 8.123475074768066, + "learning_rate": 5.991179037190596e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8448081642389298, + "num_tokens": 69174698.0, + "step": 57520 + }, + { + "entropy": 1.8795274257659913, + "epoch": 0.17833796641089805, + "grad_norm": 5.942895412445068, + "learning_rate": 5.9906583050889985e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.847043776512146, + "num_tokens": 69186725.0, + "step": 57530 + }, + { + "entropy": 1.895848709344864, + "epoch": 0.17836896553594775, + "grad_norm": 10.697137832641602, + "learning_rate": 5.990137708744383e-06, + "loss": 0.5632, + "mean_token_accuracy": 0.8229943498969078, + "num_tokens": 69198477.0, + "step": 57540 + }, + { + "entropy": 1.8663012593984605, + "epoch": 0.17839996466099745, + "grad_norm": 10.071105003356934, + "learning_rate": 5.989617248097771e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8398059472441674, + "num_tokens": 69210496.0, + "step": 57550 + }, + { + "entropy": 1.8629123076796532, + "epoch": 0.17843096378604714, + "grad_norm": 8.073697090148926, + "learning_rate": 5.989096923090223e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.8346897527575493, + "num_tokens": 69222754.0, + "step": 57560 + }, + { + "entropy": 1.947044813632965, + "epoch": 0.17846196291109684, + "grad_norm": 10.283122062683105, + "learning_rate": 5.988576733662831e-06, + "loss": 0.5859, + "mean_token_accuracy": 0.8340685039758682, + "num_tokens": 69233747.0, + "step": 57570 + }, + { + "entropy": 1.8908654794096946, + "epoch": 0.17849296203614654, + "grad_norm": 8.264286041259766, + "learning_rate": 5.988056679756728e-06, + "loss": 0.573, + "mean_token_accuracy": 0.8300464496016502, + "num_tokens": 69244865.0, + "step": 57580 + }, + { + "entropy": 1.9280391097068788, + "epoch": 0.17852396116119623, + "grad_norm": 8.630804061889648, + "learning_rate": 5.9875367613130775e-06, + "loss": 0.5689, + "mean_token_accuracy": 0.8240269988775253, + "num_tokens": 69255702.0, + "step": 57590 + }, + { + "entropy": 1.935716523230076, + "epoch": 0.17855496028624593, + "grad_norm": 11.321623802185059, + "learning_rate": 5.987016978273085e-06, + "loss": 0.5363, + "mean_token_accuracy": 0.8363067075610161, + "num_tokens": 69266825.0, + "step": 57600 + }, + { + "entropy": 1.9014330476522445, + "epoch": 0.17858595941129563, + "grad_norm": 4.646116733551025, + "learning_rate": 5.986497330577986e-06, + "loss": 0.522, + "mean_token_accuracy": 0.8390189468860626, + "num_tokens": 69279081.0, + "step": 57610 + }, + { + "entropy": 1.9175360828638077, + "epoch": 0.17861695853634532, + "grad_norm": 9.152368545532227, + "learning_rate": 5.985977818169053e-06, + "loss": 0.5684, + "mean_token_accuracy": 0.8228761807084084, + "num_tokens": 69290443.0, + "step": 57620 + }, + { + "entropy": 1.8552556172013284, + "epoch": 0.17864795766139502, + "grad_norm": 7.568161487579346, + "learning_rate": 5.985458440987597e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8391215577721596, + "num_tokens": 69303861.0, + "step": 57630 + }, + { + "entropy": 1.7614154547452927, + "epoch": 0.17867895678644471, + "grad_norm": 3.9971959590911865, + "learning_rate": 5.984939198974961e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8515268057584763, + "num_tokens": 69317877.0, + "step": 57640 + }, + { + "entropy": 1.9185373350977897, + "epoch": 0.1787099559114944, + "grad_norm": 11.149886131286621, + "learning_rate": 5.984420092072528e-06, + "loss": 0.5383, + "mean_token_accuracy": 0.8318190723657608, + "num_tokens": 69329893.0, + "step": 57650 + }, + { + "entropy": 1.9363491863012314, + "epoch": 0.17874095503654408, + "grad_norm": 9.132575035095215, + "learning_rate": 5.983901120221711e-06, + "loss": 0.5636, + "mean_token_accuracy": 0.8298118308186531, + "num_tokens": 69341278.0, + "step": 57660 + }, + { + "entropy": 1.833200243115425, + "epoch": 0.17877195416159378, + "grad_norm": 4.589186668395996, + "learning_rate": 5.983382283363963e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8535523906350135, + "num_tokens": 69354018.0, + "step": 57670 + }, + { + "entropy": 1.952676109969616, + "epoch": 0.17880295328664347, + "grad_norm": 8.8577241897583, + "learning_rate": 5.9828635814407695e-06, + "loss": 0.5592, + "mean_token_accuracy": 0.8286620289087295, + "num_tokens": 69365463.0, + "step": 57680 + }, + { + "entropy": 1.901480646431446, + "epoch": 0.17883395241169317, + "grad_norm": 8.063117027282715, + "learning_rate": 5.9823450143936555e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8349184259772301, + "num_tokens": 69377683.0, + "step": 57690 + }, + { + "entropy": 1.8743933662772179, + "epoch": 0.17886495153674287, + "grad_norm": 8.365898132324219, + "learning_rate": 5.981826582164176e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.8368162304162979, + "num_tokens": 69390015.0, + "step": 57700 + }, + { + "entropy": 1.776149820536375, + "epoch": 0.17889595066179256, + "grad_norm": 9.224735260009766, + "learning_rate": 5.9813082846939264e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8448210716247558, + "num_tokens": 69402894.0, + "step": 57710 + }, + { + "entropy": 1.7384593583643437, + "epoch": 0.17892694978684226, + "grad_norm": 8.464068412780762, + "learning_rate": 5.980790121924534e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8530717685818672, + "num_tokens": 69416819.0, + "step": 57720 + }, + { + "entropy": 1.8666021645069122, + "epoch": 0.17895794891189196, + "grad_norm": 10.071202278137207, + "learning_rate": 5.980272093797667e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8388993337750434, + "num_tokens": 69428622.0, + "step": 57730 + }, + { + "entropy": 1.9105965510010718, + "epoch": 0.17898894803694165, + "grad_norm": 7.725157737731934, + "learning_rate": 5.979754200255019e-06, + "loss": 0.5431, + "mean_token_accuracy": 0.8272575065493584, + "num_tokens": 69439573.0, + "step": 57740 + }, + { + "entropy": 1.880422416329384, + "epoch": 0.17901994716199135, + "grad_norm": 4.6876630783081055, + "learning_rate": 5.979236441238329e-06, + "loss": 0.5393, + "mean_token_accuracy": 0.8352604731917381, + "num_tokens": 69451040.0, + "step": 57750 + }, + { + "entropy": 1.877747993171215, + "epoch": 0.17905094628704105, + "grad_norm": 3.8717169761657715, + "learning_rate": 5.978718816689365e-06, + "loss": 0.5186, + "mean_token_accuracy": 0.8352397963404655, + "num_tokens": 69462601.0, + "step": 57760 + }, + { + "entropy": 1.9101334929466247, + "epoch": 0.17908194541209074, + "grad_norm": 9.124340057373047, + "learning_rate": 5.978201326549935e-06, + "loss": 0.5731, + "mean_token_accuracy": 0.8258951917290688, + "num_tokens": 69474464.0, + "step": 57770 + }, + { + "entropy": 1.7846893429756165, + "epoch": 0.17911294453714044, + "grad_norm": 3.8451902866363525, + "learning_rate": 5.977683970761876e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8453226387500763, + "num_tokens": 69487808.0, + "step": 57780 + }, + { + "entropy": 1.8912085384130477, + "epoch": 0.17914394366219014, + "grad_norm": 9.206360816955566, + "learning_rate": 5.9771667492670675e-06, + "loss": 0.5444, + "mean_token_accuracy": 0.8258654251694679, + "num_tokens": 69499784.0, + "step": 57790 + }, + { + "entropy": 1.8862205877900124, + "epoch": 0.17917494278723983, + "grad_norm": 4.152833461761475, + "learning_rate": 5.97664966200742e-06, + "loss": 0.5273, + "mean_token_accuracy": 0.8353832781314849, + "num_tokens": 69511090.0, + "step": 57800 + }, + { + "entropy": 1.8243310183286667, + "epoch": 0.17920594191228953, + "grad_norm": 11.02452278137207, + "learning_rate": 5.9761327089248786e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.840731629729271, + "num_tokens": 69523788.0, + "step": 57810 + }, + { + "entropy": 1.9363866940140724, + "epoch": 0.17923694103733923, + "grad_norm": 4.2471923828125, + "learning_rate": 5.975615889961425e-06, + "loss": 0.5913, + "mean_token_accuracy": 0.8229147881269455, + "num_tokens": 69534659.0, + "step": 57820 + }, + { + "entropy": 1.8675603151321412, + "epoch": 0.17926794016238892, + "grad_norm": 8.263809204101562, + "learning_rate": 5.9750992050590765e-06, + "loss": 0.5291, + "mean_token_accuracy": 0.8418093547224998, + "num_tokens": 69546361.0, + "step": 57830 + }, + { + "entropy": 1.8654843851923943, + "epoch": 0.17929893928743862, + "grad_norm": 9.333740234375, + "learning_rate": 5.974582654159884e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8490580499172211, + "num_tokens": 69558369.0, + "step": 57840 + }, + { + "entropy": 1.7780871927738189, + "epoch": 0.17932993841248832, + "grad_norm": 3.7377822399139404, + "learning_rate": 5.974066237205935e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8493701010942459, + "num_tokens": 69571586.0, + "step": 57850 + }, + { + "entropy": 1.8709719255566597, + "epoch": 0.179360937537538, + "grad_norm": 9.209651947021484, + "learning_rate": 5.9735499541393515e-06, + "loss": 0.5304, + "mean_token_accuracy": 0.8328555643558502, + "num_tokens": 69584134.0, + "step": 57860 + }, + { + "entropy": 1.9305126339197158, + "epoch": 0.1793919366625877, + "grad_norm": 8.458016395568848, + "learning_rate": 5.9730338049022905e-06, + "loss": 0.5505, + "mean_token_accuracy": 0.8329010114073754, + "num_tokens": 69595889.0, + "step": 57870 + }, + { + "entropy": 1.9079819098114967, + "epoch": 0.1794229357876374, + "grad_norm": 8.438733100891113, + "learning_rate": 5.972517789436941e-06, + "loss": 0.5549, + "mean_token_accuracy": 0.8301453992724419, + "num_tokens": 69607872.0, + "step": 57880 + }, + { + "entropy": 1.8880163133144379, + "epoch": 0.1794539349126871, + "grad_norm": 7.341186046600342, + "learning_rate": 5.972001907685534e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8447798684239387, + "num_tokens": 69619608.0, + "step": 57890 + }, + { + "entropy": 1.9878419309854507, + "epoch": 0.1794849340377368, + "grad_norm": 8.431941032409668, + "learning_rate": 5.9714861595903275e-06, + "loss": 0.5709, + "mean_token_accuracy": 0.8319448336958886, + "num_tokens": 69630342.0, + "step": 57900 + }, + { + "entropy": 1.9403752774000167, + "epoch": 0.17951593316278647, + "grad_norm": 8.231103897094727, + "learning_rate": 5.9709705450936195e-06, + "loss": 0.5535, + "mean_token_accuracy": 0.8287773564457893, + "num_tokens": 69641589.0, + "step": 57910 + }, + { + "entropy": 1.8349755868315696, + "epoch": 0.17954693228783616, + "grad_norm": 8.210265159606934, + "learning_rate": 5.9704550641377414e-06, + "loss": 0.5199, + "mean_token_accuracy": 0.8326278194785118, + "num_tokens": 69654695.0, + "step": 57920 + }, + { + "entropy": 1.8748487085103989, + "epoch": 0.17957793141288586, + "grad_norm": 4.568990230560303, + "learning_rate": 5.96993971666506e-06, + "loss": 0.516, + "mean_token_accuracy": 0.8239820554852486, + "num_tokens": 69667546.0, + "step": 57930 + }, + { + "entropy": 1.957274827361107, + "epoch": 0.17960893053793556, + "grad_norm": 8.795912742614746, + "learning_rate": 5.969424502617975e-06, + "loss": 0.5353, + "mean_token_accuracy": 0.8339650616049766, + "num_tokens": 69678423.0, + "step": 57940 + }, + { + "entropy": 1.9020697817206382, + "epoch": 0.17963992966298525, + "grad_norm": 8.028707504272461, + "learning_rate": 5.968909421938924e-06, + "loss": 0.499, + "mean_token_accuracy": 0.83273094445467, + "num_tokens": 69690504.0, + "step": 57950 + }, + { + "entropy": 1.9501001119613648, + "epoch": 0.17967092878803495, + "grad_norm": 8.39437198638916, + "learning_rate": 5.968394474570377e-06, + "loss": 0.5423, + "mean_token_accuracy": 0.8293194517493248, + "num_tokens": 69701909.0, + "step": 57960 + }, + { + "entropy": 1.8961822897195817, + "epoch": 0.17970192791308465, + "grad_norm": 3.989229202270508, + "learning_rate": 5.9678796604548385e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8313602998852729, + "num_tokens": 69713928.0, + "step": 57970 + }, + { + "entropy": 1.9584363132715226, + "epoch": 0.17973292703813434, + "grad_norm": 10.755518913269043, + "learning_rate": 5.967364979534849e-06, + "loss": 0.5435, + "mean_token_accuracy": 0.8270615398883819, + "num_tokens": 69725093.0, + "step": 57980 + }, + { + "entropy": 1.883443070948124, + "epoch": 0.17976392616318404, + "grad_norm": 8.810373306274414, + "learning_rate": 5.966850431752984e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8435442551970482, + "num_tokens": 69736710.0, + "step": 57990 + }, + { + "entropy": 1.838029107451439, + "epoch": 0.17979492528823374, + "grad_norm": 7.354464530944824, + "learning_rate": 5.9663360170518524e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8321570068597793, + "num_tokens": 69749525.0, + "step": 58000 + }, + { + "entropy": 1.860961727797985, + "epoch": 0.17982592441328343, + "grad_norm": 8.58539867401123, + "learning_rate": 5.965821735374097e-06, + "loss": 0.5197, + "mean_token_accuracy": 0.842749148607254, + "num_tokens": 69761090.0, + "step": 58010 + }, + { + "entropy": 1.888116455078125, + "epoch": 0.17985692353833313, + "grad_norm": 8.116665840148926, + "learning_rate": 5.965307586662398e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8538775756955147, + "num_tokens": 69773218.0, + "step": 58020 + }, + { + "entropy": 1.88920701444149, + "epoch": 0.17988792266338283, + "grad_norm": 8.906111717224121, + "learning_rate": 5.964793570859469e-06, + "loss": 0.5258, + "mean_token_accuracy": 0.8328505247831345, + "num_tokens": 69785392.0, + "step": 58030 + }, + { + "entropy": 1.9361517250537872, + "epoch": 0.17991892178843252, + "grad_norm": 10.055381774902344, + "learning_rate": 5.964279687908057e-06, + "loss": 0.5385, + "mean_token_accuracy": 0.8345837727189064, + "num_tokens": 69796770.0, + "step": 58040 + }, + { + "entropy": 1.856403675675392, + "epoch": 0.17994992091348222, + "grad_norm": 2.6345648765563965, + "learning_rate": 5.963765937750943e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8511771202087403, + "num_tokens": 69808863.0, + "step": 58050 + }, + { + "entropy": 1.8975456178188324, + "epoch": 0.17998092003853192, + "grad_norm": 9.3054780960083, + "learning_rate": 5.963252320330947e-06, + "loss": 0.5478, + "mean_token_accuracy": 0.827577319741249, + "num_tokens": 69820718.0, + "step": 58060 + }, + { + "entropy": 1.8229490399360657, + "epoch": 0.1800119191635816, + "grad_norm": 8.946834564208984, + "learning_rate": 5.962738835590917e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8495317012071609, + "num_tokens": 69834215.0, + "step": 58070 + }, + { + "entropy": 1.9095330134034156, + "epoch": 0.1800429182886313, + "grad_norm": 8.070923805236816, + "learning_rate": 5.962225483473742e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8391377106308937, + "num_tokens": 69846084.0, + "step": 58080 + }, + { + "entropy": 1.7783651992678642, + "epoch": 0.180073917413681, + "grad_norm": 7.508241176605225, + "learning_rate": 5.961712263922337e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8533773466944694, + "num_tokens": 69859333.0, + "step": 58090 + }, + { + "entropy": 1.926021693646908, + "epoch": 0.1801049165387307, + "grad_norm": 9.783551216125488, + "learning_rate": 5.961199176879661e-06, + "loss": 0.5203, + "mean_token_accuracy": 0.8343341812491417, + "num_tokens": 69870605.0, + "step": 58100 + }, + { + "entropy": 1.9040902271866798, + "epoch": 0.1801359156637804, + "grad_norm": 8.443004608154297, + "learning_rate": 5.960686222288703e-06, + "loss": 0.5384, + "mean_token_accuracy": 0.8259869500994682, + "num_tokens": 69883254.0, + "step": 58110 + }, + { + "entropy": 1.887841096520424, + "epoch": 0.1801669147888301, + "grad_norm": 9.13963794708252, + "learning_rate": 5.960173400092483e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8409109726548195, + "num_tokens": 69895422.0, + "step": 58120 + }, + { + "entropy": 1.8949230402708053, + "epoch": 0.1801979139138798, + "grad_norm": 8.608329772949219, + "learning_rate": 5.9596607102340605e-06, + "loss": 0.5482, + "mean_token_accuracy": 0.8356387764215469, + "num_tokens": 69907555.0, + "step": 58130 + }, + { + "entropy": 1.9181236669421196, + "epoch": 0.1802289130389295, + "grad_norm": 7.1196513175964355, + "learning_rate": 5.959148152656526e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8519378170371056, + "num_tokens": 69919079.0, + "step": 58140 + }, + { + "entropy": 1.9497553408145905, + "epoch": 0.18025991216397916, + "grad_norm": 9.516104698181152, + "learning_rate": 5.958635727303008e-06, + "loss": 0.5585, + "mean_token_accuracy": 0.8375312581658363, + "num_tokens": 69929946.0, + "step": 58150 + }, + { + "entropy": 1.8249136477708816, + "epoch": 0.18029091128902885, + "grad_norm": 4.427674770355225, + "learning_rate": 5.958123434116665e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8447049587965012, + "num_tokens": 69942319.0, + "step": 58160 + }, + { + "entropy": 1.8562700033187867, + "epoch": 0.18032191041407855, + "grad_norm": 9.44267749786377, + "learning_rate": 5.957611273040691e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8396990388631821, + "num_tokens": 69954897.0, + "step": 58170 + }, + { + "entropy": 1.9479858607053757, + "epoch": 0.18035290953912825, + "grad_norm": 4.776163101196289, + "learning_rate": 5.9570992440183166e-06, + "loss": 0.5244, + "mean_token_accuracy": 0.8349523320794106, + "num_tokens": 69966494.0, + "step": 58180 + }, + { + "entropy": 1.9115686953067779, + "epoch": 0.18038390866417794, + "grad_norm": 8.399941444396973, + "learning_rate": 5.956587346992802e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8438914701342582, + "num_tokens": 69978598.0, + "step": 58190 + }, + { + "entropy": 1.8765361204743385, + "epoch": 0.18041490778922764, + "grad_norm": 9.405889511108398, + "learning_rate": 5.956075581907446e-06, + "loss": 0.5624, + "mean_token_accuracy": 0.8236325919628144, + "num_tokens": 69990680.0, + "step": 58200 + }, + { + "entropy": 1.9489685088396071, + "epoch": 0.18044590691427734, + "grad_norm": 8.940106391906738, + "learning_rate": 5.955563948705578e-06, + "loss": 0.5632, + "mean_token_accuracy": 0.8357775121927261, + "num_tokens": 70001208.0, + "step": 58210 + }, + { + "entropy": 1.8073627695441246, + "epoch": 0.18047690603932703, + "grad_norm": 7.959947109222412, + "learning_rate": 5.955052447330566e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.856120876967907, + "num_tokens": 70015168.0, + "step": 58220 + }, + { + "entropy": 1.912624678015709, + "epoch": 0.18050790516437673, + "grad_norm": 10.096214294433594, + "learning_rate": 5.954541077725806e-06, + "loss": 0.5486, + "mean_token_accuracy": 0.821705561876297, + "num_tokens": 70027352.0, + "step": 58230 + }, + { + "entropy": 1.7947795525193215, + "epoch": 0.18053890428942643, + "grad_norm": 3.570014715194702, + "learning_rate": 5.954029839834733e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8589225232601165, + "num_tokens": 70040539.0, + "step": 58240 + }, + { + "entropy": 1.9037404909729958, + "epoch": 0.18056990341447612, + "grad_norm": 7.7414655685424805, + "learning_rate": 5.953518733600813e-06, + "loss": 0.5514, + "mean_token_accuracy": 0.828503304719925, + "num_tokens": 70052279.0, + "step": 58250 + }, + { + "entropy": 1.898376226425171, + "epoch": 0.18060090253952582, + "grad_norm": 4.189719200134277, + "learning_rate": 5.953007758967547e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8411821514368057, + "num_tokens": 70064479.0, + "step": 58260 + }, + { + "entropy": 1.8012950256466866, + "epoch": 0.18063190166457552, + "grad_norm": 2.9318971633911133, + "learning_rate": 5.952496915878471e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8473293244838714, + "num_tokens": 70077477.0, + "step": 58270 + }, + { + "entropy": 1.883352592587471, + "epoch": 0.1806629007896252, + "grad_norm": 8.893994331359863, + "learning_rate": 5.951986204277154e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8392975389957428, + "num_tokens": 70089437.0, + "step": 58280 + }, + { + "entropy": 1.9155563935637474, + "epoch": 0.1806938999146749, + "grad_norm": 9.788602828979492, + "learning_rate": 5.951475624107198e-06, + "loss": 0.527, + "mean_token_accuracy": 0.8240778625011445, + "num_tokens": 70101237.0, + "step": 58290 + }, + { + "entropy": 1.8512123331427575, + "epoch": 0.1807248990397246, + "grad_norm": 8.938436508178711, + "learning_rate": 5.950965175312241e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.836862875521183, + "num_tokens": 70114119.0, + "step": 58300 + }, + { + "entropy": 1.8680644989013673, + "epoch": 0.1807558981647743, + "grad_norm": 11.42545223236084, + "learning_rate": 5.950454857835951e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8473235473036766, + "num_tokens": 70125999.0, + "step": 58310 + }, + { + "entropy": 1.8300606310367584, + "epoch": 0.180786897289824, + "grad_norm": 6.41502046585083, + "learning_rate": 5.949944671622034e-06, + "loss": 0.5353, + "mean_token_accuracy": 0.8300027817487716, + "num_tokens": 70139069.0, + "step": 58320 + }, + { + "entropy": 1.9075913026928901, + "epoch": 0.1808178964148737, + "grad_norm": 3.9195168018341064, + "learning_rate": 5.949434616614229e-06, + "loss": 0.5197, + "mean_token_accuracy": 0.8323316410183906, + "num_tokens": 70151975.0, + "step": 58330 + }, + { + "entropy": 1.979878196120262, + "epoch": 0.1808488955399234, + "grad_norm": 8.486638069152832, + "learning_rate": 5.948924692756306e-06, + "loss": 0.5894, + "mean_token_accuracy": 0.8257222607731819, + "num_tokens": 70162362.0, + "step": 58340 + }, + { + "entropy": 1.891125389933586, + "epoch": 0.1808798946649731, + "grad_norm": 4.342275142669678, + "learning_rate": 5.948414899992072e-06, + "loss": 0.542, + "mean_token_accuracy": 0.8310712307691575, + "num_tokens": 70174516.0, + "step": 58350 + }, + { + "entropy": 1.8640663996338844, + "epoch": 0.1809108937900228, + "grad_norm": 2.7677135467529297, + "learning_rate": 5.947905238265366e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8343622460961342, + "num_tokens": 70186361.0, + "step": 58360 + }, + { + "entropy": 1.863309583067894, + "epoch": 0.18094189291507248, + "grad_norm": 9.252532958984375, + "learning_rate": 5.947395707520059e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.8355363115668297, + "num_tokens": 70198676.0, + "step": 58370 + }, + { + "entropy": 1.9271214351058006, + "epoch": 0.18097289204012218, + "grad_norm": 8.357312202453613, + "learning_rate": 5.946886307700062e-06, + "loss": 0.535, + "mean_token_accuracy": 0.8216869860887528, + "num_tokens": 70210561.0, + "step": 58380 + }, + { + "entropy": 1.8037690997123719, + "epoch": 0.18100389116517188, + "grad_norm": 8.436007499694824, + "learning_rate": 5.94637703874931e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8438222482800484, + "num_tokens": 70224444.0, + "step": 58390 + }, + { + "entropy": 1.9045485824346542, + "epoch": 0.18103489029022155, + "grad_norm": 10.353751182556152, + "learning_rate": 5.9458679006117815e-06, + "loss": 0.5498, + "mean_token_accuracy": 0.8378313854336739, + "num_tokens": 70236240.0, + "step": 58400 + }, + { + "entropy": 1.9282925948500633, + "epoch": 0.18106588941527124, + "grad_norm": 9.726922035217285, + "learning_rate": 5.94535889323148e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8426464855670929, + "num_tokens": 70247776.0, + "step": 58410 + }, + { + "entropy": 1.9179017692804337, + "epoch": 0.18109688854032094, + "grad_norm": 8.368406295776367, + "learning_rate": 5.944850016552449e-06, + "loss": 0.5916, + "mean_token_accuracy": 0.820852018892765, + "num_tokens": 70259710.0, + "step": 58420 + }, + { + "entropy": 1.9104688361287117, + "epoch": 0.18112788766537063, + "grad_norm": 9.314393997192383, + "learning_rate": 5.944341270518763e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8437484547495842, + "num_tokens": 70271807.0, + "step": 58430 + }, + { + "entropy": 1.9487433806061745, + "epoch": 0.18115888679042033, + "grad_norm": 9.065957069396973, + "learning_rate": 5.943832655074528e-06, + "loss": 0.5531, + "mean_token_accuracy": 0.822340051829815, + "num_tokens": 70283681.0, + "step": 58440 + }, + { + "entropy": 1.8573688492178917, + "epoch": 0.18118988591547003, + "grad_norm": 8.39069652557373, + "learning_rate": 5.943324170163888e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.8431822940707207, + "num_tokens": 70296587.0, + "step": 58450 + }, + { + "entropy": 1.8324483245611192, + "epoch": 0.18122088504051972, + "grad_norm": 4.025021076202393, + "learning_rate": 5.942815815731015e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8505256190896034, + "num_tokens": 70308977.0, + "step": 58460 + }, + { + "entropy": 1.9166489720344544, + "epoch": 0.18125188416556942, + "grad_norm": 7.720203876495361, + "learning_rate": 5.942307591720121e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8491469100117683, + "num_tokens": 70320331.0, + "step": 58470 + }, + { + "entropy": 1.8525561004877091, + "epoch": 0.18128288329061912, + "grad_norm": 8.706543922424316, + "learning_rate": 5.941799498075445e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8432158127427101, + "num_tokens": 70331972.0, + "step": 58480 + }, + { + "entropy": 1.734443087875843, + "epoch": 0.18131388241566881, + "grad_norm": 4.216269493103027, + "learning_rate": 5.9412915347412624e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8574797987937928, + "num_tokens": 70345958.0, + "step": 58490 + }, + { + "entropy": 1.885965469479561, + "epoch": 0.1813448815407185, + "grad_norm": 8.010455131530762, + "learning_rate": 5.940783701661882e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8380753323435783, + "num_tokens": 70357917.0, + "step": 58500 + }, + { + "entropy": 1.9255839720368386, + "epoch": 0.1813758806657682, + "grad_norm": 8.077774047851562, + "learning_rate": 5.940275998781646e-06, + "loss": 0.5291, + "mean_token_accuracy": 0.8398918956518173, + "num_tokens": 70369190.0, + "step": 58510 + }, + { + "entropy": 1.8902644366025925, + "epoch": 0.1814068797908179, + "grad_norm": 9.07833480834961, + "learning_rate": 5.939768426044928e-06, + "loss": 0.5209, + "mean_token_accuracy": 0.8410121381282807, + "num_tokens": 70381045.0, + "step": 58520 + }, + { + "entropy": 1.8733600035309792, + "epoch": 0.1814378789158676, + "grad_norm": 7.863108158111572, + "learning_rate": 5.939260983396139e-06, + "loss": 0.5245, + "mean_token_accuracy": 0.8383708387613297, + "num_tokens": 70394169.0, + "step": 58530 + }, + { + "entropy": 1.9640895150601865, + "epoch": 0.1814688780409173, + "grad_norm": 9.28256607055664, + "learning_rate": 5.938753670779716e-06, + "loss": 0.5632, + "mean_token_accuracy": 0.8161120370030404, + "num_tokens": 70405772.0, + "step": 58540 + }, + { + "entropy": 1.8138402692973614, + "epoch": 0.181499877165967, + "grad_norm": 9.445598602294922, + "learning_rate": 5.938246488140139e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8572772964835167, + "num_tokens": 70418813.0, + "step": 58550 + }, + { + "entropy": 1.9521236777305604, + "epoch": 0.1815308762910167, + "grad_norm": 6.890706539154053, + "learning_rate": 5.937739435421913e-06, + "loss": 0.5465, + "mean_token_accuracy": 0.8317166283726692, + "num_tokens": 70430085.0, + "step": 58560 + }, + { + "entropy": 1.8753592044115066, + "epoch": 0.1815618754160664, + "grad_norm": Infinity, + "learning_rate": 5.9372325125695796e-06, + "loss": 0.513, + "mean_token_accuracy": 0.8267385110259056, + "num_tokens": 70442432.0, + "step": 58570 + }, + { + "entropy": 1.951469287276268, + "epoch": 0.18159287454111608, + "grad_norm": 10.176626205444336, + "learning_rate": 5.936725719527712e-06, + "loss": 0.5294, + "mean_token_accuracy": 0.8351786851882934, + "num_tokens": 70453518.0, + "step": 58580 + }, + { + "entropy": 2.006668972969055, + "epoch": 0.18162387366616578, + "grad_norm": 10.691234588623047, + "learning_rate": 5.9362190562409196e-06, + "loss": 0.6187, + "mean_token_accuracy": 0.8179316386580467, + "num_tokens": 70464169.0, + "step": 58590 + }, + { + "entropy": 1.9340445652604104, + "epoch": 0.18165487279121548, + "grad_norm": 8.558653831481934, + "learning_rate": 5.93571252265384e-06, + "loss": 0.5382, + "mean_token_accuracy": 0.8301935106515884, + "num_tokens": 70476295.0, + "step": 58600 + }, + { + "entropy": 2.03122977912426, + "epoch": 0.18168587191626517, + "grad_norm": 8.362186431884766, + "learning_rate": 5.935206118711151e-06, + "loss": 0.6264, + "mean_token_accuracy": 0.8174452885985375, + "num_tokens": 70487260.0, + "step": 58610 + }, + { + "entropy": 1.9458183541893959, + "epoch": 0.18171687104131487, + "grad_norm": 4.3328633308410645, + "learning_rate": 5.934699844357555e-06, + "loss": 0.5, + "mean_token_accuracy": 0.834861546754837, + "num_tokens": 70498573.0, + "step": 58620 + }, + { + "entropy": 1.9818433836102485, + "epoch": 0.18174787016636457, + "grad_norm": 9.87380313873291, + "learning_rate": 5.934193699537794e-06, + "loss": 0.5678, + "mean_token_accuracy": 0.8219478592276573, + "num_tokens": 70510126.0, + "step": 58630 + }, + { + "entropy": 1.9035624399781228, + "epoch": 0.18177886929141426, + "grad_norm": 7.710408687591553, + "learning_rate": 5.933687684196638e-06, + "loss": 0.5346, + "mean_token_accuracy": 0.8381109595298767, + "num_tokens": 70522421.0, + "step": 58640 + }, + { + "entropy": 1.9757555976510048, + "epoch": 0.18180986841646393, + "grad_norm": 8.179011344909668, + "learning_rate": 5.933181798278895e-06, + "loss": 0.5466, + "mean_token_accuracy": 0.8351363927125931, + "num_tokens": 70534084.0, + "step": 58650 + }, + { + "entropy": 1.9270098477602005, + "epoch": 0.18184086754151363, + "grad_norm": 4.406381130218506, + "learning_rate": 5.9326760417294036e-06, + "loss": 0.5338, + "mean_token_accuracy": 0.8375926703214646, + "num_tokens": 70545476.0, + "step": 58660 + }, + { + "entropy": 1.9433436125516892, + "epoch": 0.18187186666656333, + "grad_norm": 8.257979393005371, + "learning_rate": 5.9321704144930335e-06, + "loss": 0.6162, + "mean_token_accuracy": 0.8256099238991738, + "num_tokens": 70557302.0, + "step": 58670 + }, + { + "entropy": 1.8725889652967453, + "epoch": 0.18190286579161302, + "grad_norm": 8.043081283569336, + "learning_rate": 5.931664916514689e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8415656134486198, + "num_tokens": 70569142.0, + "step": 58680 + }, + { + "entropy": 1.922675184905529, + "epoch": 0.18193386491666272, + "grad_norm": 9.857270240783691, + "learning_rate": 5.931159547739309e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.8421023860573769, + "num_tokens": 70580427.0, + "step": 58690 + }, + { + "entropy": 1.8865813314914703, + "epoch": 0.18196486404171242, + "grad_norm": 8.099855422973633, + "learning_rate": 5.9306543081118605e-06, + "loss": 0.5244, + "mean_token_accuracy": 0.8428427502512932, + "num_tokens": 70591829.0, + "step": 58700 + }, + { + "entropy": 1.7871639668941497, + "epoch": 0.1819958631667621, + "grad_norm": 8.50505256652832, + "learning_rate": 5.9301491975773485e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8578494325280189, + "num_tokens": 70605530.0, + "step": 58710 + }, + { + "entropy": 1.9226329445838928, + "epoch": 0.1820268622918118, + "grad_norm": 9.982844352722168, + "learning_rate": 5.929644216080808e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8366420567035675, + "num_tokens": 70616806.0, + "step": 58720 + }, + { + "entropy": 1.8149694345891476, + "epoch": 0.1820578614168615, + "grad_norm": 8.859916687011719, + "learning_rate": 5.9291393635673065e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8594744309782982, + "num_tokens": 70629601.0, + "step": 58730 + }, + { + "entropy": 1.8165232956409454, + "epoch": 0.1820888605419112, + "grad_norm": 8.397076606750488, + "learning_rate": 5.928634639981946e-06, + "loss": 0.5598, + "mean_token_accuracy": 0.8303966209292412, + "num_tokens": 70642458.0, + "step": 58740 + }, + { + "entropy": 1.8342901065945625, + "epoch": 0.1821198596669609, + "grad_norm": 4.7208123207092285, + "learning_rate": 5.9281300452698584e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.850199481844902, + "num_tokens": 70655561.0, + "step": 58750 + }, + { + "entropy": 1.959836632013321, + "epoch": 0.1821508587920106, + "grad_norm": 7.517179489135742, + "learning_rate": 5.927625579376213e-06, + "loss": 0.5588, + "mean_token_accuracy": 0.8339650839567184, + "num_tokens": 70666015.0, + "step": 58760 + }, + { + "entropy": 1.8640755087137222, + "epoch": 0.1821818579170603, + "grad_norm": 8.22607135772705, + "learning_rate": 5.927121242246206e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8419701069593429, + "num_tokens": 70678336.0, + "step": 58770 + }, + { + "entropy": 1.8295077085494995, + "epoch": 0.18221285704211, + "grad_norm": 7.90572452545166, + "learning_rate": 5.9266170338250715e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8469627618789672, + "num_tokens": 70691062.0, + "step": 58780 + }, + { + "entropy": 1.8726282447576523, + "epoch": 0.18224385616715968, + "grad_norm": 9.512833595275879, + "learning_rate": 5.926112954058072e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.840190976858139, + "num_tokens": 70702498.0, + "step": 58790 + }, + { + "entropy": 1.8806493178009986, + "epoch": 0.18227485529220938, + "grad_norm": 9.236995697021484, + "learning_rate": 5.925609002890504e-06, + "loss": 0.5467, + "mean_token_accuracy": 0.8324322685599327, + "num_tokens": 70713997.0, + "step": 58800 + }, + { + "entropy": 1.9413601398468017, + "epoch": 0.18230585441725908, + "grad_norm": 8.431058883666992, + "learning_rate": 5.9251051802677e-06, + "loss": 0.5951, + "mean_token_accuracy": 0.8179917827248573, + "num_tokens": 70725865.0, + "step": 58810 + }, + { + "entropy": 1.8975177273154258, + "epoch": 0.18233685354230877, + "grad_norm": 9.826079368591309, + "learning_rate": 5.9246014861350176e-06, + "loss": 0.5408, + "mean_token_accuracy": 0.8294125840067863, + "num_tokens": 70737735.0, + "step": 58820 + }, + { + "entropy": 1.8288067810237407, + "epoch": 0.18236785266735847, + "grad_norm": 8.987369537353516, + "learning_rate": 5.924097920437855e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8347997352480888, + "num_tokens": 70750964.0, + "step": 58830 + }, + { + "entropy": 1.9133558467030525, + "epoch": 0.18239885179240817, + "grad_norm": 8.020203590393066, + "learning_rate": 5.923594483121636e-06, + "loss": 0.553, + "mean_token_accuracy": 0.8251760572195053, + "num_tokens": 70762325.0, + "step": 58840 + }, + { + "entropy": 1.9130362540483474, + "epoch": 0.18242985091745786, + "grad_norm": 4.7962446212768555, + "learning_rate": 5.923091174131822e-06, + "loss": 0.5566, + "mean_token_accuracy": 0.8287200018763542, + "num_tokens": 70773880.0, + "step": 58850 + }, + { + "entropy": 1.8333944439888001, + "epoch": 0.18246085004250756, + "grad_norm": 8.215493202209473, + "learning_rate": 5.922587993413905e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8558073043823242, + "num_tokens": 70786236.0, + "step": 58860 + }, + { + "entropy": 1.8580266639590264, + "epoch": 0.18249184916755726, + "grad_norm": 8.1721830368042, + "learning_rate": 5.922084940913409e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8527326479554176, + "num_tokens": 70798107.0, + "step": 58870 + }, + { + "entropy": 1.8850371599197389, + "epoch": 0.18252284829260695, + "grad_norm": 9.118043899536133, + "learning_rate": 5.921582016575889e-06, + "loss": 0.5387, + "mean_token_accuracy": 0.8236063331365585, + "num_tokens": 70809614.0, + "step": 58880 + }, + { + "entropy": 1.819055077433586, + "epoch": 0.18255384741765662, + "grad_norm": 6.715834617614746, + "learning_rate": 5.921079220346936e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8425466433167458, + "num_tokens": 70821709.0, + "step": 58890 + }, + { + "entropy": 1.9392906963825225, + "epoch": 0.18258484654270632, + "grad_norm": 9.14821720123291, + "learning_rate": 5.920576552172171e-06, + "loss": 0.5465, + "mean_token_accuracy": 0.8290189146995545, + "num_tokens": 70833465.0, + "step": 58900 + }, + { + "entropy": 1.8775691345334053, + "epoch": 0.18261584566775602, + "grad_norm": 9.709592819213867, + "learning_rate": 5.920074011997246e-06, + "loss": 0.5464, + "mean_token_accuracy": 0.8243531733751297, + "num_tokens": 70845723.0, + "step": 58910 + }, + { + "entropy": 1.8897256642580031, + "epoch": 0.1826468447928057, + "grad_norm": 8.386919975280762, + "learning_rate": 5.919571599767849e-06, + "loss": 0.5157, + "mean_token_accuracy": 0.8361333101987839, + "num_tokens": 70858171.0, + "step": 58920 + }, + { + "entropy": 1.810855358839035, + "epoch": 0.1826778439178554, + "grad_norm": 7.87431001663208, + "learning_rate": 5.919069315429698e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8387002602219582, + "num_tokens": 70870960.0, + "step": 58930 + }, + { + "entropy": 1.812534037232399, + "epoch": 0.1827088430429051, + "grad_norm": 9.197452545166016, + "learning_rate": 5.91856715892854e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8409740015864372, + "num_tokens": 70884301.0, + "step": 58940 + }, + { + "entropy": 1.8245099663734436, + "epoch": 0.1827398421679548, + "grad_norm": 3.566840410232544, + "learning_rate": 5.918065130210162e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8494891449809074, + "num_tokens": 70897472.0, + "step": 58950 + }, + { + "entropy": 1.8804219841957093, + "epoch": 0.1827708412930045, + "grad_norm": 9.882930755615234, + "learning_rate": 5.917563229220378e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8421027392148972, + "num_tokens": 70909937.0, + "step": 58960 + }, + { + "entropy": 1.8350219413638116, + "epoch": 0.1828018404180542, + "grad_norm": 5.378752708435059, + "learning_rate": 5.917061455905032e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.8302841424942017, + "num_tokens": 70922611.0, + "step": 58970 + }, + { + "entropy": 1.9114596903324128, + "epoch": 0.1828328395431039, + "grad_norm": 9.584024429321289, + "learning_rate": 5.9165598102100065e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.8317157745361328, + "num_tokens": 70934342.0, + "step": 58980 + }, + { + "entropy": 1.9586676806211472, + "epoch": 0.1828638386681536, + "grad_norm": 9.2459135055542, + "learning_rate": 5.916058292081212e-06, + "loss": 0.5584, + "mean_token_accuracy": 0.8311464220285416, + "num_tokens": 70945014.0, + "step": 58990 + }, + { + "entropy": 1.8728288680315017, + "epoch": 0.18289483779320329, + "grad_norm": 3.3124911785125732, + "learning_rate": 5.91555690146459e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8396729901432991, + "num_tokens": 70957244.0, + "step": 59000 + }, + { + "entropy": 1.8486259281635284, + "epoch": 0.18292583691825298, + "grad_norm": 7.671779155731201, + "learning_rate": 5.9150556383061166e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.8414855524897575, + "num_tokens": 70969181.0, + "step": 59010 + }, + { + "entropy": 1.8501243054866792, + "epoch": 0.18295683604330268, + "grad_norm": 7.360293865203857, + "learning_rate": 5.914554502551802e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8381767392158508, + "num_tokens": 70981480.0, + "step": 59020 + }, + { + "entropy": 1.9539805203676224, + "epoch": 0.18298783516835238, + "grad_norm": 8.94968032836914, + "learning_rate": 5.91405349414768e-06, + "loss": 0.6436, + "mean_token_accuracy": 0.8138018161058426, + "num_tokens": 70992634.0, + "step": 59030 + }, + { + "entropy": 1.7947911590337753, + "epoch": 0.18301883429340207, + "grad_norm": 8.198612213134766, + "learning_rate": 5.913552613039827e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8452375203371048, + "num_tokens": 71006127.0, + "step": 59040 + }, + { + "entropy": 1.837024737894535, + "epoch": 0.18304983341845177, + "grad_norm": 3.7017059326171875, + "learning_rate": 5.913051859174345e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.8444261506199837, + "num_tokens": 71018041.0, + "step": 59050 + }, + { + "entropy": 1.7747617959976196, + "epoch": 0.18308083254350146, + "grad_norm": 7.044408321380615, + "learning_rate": 5.9125512324973685e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8478129550814628, + "num_tokens": 71031917.0, + "step": 59060 + }, + { + "entropy": 1.8644176304340363, + "epoch": 0.18311183166855116, + "grad_norm": 9.228703498840332, + "learning_rate": 5.9120507329550645e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8401506051421166, + "num_tokens": 71044161.0, + "step": 59070 + }, + { + "entropy": 1.8438007518649102, + "epoch": 0.18314283079360086, + "grad_norm": 4.598083019256592, + "learning_rate": 5.9115503604936345e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8480850771069527, + "num_tokens": 71057367.0, + "step": 59080 + }, + { + "entropy": 1.9214376494288445, + "epoch": 0.18317382991865055, + "grad_norm": 11.023812294006348, + "learning_rate": 5.911050115059307e-06, + "loss": 0.5445, + "mean_token_accuracy": 0.8322188884019852, + "num_tokens": 71068645.0, + "step": 59090 + }, + { + "entropy": 1.9257762372493743, + "epoch": 0.18320482904370025, + "grad_norm": 7.3912129402160645, + "learning_rate": 5.910549996598346e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.8420733541250229, + "num_tokens": 71079883.0, + "step": 59100 + }, + { + "entropy": 1.9468917578458786, + "epoch": 0.18323582816874995, + "grad_norm": 8.324677467346191, + "learning_rate": 5.910050005057045e-06, + "loss": 0.5494, + "mean_token_accuracy": 0.8290826484560967, + "num_tokens": 71090907.0, + "step": 59110 + }, + { + "entropy": 1.930706176161766, + "epoch": 0.18326682729379964, + "grad_norm": 9.13598918914795, + "learning_rate": 5.909550140381733e-06, + "loss": 0.5705, + "mean_token_accuracy": 0.8350569799542427, + "num_tokens": 71101884.0, + "step": 59120 + }, + { + "entropy": 1.8789242595434188, + "epoch": 0.18329782641884934, + "grad_norm": 10.5730619430542, + "learning_rate": 5.9090504025187655e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8466743901371956, + "num_tokens": 71113590.0, + "step": 59130 + }, + { + "entropy": 1.901440866291523, + "epoch": 0.183328825543899, + "grad_norm": 12.086167335510254, + "learning_rate": 5.908550791414533e-06, + "loss": 0.5615, + "mean_token_accuracy": 0.8337482139468193, + "num_tokens": 71125988.0, + "step": 59140 + }, + { + "entropy": 1.8326066508889198, + "epoch": 0.1833598246689487, + "grad_norm": 2.8919966220855713, + "learning_rate": 5.908051307015459e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.844254644215107, + "num_tokens": 71139896.0, + "step": 59150 + }, + { + "entropy": 1.9314706429839135, + "epoch": 0.1833908237939984, + "grad_norm": 8.493422508239746, + "learning_rate": 5.907551949267995e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.8385980620980262, + "num_tokens": 71151553.0, + "step": 59160 + }, + { + "entropy": 1.8806381478905678, + "epoch": 0.1834218229190481, + "grad_norm": 8.074383735656738, + "learning_rate": 5.907052718118627e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8440838649868965, + "num_tokens": 71163731.0, + "step": 59170 + }, + { + "entropy": 1.845897839963436, + "epoch": 0.1834528220440978, + "grad_norm": 10.374253273010254, + "learning_rate": 5.9065536135138725e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8441713094711304, + "num_tokens": 71175735.0, + "step": 59180 + }, + { + "entropy": 1.9331976994872093, + "epoch": 0.1834838211691475, + "grad_norm": 8.653754234313965, + "learning_rate": 5.906054635400278e-06, + "loss": 0.5448, + "mean_token_accuracy": 0.834554848074913, + "num_tokens": 71187616.0, + "step": 59190 + }, + { + "entropy": 1.8752469643950462, + "epoch": 0.1835148202941972, + "grad_norm": 9.133798599243164, + "learning_rate": 5.905555783724424e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8436867401003838, + "num_tokens": 71199959.0, + "step": 59200 + }, + { + "entropy": 1.8521633878350259, + "epoch": 0.1835458194192469, + "grad_norm": 8.649680137634277, + "learning_rate": 5.905057058432922e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.842129784822464, + "num_tokens": 71212099.0, + "step": 59210 + }, + { + "entropy": 1.8513922914862633, + "epoch": 0.18357681854429658, + "grad_norm": 9.206934928894043, + "learning_rate": 5.904558459472417e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.8332608088850975, + "num_tokens": 71224564.0, + "step": 59220 + }, + { + "entropy": 1.8147864386439323, + "epoch": 0.18360781766934628, + "grad_norm": 7.711428165435791, + "learning_rate": 5.904059986789582e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.8427559122443199, + "num_tokens": 71237203.0, + "step": 59230 + }, + { + "entropy": 1.8857770830392837, + "epoch": 0.18363881679439598, + "grad_norm": 8.450910568237305, + "learning_rate": 5.903561640331122e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.8380788192152977, + "num_tokens": 71249147.0, + "step": 59240 + }, + { + "entropy": 1.8396921649575233, + "epoch": 0.18366981591944567, + "grad_norm": 2.8776183128356934, + "learning_rate": 5.9030634200437765e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8463966697454453, + "num_tokens": 71260897.0, + "step": 59250 + }, + { + "entropy": 1.8520937889814377, + "epoch": 0.18370081504449537, + "grad_norm": 6.755105018615723, + "learning_rate": 5.902565325874313e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8342436105012894, + "num_tokens": 71273021.0, + "step": 59260 + }, + { + "entropy": 1.8717355713248254, + "epoch": 0.18373181416954507, + "grad_norm": 8.250826835632324, + "learning_rate": 5.902067357769535e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8344789668917656, + "num_tokens": 71285282.0, + "step": 59270 + }, + { + "entropy": 1.911249254643917, + "epoch": 0.18376281329459476, + "grad_norm": 11.614958763122559, + "learning_rate": 5.901569515676272e-06, + "loss": 0.5307, + "mean_token_accuracy": 0.839310847222805, + "num_tokens": 71297503.0, + "step": 59280 + }, + { + "entropy": 1.9195268914103507, + "epoch": 0.18379381241964446, + "grad_norm": 8.41645336151123, + "learning_rate": 5.901071799541385e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8358474299311638, + "num_tokens": 71309385.0, + "step": 59290 + }, + { + "entropy": 1.90712169110775, + "epoch": 0.18382481154469416, + "grad_norm": 8.294679641723633, + "learning_rate": 5.900574209311775e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.8497764050960541, + "num_tokens": 71320129.0, + "step": 59300 + }, + { + "entropy": 1.935383716225624, + "epoch": 0.18385581066974385, + "grad_norm": 9.08409309387207, + "learning_rate": 5.900076744934362e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8406318157911301, + "num_tokens": 71331528.0, + "step": 59310 + }, + { + "entropy": 1.9089769005775452, + "epoch": 0.18388680979479355, + "grad_norm": 12.19558048248291, + "learning_rate": 5.899579406356107e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.8409374266862869, + "num_tokens": 71342190.0, + "step": 59320 + }, + { + "entropy": 1.9273348927497864, + "epoch": 0.18391780891984325, + "grad_norm": 10.214935302734375, + "learning_rate": 5.8990821935239975e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.8436912566423416, + "num_tokens": 71353668.0, + "step": 59330 + }, + { + "entropy": 1.9410814076662064, + "epoch": 0.18394880804489294, + "grad_norm": 9.362590789794922, + "learning_rate": 5.898585106385053e-06, + "loss": 0.5704, + "mean_token_accuracy": 0.8309288114309311, + "num_tokens": 71364044.0, + "step": 59340 + }, + { + "entropy": 1.9330678835511208, + "epoch": 0.18397980716994264, + "grad_norm": 7.387380599975586, + "learning_rate": 5.898088144886326e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.8400380194187165, + "num_tokens": 71375184.0, + "step": 59350 + }, + { + "entropy": 1.9176151275634765, + "epoch": 0.18401080629499234, + "grad_norm": 8.243695259094238, + "learning_rate": 5.897591308974896e-06, + "loss": 0.5476, + "mean_token_accuracy": 0.8329030573368073, + "num_tokens": 71386183.0, + "step": 59360 + }, + { + "entropy": 1.7748358353972435, + "epoch": 0.18404180542004203, + "grad_norm": 4.477342128753662, + "learning_rate": 5.897094598597879e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8530562177300454, + "num_tokens": 71399368.0, + "step": 59370 + }, + { + "entropy": 1.8144743472337723, + "epoch": 0.18407280454509173, + "grad_norm": 9.520360946655273, + "learning_rate": 5.896598013702419e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8422923222184181, + "num_tokens": 71412727.0, + "step": 59380 + }, + { + "entropy": 1.8014607191085816, + "epoch": 0.1841038036701414, + "grad_norm": 6.801212787628174, + "learning_rate": 5.8961015542356925e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8570068180561066, + "num_tokens": 71425857.0, + "step": 59390 + }, + { + "entropy": 1.927448120713234, + "epoch": 0.1841348027951911, + "grad_norm": 8.77550220489502, + "learning_rate": 5.895605220144907e-06, + "loss": 0.5676, + "mean_token_accuracy": 0.8222044169902801, + "num_tokens": 71436727.0, + "step": 59400 + }, + { + "entropy": 1.9893068850040436, + "epoch": 0.1841658019202408, + "grad_norm": 9.02739143371582, + "learning_rate": 5.8951090113772976e-06, + "loss": 0.5979, + "mean_token_accuracy": 0.8225637242197991, + "num_tokens": 71447373.0, + "step": 59410 + }, + { + "entropy": 1.8202601961791516, + "epoch": 0.1841968010452905, + "grad_norm": 8.421029090881348, + "learning_rate": 5.894612927880137e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8546611994504929, + "num_tokens": 71460105.0, + "step": 59420 + }, + { + "entropy": 1.8888828486204148, + "epoch": 0.18422780017034018, + "grad_norm": 9.308785438537598, + "learning_rate": 5.8941169696007235e-06, + "loss": 0.53, + "mean_token_accuracy": 0.8378401637077332, + "num_tokens": 71472520.0, + "step": 59430 + }, + { + "entropy": 1.8498116791248322, + "epoch": 0.18425879929538988, + "grad_norm": 8.532123565673828, + "learning_rate": 5.893621136486389e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.844611656665802, + "num_tokens": 71484511.0, + "step": 59440 + }, + { + "entropy": 1.8172200858592986, + "epoch": 0.18428979842043958, + "grad_norm": 4.122251033782959, + "learning_rate": 5.893125428484495e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8459887281060219, + "num_tokens": 71496637.0, + "step": 59450 + }, + { + "entropy": 1.8752017796039582, + "epoch": 0.18432079754548927, + "grad_norm": 9.73798942565918, + "learning_rate": 5.892629845542437e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8463060513138772, + "num_tokens": 71508739.0, + "step": 59460 + }, + { + "entropy": 1.8498885072767735, + "epoch": 0.18435179667053897, + "grad_norm": 8.824564933776855, + "learning_rate": 5.892134387607636e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.835616897046566, + "num_tokens": 71521174.0, + "step": 59470 + }, + { + "entropy": 1.863853606581688, + "epoch": 0.18438279579558867, + "grad_norm": 9.248029708862305, + "learning_rate": 5.891639054627551e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.8358003750443459, + "num_tokens": 71533111.0, + "step": 59480 + }, + { + "entropy": 1.9517934620380402, + "epoch": 0.18441379492063836, + "grad_norm": 9.871654510498047, + "learning_rate": 5.891143846549664e-06, + "loss": 0.5908, + "mean_token_accuracy": 0.8294711738824845, + "num_tokens": 71544244.0, + "step": 59490 + }, + { + "entropy": 1.8825423762202262, + "epoch": 0.18444479404568806, + "grad_norm": 7.173405170440674, + "learning_rate": 5.890648763321495e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8478507623076439, + "num_tokens": 71555732.0, + "step": 59500 + }, + { + "entropy": 1.9470236003398895, + "epoch": 0.18447579317073776, + "grad_norm": 10.64505672454834, + "learning_rate": 5.8901538048905904e-06, + "loss": 0.5637, + "mean_token_accuracy": 0.8279279932379723, + "num_tokens": 71566717.0, + "step": 59510 + }, + { + "entropy": 1.8414243504405021, + "epoch": 0.18450679229578745, + "grad_norm": 8.195703506469727, + "learning_rate": 5.8896589712045306e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8501287072896957, + "num_tokens": 71578437.0, + "step": 59520 + }, + { + "entropy": 1.8852709889411927, + "epoch": 0.18453779142083715, + "grad_norm": 10.737709045410156, + "learning_rate": 5.8891642622109235e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.831559669971466, + "num_tokens": 71590564.0, + "step": 59530 + }, + { + "entropy": 1.8008535578846931, + "epoch": 0.18456879054588685, + "grad_norm": 12.071378707885742, + "learning_rate": 5.888669677857409e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8527644023299217, + "num_tokens": 71604258.0, + "step": 59540 + }, + { + "entropy": 1.8511207103729248, + "epoch": 0.18459978967093654, + "grad_norm": 4.7530012130737305, + "learning_rate": 5.88817521809166e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8385688096284867, + "num_tokens": 71616859.0, + "step": 59550 + }, + { + "entropy": 1.9494419798254967, + "epoch": 0.18463078879598624, + "grad_norm": 10.362981796264648, + "learning_rate": 5.887680882861378e-06, + "loss": 0.537, + "mean_token_accuracy": 0.833903931081295, + "num_tokens": 71628020.0, + "step": 59560 + }, + { + "entropy": 1.9751142218708992, + "epoch": 0.18466178792103594, + "grad_norm": 10.168465614318848, + "learning_rate": 5.887186672114294e-06, + "loss": 0.5409, + "mean_token_accuracy": 0.8342203125357628, + "num_tokens": 71639405.0, + "step": 59570 + }, + { + "entropy": 1.9470524042844772, + "epoch": 0.18469278704608563, + "grad_norm": 7.566704273223877, + "learning_rate": 5.886692585798173e-06, + "loss": 0.5758, + "mean_token_accuracy": 0.8385176613926888, + "num_tokens": 71650529.0, + "step": 59580 + }, + { + "entropy": 1.8561659947037696, + "epoch": 0.18472378617113533, + "grad_norm": 8.916204452514648, + "learning_rate": 5.886198623860807e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8419315591454506, + "num_tokens": 71663274.0, + "step": 59590 + }, + { + "entropy": 1.9323371350765228, + "epoch": 0.18475478529618503, + "grad_norm": 10.571834564208984, + "learning_rate": 5.8857047862500226e-06, + "loss": 0.5731, + "mean_token_accuracy": 0.8282590746879578, + "num_tokens": 71674739.0, + "step": 59600 + }, + { + "entropy": 1.9620111271739007, + "epoch": 0.18478578442123472, + "grad_norm": 8.114259719848633, + "learning_rate": 5.885211072913676e-06, + "loss": 0.5633, + "mean_token_accuracy": 0.8262233078479767, + "num_tokens": 71685925.0, + "step": 59610 + }, + { + "entropy": 1.8970046997070313, + "epoch": 0.18481678354628442, + "grad_norm": 3.8900442123413086, + "learning_rate": 5.884717483799649e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8467764779925346, + "num_tokens": 71697604.0, + "step": 59620 + }, + { + "entropy": 1.8912265598773956, + "epoch": 0.18484778267133412, + "grad_norm": 9.921353340148926, + "learning_rate": 5.884224018855862e-06, + "loss": 0.5736, + "mean_token_accuracy": 0.8240482151508332, + "num_tokens": 71708748.0, + "step": 59630 + }, + { + "entropy": 1.873260524868965, + "epoch": 0.18487878179638378, + "grad_norm": 7.441627025604248, + "learning_rate": 5.883730678030261e-06, + "loss": 0.5304, + "mean_token_accuracy": 0.831610731780529, + "num_tokens": 71720631.0, + "step": 59640 + }, + { + "entropy": 1.8897306516766548, + "epoch": 0.18490978092143348, + "grad_norm": 8.014190673828125, + "learning_rate": 5.883237461270822e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.8387016162276268, + "num_tokens": 71732006.0, + "step": 59650 + }, + { + "entropy": 1.8416223376989365, + "epoch": 0.18494078004648318, + "grad_norm": 6.837216854095459, + "learning_rate": 5.882744368525556e-06, + "loss": 0.5324, + "mean_token_accuracy": 0.832354225218296, + "num_tokens": 71744288.0, + "step": 59660 + }, + { + "entropy": 1.7982579827308656, + "epoch": 0.18497177917153287, + "grad_norm": 9.943193435668945, + "learning_rate": 5.882251399742499e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8525391161441803, + "num_tokens": 71757879.0, + "step": 59670 + }, + { + "entropy": 1.7894408702850342, + "epoch": 0.18500277829658257, + "grad_norm": 8.48753833770752, + "learning_rate": 5.881758554869721e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8433696299791336, + "num_tokens": 71772009.0, + "step": 59680 + }, + { + "entropy": 1.8790398433804512, + "epoch": 0.18503377742163227, + "grad_norm": 8.779352188110352, + "learning_rate": 5.881265833855321e-06, + "loss": 0.5417, + "mean_token_accuracy": 0.8341058731079102, + "num_tokens": 71784020.0, + "step": 59690 + }, + { + "entropy": 1.8383540600538253, + "epoch": 0.18506477654668196, + "grad_norm": 4.482400894165039, + "learning_rate": 5.880773236647431e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8363133147358894, + "num_tokens": 71795570.0, + "step": 59700 + }, + { + "entropy": 1.807381245493889, + "epoch": 0.18509577567173166, + "grad_norm": 7.861814498901367, + "learning_rate": 5.8802807631942095e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8491622850298881, + "num_tokens": 71808715.0, + "step": 59710 + }, + { + "entropy": 1.9171263113617898, + "epoch": 0.18512677479678136, + "grad_norm": 9.038065910339355, + "learning_rate": 5.879788413443846e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.8278371721506119, + "num_tokens": 71820652.0, + "step": 59720 + }, + { + "entropy": 1.9229667693376542, + "epoch": 0.18515777392183105, + "grad_norm": 7.926364898681641, + "learning_rate": 5.879296187344564e-06, + "loss": 0.5683, + "mean_token_accuracy": 0.8299353420734406, + "num_tokens": 71832006.0, + "step": 59730 + }, + { + "entropy": 1.8270016878843307, + "epoch": 0.18518877304688075, + "grad_norm": 4.967191219329834, + "learning_rate": 5.878804084844616e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8319668889045715, + "num_tokens": 71845433.0, + "step": 59740 + }, + { + "entropy": 1.9323490008711814, + "epoch": 0.18521977217193045, + "grad_norm": 7.835849285125732, + "learning_rate": 5.878312105892281e-06, + "loss": 0.5384, + "mean_token_accuracy": 0.8331502959132194, + "num_tokens": 71856737.0, + "step": 59750 + }, + { + "entropy": 1.7635822594165802, + "epoch": 0.18525077129698014, + "grad_norm": 10.747203826904297, + "learning_rate": 5.8778202504358716e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8548522099852562, + "num_tokens": 71870471.0, + "step": 59760 + }, + { + "entropy": 1.7823803335428239, + "epoch": 0.18528177042202984, + "grad_norm": 3.434664249420166, + "learning_rate": 5.8773285184237316e-06, + "loss": 0.426, + "mean_token_accuracy": 0.846599979698658, + "num_tokens": 71884046.0, + "step": 59770 + }, + { + "entropy": 1.8083126276731492, + "epoch": 0.18531276954707954, + "grad_norm": 4.187022686004639, + "learning_rate": 5.876836909804231e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8312985822558403, + "num_tokens": 71897333.0, + "step": 59780 + }, + { + "entropy": 1.9598889589309691, + "epoch": 0.18534376867212923, + "grad_norm": 8.239096641540527, + "learning_rate": 5.876345424525776e-06, + "loss": 0.6611, + "mean_token_accuracy": 0.8133495360612869, + "num_tokens": 71908395.0, + "step": 59790 + }, + { + "entropy": 1.9233538269996644, + "epoch": 0.18537476779717893, + "grad_norm": 3.7987241744995117, + "learning_rate": 5.8758540625367965e-06, + "loss": 0.5428, + "mean_token_accuracy": 0.8402821734547615, + "num_tokens": 71920345.0, + "step": 59800 + }, + { + "entropy": 1.9380876675248147, + "epoch": 0.18540576692222863, + "grad_norm": 7.9834208488464355, + "learning_rate": 5.875362823785758e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8361145481467247, + "num_tokens": 71932372.0, + "step": 59810 + }, + { + "entropy": 1.9172768041491508, + "epoch": 0.18543676604727832, + "grad_norm": 10.48983097076416, + "learning_rate": 5.8748717082211516e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.8384231805801392, + "num_tokens": 71944733.0, + "step": 59820 + }, + { + "entropy": 1.8952128738164902, + "epoch": 0.18546776517232802, + "grad_norm": 6.812406539916992, + "learning_rate": 5.874380715791502e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.846713088452816, + "num_tokens": 71956856.0, + "step": 59830 + }, + { + "entropy": 1.780382940173149, + "epoch": 0.18549876429737772, + "grad_norm": 4.57772159576416, + "learning_rate": 5.8738898464453644e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8531007096171379, + "num_tokens": 71970503.0, + "step": 59840 + }, + { + "entropy": 1.910321943461895, + "epoch": 0.1855297634224274, + "grad_norm": 10.066534042358398, + "learning_rate": 5.87339910013132e-06, + "loss": 0.5479, + "mean_token_accuracy": 0.8341235533356667, + "num_tokens": 71982378.0, + "step": 59850 + }, + { + "entropy": 1.927718922495842, + "epoch": 0.1855607625474771, + "grad_norm": 9.32041072845459, + "learning_rate": 5.872908476797983e-06, + "loss": 0.5619, + "mean_token_accuracy": 0.8251764044165611, + "num_tokens": 71993478.0, + "step": 59860 + }, + { + "entropy": 1.9356975913047791, + "epoch": 0.1855917616725268, + "grad_norm": 4.789025783538818, + "learning_rate": 5.872417976393997e-06, + "loss": 0.5355, + "mean_token_accuracy": 0.8277438759803772, + "num_tokens": 72005113.0, + "step": 59870 + }, + { + "entropy": 1.8369565188884736, + "epoch": 0.18562276079757647, + "grad_norm": 7.20404577255249, + "learning_rate": 5.871927598868036e-06, + "loss": 0.5349, + "mean_token_accuracy": 0.8378855675458908, + "num_tokens": 72017827.0, + "step": 59880 + }, + { + "entropy": 1.8494772240519524, + "epoch": 0.18565375992262617, + "grad_norm": 8.194475173950195, + "learning_rate": 5.871437344168805e-06, + "loss": 0.5451, + "mean_token_accuracy": 0.8221006825566292, + "num_tokens": 72030285.0, + "step": 59890 + }, + { + "entropy": 1.9028642885386944, + "epoch": 0.18568475904767587, + "grad_norm": 8.854440689086914, + "learning_rate": 5.870947212245036e-06, + "loss": 0.5293, + "mean_token_accuracy": 0.8278263434767723, + "num_tokens": 72043015.0, + "step": 59900 + }, + { + "entropy": 1.8917630180716514, + "epoch": 0.18571575817272556, + "grad_norm": 8.142960548400879, + "learning_rate": 5.870457203045496e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8398408219218254, + "num_tokens": 72054537.0, + "step": 59910 + }, + { + "entropy": 1.8536349281668663, + "epoch": 0.18574675729777526, + "grad_norm": 8.590570449829102, + "learning_rate": 5.869967316518973e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8522145226597786, + "num_tokens": 72067016.0, + "step": 59920 + }, + { + "entropy": 1.9314019471406936, + "epoch": 0.18577775642282496, + "grad_norm": 4.44788932800293, + "learning_rate": 5.869477552614296e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.8373445898294449, + "num_tokens": 72078617.0, + "step": 59930 + }, + { + "entropy": 1.8379703059792518, + "epoch": 0.18580875554787465, + "grad_norm": 8.737577438354492, + "learning_rate": 5.868987911280315e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8353394463658332, + "num_tokens": 72090576.0, + "step": 59940 + }, + { + "entropy": 1.9440933406352996, + "epoch": 0.18583975467292435, + "grad_norm": 8.472302436828613, + "learning_rate": 5.868498392465915e-06, + "loss": 0.5734, + "mean_token_accuracy": 0.8333403542637825, + "num_tokens": 72101344.0, + "step": 59950 + }, + { + "entropy": 1.8688756361603738, + "epoch": 0.18587075379797405, + "grad_norm": 9.316754341125488, + "learning_rate": 5.868008996120008e-06, + "loss": 0.5235, + "mean_token_accuracy": 0.8337069198489189, + "num_tokens": 72113227.0, + "step": 59960 + }, + { + "entropy": 1.834533803164959, + "epoch": 0.18590175292302374, + "grad_norm": 8.468538284301758, + "learning_rate": 5.867519722191538e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8497491136193276, + "num_tokens": 72125271.0, + "step": 59970 + }, + { + "entropy": 1.7592488691210746, + "epoch": 0.18593275204807344, + "grad_norm": 8.312585830688477, + "learning_rate": 5.867030570629477e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8604243487119675, + "num_tokens": 72139001.0, + "step": 59980 + }, + { + "entropy": 1.9161117985844611, + "epoch": 0.18596375117312314, + "grad_norm": 4.209480285644531, + "learning_rate": 5.866541541382829e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.8296124190092087, + "num_tokens": 72151289.0, + "step": 59990 + }, + { + "entropy": 1.9577097177505494, + "epoch": 0.18599475029817283, + "grad_norm": 9.150534629821777, + "learning_rate": 5.866052634400624e-06, + "loss": 0.58, + "mean_token_accuracy": 0.8270249828696251, + "num_tokens": 72161958.0, + "step": 60000 + }, + { + "entropy": 1.896253764629364, + "epoch": 0.18602574942322253, + "grad_norm": 5.07069206237793, + "learning_rate": 5.865563849631925e-06, + "loss": 0.5408, + "mean_token_accuracy": 0.8247905030846596, + "num_tokens": 72175097.0, + "step": 60010 + }, + { + "entropy": 1.7840982712805271, + "epoch": 0.18605674854827223, + "grad_norm": 8.719794273376465, + "learning_rate": 5.865075187025824e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8515463337302208, + "num_tokens": 72188721.0, + "step": 60020 + }, + { + "entropy": 1.994665226340294, + "epoch": 0.18608774767332192, + "grad_norm": 8.010587692260742, + "learning_rate": 5.864586646531443e-06, + "loss": 0.5875, + "mean_token_accuracy": 0.8243452772498131, + "num_tokens": 72199455.0, + "step": 60030 + }, + { + "entropy": 1.869076819717884, + "epoch": 0.18611874679837162, + "grad_norm": 8.577561378479004, + "learning_rate": 5.864098228097931e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.843721067905426, + "num_tokens": 72211035.0, + "step": 60040 + }, + { + "entropy": 1.9081630438566208, + "epoch": 0.18614974592342132, + "grad_norm": 8.139861106872559, + "learning_rate": 5.863609931674471e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.847606098651886, + "num_tokens": 72222707.0, + "step": 60050 + }, + { + "entropy": 1.9403681397438048, + "epoch": 0.186180745048471, + "grad_norm": 9.84714412689209, + "learning_rate": 5.8631217572102716e-06, + "loss": 0.5301, + "mean_token_accuracy": 0.8385643243789673, + "num_tokens": 72232917.0, + "step": 60060 + }, + { + "entropy": 1.9238607689738274, + "epoch": 0.1862117441735207, + "grad_norm": 8.912215232849121, + "learning_rate": 5.862633704654573e-06, + "loss": 0.5572, + "mean_token_accuracy": 0.8306085541844368, + "num_tokens": 72244516.0, + "step": 60070 + }, + { + "entropy": 1.9544622465968131, + "epoch": 0.1862427432985704, + "grad_norm": 8.447044372558594, + "learning_rate": 5.862145773956647e-06, + "loss": 0.5516, + "mean_token_accuracy": 0.8342535004019738, + "num_tokens": 72255606.0, + "step": 60080 + }, + { + "entropy": 1.8915927335619926, + "epoch": 0.1862737424236201, + "grad_norm": 9.219244003295898, + "learning_rate": 5.861657965065788e-06, + "loss": 0.5361, + "mean_token_accuracy": 0.8286745131015778, + "num_tokens": 72267179.0, + "step": 60090 + }, + { + "entropy": 1.9081023514270783, + "epoch": 0.1863047415486698, + "grad_norm": 3.8178694248199463, + "learning_rate": 5.861170277931328e-06, + "loss": 0.5429, + "mean_token_accuracy": 0.8304624617099762, + "num_tokens": 72279385.0, + "step": 60100 + }, + { + "entropy": 1.8515421822667122, + "epoch": 0.1863357406737195, + "grad_norm": 6.892448902130127, + "learning_rate": 5.8606827125026256e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8440054371953011, + "num_tokens": 72292367.0, + "step": 60110 + }, + { + "entropy": 1.941430465877056, + "epoch": 0.1863667397987692, + "grad_norm": 9.434157371520996, + "learning_rate": 5.860195268729066e-06, + "loss": 0.5499, + "mean_token_accuracy": 0.8339191779494286, + "num_tokens": 72303613.0, + "step": 60120 + }, + { + "entropy": 1.9135318726301194, + "epoch": 0.18639773892381886, + "grad_norm": 11.788899421691895, + "learning_rate": 5.859707946560067e-06, + "loss": 0.5565, + "mean_token_accuracy": 0.8330492198467254, + "num_tokens": 72314613.0, + "step": 60130 + }, + { + "entropy": 1.8537031307816505, + "epoch": 0.18642873804886856, + "grad_norm": 9.07436466217041, + "learning_rate": 5.859220745945075e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8319351330399514, + "num_tokens": 72326917.0, + "step": 60140 + }, + { + "entropy": 2.000509098172188, + "epoch": 0.18645973717391826, + "grad_norm": 11.218595504760742, + "learning_rate": 5.858733666833567e-06, + "loss": 0.5732, + "mean_token_accuracy": 0.8274472177028656, + "num_tokens": 72338272.0, + "step": 60150 + }, + { + "entropy": 1.9531633704900742, + "epoch": 0.18649073629896795, + "grad_norm": 8.319474220275879, + "learning_rate": 5.858246709175047e-06, + "loss": 0.5806, + "mean_token_accuracy": 0.8167789533734322, + "num_tokens": 72349717.0, + "step": 60160 + }, + { + "entropy": 1.8039245098829269, + "epoch": 0.18652173542401765, + "grad_norm": 5.849046230316162, + "learning_rate": 5.8577598729190496e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8554789200425148, + "num_tokens": 72362915.0, + "step": 60170 + }, + { + "entropy": 1.87906823605299, + "epoch": 0.18655273454906734, + "grad_norm": 8.501765251159668, + "learning_rate": 5.85727315801514e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8379226326942444, + "num_tokens": 72374922.0, + "step": 60180 + }, + { + "entropy": 1.8856094121932983, + "epoch": 0.18658373367411704, + "grad_norm": 8.660506248474121, + "learning_rate": 5.8567865644129095e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.8400798097252846, + "num_tokens": 72387230.0, + "step": 60190 + }, + { + "entropy": 1.843792949616909, + "epoch": 0.18661473279916674, + "grad_norm": 7.968148231506348, + "learning_rate": 5.856300092061984e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.8412106573581696, + "num_tokens": 72399496.0, + "step": 60200 + }, + { + "entropy": 1.863570548593998, + "epoch": 0.18664573192421643, + "grad_norm": 8.22105598449707, + "learning_rate": 5.855813740912011e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8438234731554985, + "num_tokens": 72412130.0, + "step": 60210 + }, + { + "entropy": 1.903498760610819, + "epoch": 0.18667673104926613, + "grad_norm": 8.098082542419434, + "learning_rate": 5.855327510912675e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8345488205552101, + "num_tokens": 72425418.0, + "step": 60220 + }, + { + "entropy": 1.8605897799134254, + "epoch": 0.18670773017431583, + "grad_norm": 8.592905044555664, + "learning_rate": 5.854841402013685e-06, + "loss": 0.5436, + "mean_token_accuracy": 0.8296092748641968, + "num_tokens": 72437861.0, + "step": 60230 + }, + { + "entropy": 1.8651499822735786, + "epoch": 0.18673872929936552, + "grad_norm": 3.829653739929199, + "learning_rate": 5.8543554141647814e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.837422613799572, + "num_tokens": 72450429.0, + "step": 60240 + }, + { + "entropy": 1.8985691770911217, + "epoch": 0.18676972842441522, + "grad_norm": 8.132059097290039, + "learning_rate": 5.853869547315731e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8453919142484665, + "num_tokens": 72462168.0, + "step": 60250 + }, + { + "entropy": 1.8822806507349015, + "epoch": 0.18680072754946492, + "grad_norm": 8.20763111114502, + "learning_rate": 5.853383801416336e-06, + "loss": 0.5387, + "mean_token_accuracy": 0.8344938531517982, + "num_tokens": 72473992.0, + "step": 60260 + }, + { + "entropy": 1.9301311939954757, + "epoch": 0.18683172667451461, + "grad_norm": 8.083572387695312, + "learning_rate": 5.8528981764164205e-06, + "loss": 0.5355, + "mean_token_accuracy": 0.841622045636177, + "num_tokens": 72484567.0, + "step": 60270 + }, + { + "entropy": 1.8997415751218796, + "epoch": 0.1868627257995643, + "grad_norm": 7.791327953338623, + "learning_rate": 5.85241267226584e-06, + "loss": 0.5359, + "mean_token_accuracy": 0.8344151824712753, + "num_tokens": 72495039.0, + "step": 60280 + }, + { + "entropy": 1.8864543735980988, + "epoch": 0.186893724924614, + "grad_norm": 7.940762042999268, + "learning_rate": 5.851927288914482e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8393915817141533, + "num_tokens": 72507012.0, + "step": 60290 + }, + { + "entropy": 1.8426005780696868, + "epoch": 0.1869247240496637, + "grad_norm": 8.484749794006348, + "learning_rate": 5.85144202631226e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8417939618229866, + "num_tokens": 72519213.0, + "step": 60300 + }, + { + "entropy": 1.860968105494976, + "epoch": 0.1869557231747134, + "grad_norm": 8.830284118652344, + "learning_rate": 5.850956884409118e-06, + "loss": 0.5251, + "mean_token_accuracy": 0.848306556046009, + "num_tokens": 72531214.0, + "step": 60310 + }, + { + "entropy": 1.921893371641636, + "epoch": 0.1869867222997631, + "grad_norm": 4.533517837524414, + "learning_rate": 5.8504718631550285e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8360114693641663, + "num_tokens": 72543240.0, + "step": 60320 + }, + { + "entropy": 1.93612492531538, + "epoch": 0.1870177214248128, + "grad_norm": 8.422826766967773, + "learning_rate": 5.849986962499992e-06, + "loss": 0.5496, + "mean_token_accuracy": 0.8314146876335144, + "num_tokens": 72554502.0, + "step": 60330 + }, + { + "entropy": 1.887687975168228, + "epoch": 0.1870487205498625, + "grad_norm": 8.538383483886719, + "learning_rate": 5.8495021823940416e-06, + "loss": 0.5353, + "mean_token_accuracy": 0.8397644698619843, + "num_tokens": 72565656.0, + "step": 60340 + }, + { + "entropy": 1.744446012377739, + "epoch": 0.1870797196749122, + "grad_norm": 8.830626487731934, + "learning_rate": 5.849017522787233e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8492047414183617, + "num_tokens": 72579158.0, + "step": 60350 + }, + { + "entropy": 1.8573518849909305, + "epoch": 0.18711071879996188, + "grad_norm": 8.745538711547852, + "learning_rate": 5.848532983629661e-06, + "loss": 0.5339, + "mean_token_accuracy": 0.8321371108293534, + "num_tokens": 72591425.0, + "step": 60360 + }, + { + "entropy": 1.7826851338148118, + "epoch": 0.18714171792501158, + "grad_norm": 8.527620315551758, + "learning_rate": 5.848048564871437e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8485151454806328, + "num_tokens": 72604518.0, + "step": 60370 + }, + { + "entropy": 1.8439786598086356, + "epoch": 0.18717271705006125, + "grad_norm": 8.76579475402832, + "learning_rate": 5.84756426646271e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8471570834517479, + "num_tokens": 72616133.0, + "step": 60380 + }, + { + "entropy": 1.9413674965500831, + "epoch": 0.18720371617511095, + "grad_norm": 9.924832344055176, + "learning_rate": 5.847080088353656e-06, + "loss": 0.5343, + "mean_token_accuracy": 0.8391248330473899, + "num_tokens": 72627275.0, + "step": 60390 + }, + { + "entropy": 1.803322871029377, + "epoch": 0.18723471530016064, + "grad_norm": 7.1156816482543945, + "learning_rate": 5.846596030494477e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.831206327676773, + "num_tokens": 72640255.0, + "step": 60400 + }, + { + "entropy": 1.9137111112475396, + "epoch": 0.18726571442521034, + "grad_norm": 9.406431198120117, + "learning_rate": 5.8461120928354075e-06, + "loss": 0.5411, + "mean_token_accuracy": 0.8288335934281349, + "num_tokens": 72651695.0, + "step": 60410 + }, + { + "entropy": 1.988877174258232, + "epoch": 0.18729671355026004, + "grad_norm": 8.406907081604004, + "learning_rate": 5.845628275326711e-06, + "loss": 0.5578, + "mean_token_accuracy": 0.8310790181159973, + "num_tokens": 72662701.0, + "step": 60420 + }, + { + "entropy": 1.844719012081623, + "epoch": 0.18732771267530973, + "grad_norm": 4.9071431159973145, + "learning_rate": 5.845144577918675e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8463888078927994, + "num_tokens": 72675062.0, + "step": 60430 + }, + { + "entropy": 1.9066783770918847, + "epoch": 0.18735871180035943, + "grad_norm": 9.20982551574707, + "learning_rate": 5.844661000561621e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8450051352381707, + "num_tokens": 72687250.0, + "step": 60440 + }, + { + "entropy": 1.8505268752574922, + "epoch": 0.18738971092540913, + "grad_norm": 12.914068222045898, + "learning_rate": 5.844177543205897e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8434465497732162, + "num_tokens": 72699919.0, + "step": 60450 + }, + { + "entropy": 1.7514374867081641, + "epoch": 0.18742071005045882, + "grad_norm": 8.30848503112793, + "learning_rate": 5.843694205801879e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8658733040094375, + "num_tokens": 72713733.0, + "step": 60460 + }, + { + "entropy": 1.89697934538126, + "epoch": 0.18745170917550852, + "grad_norm": 9.627775192260742, + "learning_rate": 5.843210988299973e-06, + "loss": 0.5452, + "mean_token_accuracy": 0.8306502267718315, + "num_tokens": 72726134.0, + "step": 60470 + }, + { + "entropy": 1.9438308894634246, + "epoch": 0.18748270830055822, + "grad_norm": 10.055462837219238, + "learning_rate": 5.842727890650615e-06, + "loss": 0.5728, + "mean_token_accuracy": 0.8255834862589836, + "num_tokens": 72736983.0, + "step": 60480 + }, + { + "entropy": 1.8450178682804108, + "epoch": 0.1875137074256079, + "grad_norm": 9.322096824645996, + "learning_rate": 5.8422449128042654e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.845012141764164, + "num_tokens": 72749747.0, + "step": 60490 + }, + { + "entropy": 1.8929164052009582, + "epoch": 0.1875447065506576, + "grad_norm": 3.979761838912964, + "learning_rate": 5.84176205471142e-06, + "loss": 0.5459, + "mean_token_accuracy": 0.8384419724345207, + "num_tokens": 72761568.0, + "step": 60500 + }, + { + "entropy": 1.8335536420345306, + "epoch": 0.1875757056757073, + "grad_norm": 7.59586238861084, + "learning_rate": 5.841279316322594e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8467094838619232, + "num_tokens": 72774261.0, + "step": 60510 + }, + { + "entropy": 1.882728861272335, + "epoch": 0.187606704800757, + "grad_norm": 8.7779541015625, + "learning_rate": 5.840796697588341e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8436086297035217, + "num_tokens": 72786714.0, + "step": 60520 + }, + { + "entropy": 1.8310616135597229, + "epoch": 0.1876377039258067, + "grad_norm": 5.086270809173584, + "learning_rate": 5.840314198459236e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8549797222018242, + "num_tokens": 72799986.0, + "step": 60530 + }, + { + "entropy": 1.9134382233023643, + "epoch": 0.1876687030508564, + "grad_norm": 8.873359680175781, + "learning_rate": 5.839831818885886e-06, + "loss": 0.5306, + "mean_token_accuracy": 0.8400856927037239, + "num_tokens": 72811382.0, + "step": 60540 + }, + { + "entropy": 1.809613211452961, + "epoch": 0.1876997021759061, + "grad_norm": 4.331747055053711, + "learning_rate": 5.839349558818926e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8486540347337723, + "num_tokens": 72824270.0, + "step": 60550 + }, + { + "entropy": 1.908320277929306, + "epoch": 0.1877307013009558, + "grad_norm": 8.491447448730469, + "learning_rate": 5.838867418209019e-06, + "loss": 0.5636, + "mean_token_accuracy": 0.8320151507854462, + "num_tokens": 72836395.0, + "step": 60560 + }, + { + "entropy": 1.8660949409008025, + "epoch": 0.18776170042600548, + "grad_norm": 4.656119346618652, + "learning_rate": 5.838385397006855e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8481364041566849, + "num_tokens": 72849113.0, + "step": 60570 + }, + { + "entropy": 1.8910485938191415, + "epoch": 0.18779269955105518, + "grad_norm": 6.919294834136963, + "learning_rate": 5.837903495163157e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8406860768795014, + "num_tokens": 72860770.0, + "step": 60580 + }, + { + "entropy": 1.9625378847122192, + "epoch": 0.18782369867610488, + "grad_norm": 10.646431922912598, + "learning_rate": 5.837421712628675e-06, + "loss": 0.5639, + "mean_token_accuracy": 0.8330793261528016, + "num_tokens": 72871585.0, + "step": 60590 + }, + { + "entropy": 1.8914886653423308, + "epoch": 0.18785469780115457, + "grad_norm": 7.903103828430176, + "learning_rate": 5.836940049354182e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.8413411438465118, + "num_tokens": 72883535.0, + "step": 60600 + }, + { + "entropy": 1.8991345196962357, + "epoch": 0.18788569692620427, + "grad_norm": 4.898101329803467, + "learning_rate": 5.8364585052904845e-06, + "loss": 0.5244, + "mean_token_accuracy": 0.8374688118696213, + "num_tokens": 72895721.0, + "step": 60610 + }, + { + "entropy": 1.8644976392388344, + "epoch": 0.18791669605125394, + "grad_norm": 4.19233512878418, + "learning_rate": 5.8359770803884175e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8408412501215935, + "num_tokens": 72907421.0, + "step": 60620 + }, + { + "entropy": 1.891925212740898, + "epoch": 0.18794769517630364, + "grad_norm": 7.999198913574219, + "learning_rate": 5.835495774598844e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8336058259010315, + "num_tokens": 72919575.0, + "step": 60630 + }, + { + "entropy": 1.8591307654976845, + "epoch": 0.18797869430135333, + "grad_norm": 8.912029266357422, + "learning_rate": 5.8350145878726545e-06, + "loss": 0.5389, + "mean_token_accuracy": 0.8295843943953514, + "num_tokens": 72932425.0, + "step": 60640 + }, + { + "entropy": 1.957919180393219, + "epoch": 0.18800969342640303, + "grad_norm": 8.599974632263184, + "learning_rate": 5.834533520160769e-06, + "loss": 0.5634, + "mean_token_accuracy": 0.8301822498440743, + "num_tokens": 72943322.0, + "step": 60650 + }, + { + "entropy": 1.9159081429243088, + "epoch": 0.18804069255145273, + "grad_norm": 7.521796226501465, + "learning_rate": 5.834052571414132e-06, + "loss": 0.5475, + "mean_token_accuracy": 0.8309228405356407, + "num_tokens": 72954379.0, + "step": 60660 + }, + { + "entropy": 1.8881125912070273, + "epoch": 0.18807169167650242, + "grad_norm": 8.066596984863281, + "learning_rate": 5.833571741583721e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8445821896195411, + "num_tokens": 72965758.0, + "step": 60670 + }, + { + "entropy": 1.8443811774253844, + "epoch": 0.18810269080155212, + "grad_norm": 9.628620147705078, + "learning_rate": 5.8330910306205405e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8472337678074837, + "num_tokens": 72977927.0, + "step": 60680 + }, + { + "entropy": 1.869968481361866, + "epoch": 0.18813368992660182, + "grad_norm": 8.748429298400879, + "learning_rate": 5.832610438475622e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8413064509630204, + "num_tokens": 72989566.0, + "step": 60690 + }, + { + "entropy": 1.881579375267029, + "epoch": 0.1881646890516515, + "grad_norm": 8.547125816345215, + "learning_rate": 5.832129965100026e-06, + "loss": 0.5417, + "mean_token_accuracy": 0.8311194002628326, + "num_tokens": 73000552.0, + "step": 60700 + }, + { + "entropy": 1.9194360464811324, + "epoch": 0.1881956881767012, + "grad_norm": 10.575980186462402, + "learning_rate": 5.831649610444842e-06, + "loss": 0.5357, + "mean_token_accuracy": 0.8376916989684104, + "num_tokens": 73011919.0, + "step": 60710 + }, + { + "entropy": 1.936641664803028, + "epoch": 0.1882266873017509, + "grad_norm": 7.608135223388672, + "learning_rate": 5.831169374461185e-06, + "loss": 0.5898, + "mean_token_accuracy": 0.821084663271904, + "num_tokens": 73023762.0, + "step": 60720 + }, + { + "entropy": 1.9660951375961304, + "epoch": 0.1882576864268006, + "grad_norm": 8.96996784210205, + "learning_rate": 5.8306892571002025e-06, + "loss": 0.5683, + "mean_token_accuracy": 0.835235033929348, + "num_tokens": 73034510.0, + "step": 60730 + }, + { + "entropy": 1.8755255818367005, + "epoch": 0.1882886855518503, + "grad_norm": 8.350106239318848, + "learning_rate": 5.830209258313067e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8453781142830848, + "num_tokens": 73046102.0, + "step": 60740 + }, + { + "entropy": 1.901422207057476, + "epoch": 0.1883196846769, + "grad_norm": 8.605900764465332, + "learning_rate": 5.829729378050978e-06, + "loss": 0.5604, + "mean_token_accuracy": 0.8291002050042152, + "num_tokens": 73057508.0, + "step": 60750 + }, + { + "entropy": 1.9229997634887694, + "epoch": 0.1883506838019497, + "grad_norm": 11.104288101196289, + "learning_rate": 5.829249616265167e-06, + "loss": 0.5378, + "mean_token_accuracy": 0.8307598143815994, + "num_tokens": 73068665.0, + "step": 60760 + }, + { + "entropy": 1.8038654774427414, + "epoch": 0.1883816829269994, + "grad_norm": 9.37712287902832, + "learning_rate": 5.828769972906891e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8444581896066665, + "num_tokens": 73081347.0, + "step": 60770 + }, + { + "entropy": 1.8795530632138253, + "epoch": 0.18841268205204909, + "grad_norm": 4.818871021270752, + "learning_rate": 5.828290447927437e-06, + "loss": 0.5366, + "mean_token_accuracy": 0.8303244650363922, + "num_tokens": 73093545.0, + "step": 60780 + }, + { + "entropy": 1.8115891009569167, + "epoch": 0.18844368117709878, + "grad_norm": 8.035147666931152, + "learning_rate": 5.827811041278115e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8465625256299972, + "num_tokens": 73106680.0, + "step": 60790 + }, + { + "entropy": 1.9304178357124329, + "epoch": 0.18847468030214848, + "grad_norm": 9.54056167602539, + "learning_rate": 5.827331752910272e-06, + "loss": 0.5425, + "mean_token_accuracy": 0.839380769431591, + "num_tokens": 73117528.0, + "step": 60800 + }, + { + "entropy": 1.9495259881019593, + "epoch": 0.18850567942719818, + "grad_norm": 9.372381210327148, + "learning_rate": 5.826852582775273e-06, + "loss": 0.6048, + "mean_token_accuracy": 0.8166373401880265, + "num_tokens": 73128356.0, + "step": 60810 + }, + { + "entropy": 1.8412368908524512, + "epoch": 0.18853667855224787, + "grad_norm": 7.743039608001709, + "learning_rate": 5.826373530824517e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8510013222694397, + "num_tokens": 73140590.0, + "step": 60820 + }, + { + "entropy": 1.7964115411043167, + "epoch": 0.18856767767729757, + "grad_norm": 8.170472145080566, + "learning_rate": 5.825894597009432e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8488139227032662, + "num_tokens": 73153546.0, + "step": 60830 + }, + { + "entropy": 1.755212776362896, + "epoch": 0.18859867680234726, + "grad_norm": 7.8742451667785645, + "learning_rate": 5.825415781281467e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8522952437400818, + "num_tokens": 73166595.0, + "step": 60840 + }, + { + "entropy": 1.8175699785351753, + "epoch": 0.18862967592739696, + "grad_norm": 8.804338455200195, + "learning_rate": 5.824937083592109e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8330580577254295, + "num_tokens": 73178927.0, + "step": 60850 + }, + { + "entropy": 1.8207954451441766, + "epoch": 0.18866067505244666, + "grad_norm": 8.167973518371582, + "learning_rate": 5.824458503892864e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8418833702802658, + "num_tokens": 73190925.0, + "step": 60860 + }, + { + "entropy": 1.8893669456243516, + "epoch": 0.18869167417749633, + "grad_norm": 8.711833953857422, + "learning_rate": 5.82398004213527e-06, + "loss": 0.5558, + "mean_token_accuracy": 0.8303925588726997, + "num_tokens": 73202125.0, + "step": 60870 + }, + { + "entropy": 1.8512268111109733, + "epoch": 0.18872267330254602, + "grad_norm": 7.86924409866333, + "learning_rate": 5.823501698270892e-06, + "loss": 0.5392, + "mean_token_accuracy": 0.8379854589700699, + "num_tokens": 73214492.0, + "step": 60880 + }, + { + "entropy": 1.8381731614470482, + "epoch": 0.18875367242759572, + "grad_norm": 6.331354141235352, + "learning_rate": 5.8230234722513236e-06, + "loss": 0.518, + "mean_token_accuracy": 0.8383203312754631, + "num_tokens": 73226919.0, + "step": 60890 + }, + { + "entropy": 1.8610471919178964, + "epoch": 0.18878467155264542, + "grad_norm": 8.457162857055664, + "learning_rate": 5.822545364028186e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8471423760056496, + "num_tokens": 73238963.0, + "step": 60900 + }, + { + "entropy": 1.8293138653039933, + "epoch": 0.1888156706776951, + "grad_norm": 4.284148216247559, + "learning_rate": 5.822067373553127e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8517989009618759, + "num_tokens": 73251832.0, + "step": 60910 + }, + { + "entropy": 1.856334713101387, + "epoch": 0.1888466698027448, + "grad_norm": 8.815077781677246, + "learning_rate": 5.8215895007778235e-06, + "loss": 0.5533, + "mean_token_accuracy": 0.8325720369815827, + "num_tokens": 73264796.0, + "step": 60920 + }, + { + "entropy": 1.6921759322285652, + "epoch": 0.1888776689277945, + "grad_norm": 7.586294174194336, + "learning_rate": 5.821111745653979e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8488885432481765, + "num_tokens": 73279601.0, + "step": 60930 + }, + { + "entropy": 1.9192057803273201, + "epoch": 0.1889086680528442, + "grad_norm": 8.917458534240723, + "learning_rate": 5.820634108133325e-06, + "loss": 0.5522, + "mean_token_accuracy": 0.825607393682003, + "num_tokens": 73291372.0, + "step": 60940 + }, + { + "entropy": 1.8995835900306701, + "epoch": 0.1889396671778939, + "grad_norm": 9.669869422912598, + "learning_rate": 5.820156588167624e-06, + "loss": 0.5177, + "mean_token_accuracy": 0.8387888163328171, + "num_tokens": 73302760.0, + "step": 60950 + }, + { + "entropy": 1.9761246114969253, + "epoch": 0.1889706663029436, + "grad_norm": 9.00790786743164, + "learning_rate": 5.819679185708661e-06, + "loss": 0.5676, + "mean_token_accuracy": 0.8249676078557968, + "num_tokens": 73313701.0, + "step": 60960 + }, + { + "entropy": 1.9618436962366104, + "epoch": 0.1890016654279933, + "grad_norm": 9.204923629760742, + "learning_rate": 5.8192019007082515e-06, + "loss": 0.5978, + "mean_token_accuracy": 0.8198722168803215, + "num_tokens": 73324576.0, + "step": 60970 + }, + { + "entropy": 1.8996351152658462, + "epoch": 0.189032664553043, + "grad_norm": 8.046465873718262, + "learning_rate": 5.818724733118237e-06, + "loss": 0.5235, + "mean_token_accuracy": 0.8394160747528077, + "num_tokens": 73335871.0, + "step": 60980 + }, + { + "entropy": 1.8664459705352783, + "epoch": 0.18906366367809269, + "grad_norm": 9.74695873260498, + "learning_rate": 5.8182476828904896e-06, + "loss": 0.5478, + "mean_token_accuracy": 0.8346919432282448, + "num_tokens": 73347256.0, + "step": 60990 + }, + { + "entropy": 1.9113014698028565, + "epoch": 0.18909466280314238, + "grad_norm": 9.092202186584473, + "learning_rate": 5.817770749976909e-06, + "loss": 0.5356, + "mean_token_accuracy": 0.8376618474721909, + "num_tokens": 73358623.0, + "step": 61000 + }, + { + "entropy": 1.8848463252186776, + "epoch": 0.18912566192819208, + "grad_norm": 4.147286891937256, + "learning_rate": 5.817293934329417e-06, + "loss": 0.5323, + "mean_token_accuracy": 0.8313426792621612, + "num_tokens": 73370417.0, + "step": 61010 + }, + { + "entropy": 1.9214195176959037, + "epoch": 0.18915666105324178, + "grad_norm": 7.7611403465271, + "learning_rate": 5.816817235899967e-06, + "loss": 0.5371, + "mean_token_accuracy": 0.83161641061306, + "num_tokens": 73382133.0, + "step": 61020 + }, + { + "entropy": 1.9057343572378158, + "epoch": 0.18918766017829147, + "grad_norm": 9.498258590698242, + "learning_rate": 5.816340654640542e-06, + "loss": 0.5504, + "mean_token_accuracy": 0.8384771049022675, + "num_tokens": 73393440.0, + "step": 61030 + }, + { + "entropy": 1.834838719666004, + "epoch": 0.18921865930334117, + "grad_norm": 4.797052383422852, + "learning_rate": 5.815864190503149e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8543123677372932, + "num_tokens": 73406284.0, + "step": 61040 + }, + { + "entropy": 1.8589958116412162, + "epoch": 0.18924965842839087, + "grad_norm": 8.2996826171875, + "learning_rate": 5.815387843439824e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8410072878003121, + "num_tokens": 73419184.0, + "step": 61050 + }, + { + "entropy": 1.9609655499458314, + "epoch": 0.18928065755344056, + "grad_norm": 8.885133743286133, + "learning_rate": 5.814911613402629e-06, + "loss": 0.5731, + "mean_token_accuracy": 0.8247525811195373, + "num_tokens": 73430755.0, + "step": 61060 + }, + { + "entropy": 1.8497852653265, + "epoch": 0.18931165667849026, + "grad_norm": 8.99361801147461, + "learning_rate": 5.814435500343657e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8407298862934113, + "num_tokens": 73443198.0, + "step": 61070 + }, + { + "entropy": 1.920815998315811, + "epoch": 0.18934265580353996, + "grad_norm": 8.852625846862793, + "learning_rate": 5.813959504215025e-06, + "loss": 0.5761, + "mean_token_accuracy": 0.8310668468475342, + "num_tokens": 73453909.0, + "step": 61080 + }, + { + "entropy": 1.8594837307929992, + "epoch": 0.18937365492858965, + "grad_norm": 5.037121772766113, + "learning_rate": 5.813483624968877e-06, + "loss": 0.5775, + "mean_token_accuracy": 0.82839694917202, + "num_tokens": 73466187.0, + "step": 61090 + }, + { + "entropy": 1.816783571243286, + "epoch": 0.18940465405363935, + "grad_norm": 2.801055669784546, + "learning_rate": 5.813007862557388e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8508325532078743, + "num_tokens": 73479306.0, + "step": 61100 + }, + { + "entropy": 1.9225141122937202, + "epoch": 0.18943565317868905, + "grad_norm": 8.705913543701172, + "learning_rate": 5.812532216932759e-06, + "loss": 0.6119, + "mean_token_accuracy": 0.8328218296170234, + "num_tokens": 73491113.0, + "step": 61110 + }, + { + "entropy": 1.8322004929184914, + "epoch": 0.18946665230373871, + "grad_norm": 3.7541184425354004, + "learning_rate": 5.8120566880472155e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.848419138789177, + "num_tokens": 73503751.0, + "step": 61120 + }, + { + "entropy": 1.910209448635578, + "epoch": 0.1894976514287884, + "grad_norm": 9.48783016204834, + "learning_rate": 5.811581275853014e-06, + "loss": 0.5389, + "mean_token_accuracy": 0.8345631062984467, + "num_tokens": 73515104.0, + "step": 61130 + }, + { + "entropy": 1.8471285864710807, + "epoch": 0.1895286505538381, + "grad_norm": 3.989353656768799, + "learning_rate": 5.811105980302438e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8420290291309357, + "num_tokens": 73527180.0, + "step": 61140 + }, + { + "entropy": 1.8412006407976151, + "epoch": 0.1895596496788878, + "grad_norm": 9.421629905700684, + "learning_rate": 5.810630801347794e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.843493039906025, + "num_tokens": 73540379.0, + "step": 61150 + }, + { + "entropy": 1.89210002720356, + "epoch": 0.1895906488039375, + "grad_norm": 4.73581075668335, + "learning_rate": 5.810155738941422e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8357082083821297, + "num_tokens": 73552901.0, + "step": 61160 + }, + { + "entropy": 1.899315556883812, + "epoch": 0.1896216479289872, + "grad_norm": 7.832334518432617, + "learning_rate": 5.809680793035686e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8473002672195434, + "num_tokens": 73564938.0, + "step": 61170 + }, + { + "entropy": 1.9637476325035095, + "epoch": 0.1896526470540369, + "grad_norm": 10.968403816223145, + "learning_rate": 5.8092059635829754e-06, + "loss": 0.5948, + "mean_token_accuracy": 0.8195749655365944, + "num_tokens": 73576160.0, + "step": 61180 + }, + { + "entropy": 1.9289003670215608, + "epoch": 0.1896836461790866, + "grad_norm": 9.405878067016602, + "learning_rate": 5.8087312505357115e-06, + "loss": 0.5501, + "mean_token_accuracy": 0.8394549310207366, + "num_tokens": 73588394.0, + "step": 61190 + }, + { + "entropy": 1.9212181255221368, + "epoch": 0.1897146453041363, + "grad_norm": 9.955124855041504, + "learning_rate": 5.80825665384634e-06, + "loss": 0.5592, + "mean_token_accuracy": 0.8281317040324211, + "num_tokens": 73600253.0, + "step": 61200 + }, + { + "entropy": 1.8745829716324807, + "epoch": 0.18974564442918598, + "grad_norm": 8.92733097076416, + "learning_rate": 5.807782173467334e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8472078263759613, + "num_tokens": 73613086.0, + "step": 61210 + }, + { + "entropy": 1.931848169863224, + "epoch": 0.18977664355423568, + "grad_norm": 7.844409465789795, + "learning_rate": 5.807307809351192e-06, + "loss": 0.5246, + "mean_token_accuracy": 0.8336080491542817, + "num_tokens": 73624754.0, + "step": 61220 + }, + { + "entropy": 1.915302050113678, + "epoch": 0.18980764267928538, + "grad_norm": 7.905981540679932, + "learning_rate": 5.806833561450444e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.8398157224059105, + "num_tokens": 73636264.0, + "step": 61230 + }, + { + "entropy": 1.980039805173874, + "epoch": 0.18983864180433507, + "grad_norm": 7.911728858947754, + "learning_rate": 5.806359429717643e-06, + "loss": 0.5563, + "mean_token_accuracy": 0.8273883283138275, + "num_tokens": 73647130.0, + "step": 61240 + }, + { + "entropy": 1.9102006241679192, + "epoch": 0.18986964092938477, + "grad_norm": 8.068841934204102, + "learning_rate": 5.805885414105372e-06, + "loss": 0.5615, + "mean_token_accuracy": 0.8346290215849876, + "num_tokens": 73658915.0, + "step": 61250 + }, + { + "entropy": 1.8921155214309693, + "epoch": 0.18990064005443447, + "grad_norm": 9.479058265686035, + "learning_rate": 5.805411514566239e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.8326509952545166, + "num_tokens": 73670760.0, + "step": 61260 + }, + { + "entropy": 1.8476255759596825, + "epoch": 0.18993163917948416, + "grad_norm": 3.8252689838409424, + "learning_rate": 5.804937731052881e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8441173121333122, + "num_tokens": 73683779.0, + "step": 61270 + }, + { + "entropy": 1.8536283493041992, + "epoch": 0.18996263830453386, + "grad_norm": 9.392457008361816, + "learning_rate": 5.80446406351796e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8531845271587372, + "num_tokens": 73695924.0, + "step": 61280 + }, + { + "entropy": 2.0113365948200226, + "epoch": 0.18999363742958356, + "grad_norm": 9.121764183044434, + "learning_rate": 5.803990511914166e-06, + "loss": 0.6117, + "mean_token_accuracy": 0.825063693523407, + "num_tokens": 73706995.0, + "step": 61290 + }, + { + "entropy": 1.8952318519353866, + "epoch": 0.19002463655463325, + "grad_norm": 8.177303314208984, + "learning_rate": 5.803517076194217e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8383290365338325, + "num_tokens": 73718874.0, + "step": 61300 + }, + { + "entropy": 1.9754302859306336, + "epoch": 0.19005563567968295, + "grad_norm": 9.628982543945312, + "learning_rate": 5.803043756310858e-06, + "loss": 0.6201, + "mean_token_accuracy": 0.8155344128608704, + "num_tokens": 73729826.0, + "step": 61310 + }, + { + "entropy": 1.8714706540107726, + "epoch": 0.19008663480473265, + "grad_norm": 8.074376106262207, + "learning_rate": 5.802570552216857e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8505250707268714, + "num_tokens": 73741893.0, + "step": 61320 + }, + { + "entropy": 1.9628629803657531, + "epoch": 0.19011763392978234, + "grad_norm": 4.110509872436523, + "learning_rate": 5.802097463865013e-06, + "loss": 0.5459, + "mean_token_accuracy": 0.8337266921997071, + "num_tokens": 73753334.0, + "step": 61330 + }, + { + "entropy": 1.9020420521497727, + "epoch": 0.19014863305483204, + "grad_norm": 8.163873672485352, + "learning_rate": 5.801624491208153e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8378794327378273, + "num_tokens": 73765817.0, + "step": 61340 + }, + { + "entropy": 1.8595923513174057, + "epoch": 0.19017963217988174, + "grad_norm": 9.472771644592285, + "learning_rate": 5.8011516341991266e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8379010885953904, + "num_tokens": 73778450.0, + "step": 61350 + }, + { + "entropy": 1.9552810430526733, + "epoch": 0.1902106313049314, + "grad_norm": 9.82465648651123, + "learning_rate": 5.800678892790814e-06, + "loss": 0.531, + "mean_token_accuracy": 0.8336144149303436, + "num_tokens": 73789111.0, + "step": 61360 + }, + { + "entropy": 1.8182827576994895, + "epoch": 0.1902416304299811, + "grad_norm": 7.453169822692871, + "learning_rate": 5.80020626693612e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8479637071490288, + "num_tokens": 73801615.0, + "step": 61370 + }, + { + "entropy": 1.791535222530365, + "epoch": 0.1902726295550308, + "grad_norm": 3.9200282096862793, + "learning_rate": 5.799733756587978e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8588753417134285, + "num_tokens": 73815428.0, + "step": 61380 + }, + { + "entropy": 1.8905212104320526, + "epoch": 0.1903036286800805, + "grad_norm": 6.597655773162842, + "learning_rate": 5.799261361699344e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8383428543806076, + "num_tokens": 73826951.0, + "step": 61390 + }, + { + "entropy": 1.8670665681362153, + "epoch": 0.1903346278051302, + "grad_norm": 8.293522834777832, + "learning_rate": 5.798789082223209e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8391018345952034, + "num_tokens": 73839103.0, + "step": 61400 + }, + { + "entropy": 1.7990123346447944, + "epoch": 0.1903656269301799, + "grad_norm": 2.1400680541992188, + "learning_rate": 5.798316918112582e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8424833819270134, + "num_tokens": 73852403.0, + "step": 61410 + }, + { + "entropy": 1.777857731282711, + "epoch": 0.19039662605522958, + "grad_norm": 4.169025897979736, + "learning_rate": 5.797844869320504e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8537265419960022, + "num_tokens": 73866168.0, + "step": 61420 + }, + { + "entropy": 1.9535469591617585, + "epoch": 0.19042762518027928, + "grad_norm": 9.52925968170166, + "learning_rate": 5.797372935800044e-06, + "loss": 0.5682, + "mean_token_accuracy": 0.8247589409351349, + "num_tokens": 73876934.0, + "step": 61430 + }, + { + "entropy": 1.8025505855679511, + "epoch": 0.19045862430532898, + "grad_norm": 3.67460036277771, + "learning_rate": 5.796901117504291e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8523954421281814, + "num_tokens": 73889905.0, + "step": 61440 + }, + { + "entropy": 1.879899947345257, + "epoch": 0.19048962343037867, + "grad_norm": 8.983809471130371, + "learning_rate": 5.796429414386368e-06, + "loss": 0.5415, + "mean_token_accuracy": 0.823103578388691, + "num_tokens": 73901347.0, + "step": 61450 + }, + { + "entropy": 1.9289403408765793, + "epoch": 0.19052062255542837, + "grad_norm": 8.888519287109375, + "learning_rate": 5.7959578263994186e-06, + "loss": 0.5555, + "mean_token_accuracy": 0.8268095463514328, + "num_tokens": 73912721.0, + "step": 61460 + }, + { + "entropy": 1.7729464322328568, + "epoch": 0.19055162168047807, + "grad_norm": 9.862566947937012, + "learning_rate": 5.79548635349662e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8498085349798202, + "num_tokens": 73926277.0, + "step": 61470 + }, + { + "entropy": 1.8908159032464027, + "epoch": 0.19058262080552776, + "grad_norm": 9.228188514709473, + "learning_rate": 5.795014995631168e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.8412437319755555, + "num_tokens": 73937675.0, + "step": 61480 + }, + { + "entropy": 1.8851223319768906, + "epoch": 0.19061361993057746, + "grad_norm": 8.568156242370605, + "learning_rate": 5.794543752756292e-06, + "loss": 0.5504, + "mean_token_accuracy": 0.82836285084486, + "num_tokens": 73949212.0, + "step": 61490 + }, + { + "entropy": 1.9255726218223572, + "epoch": 0.19064461905562716, + "grad_norm": 7.397071838378906, + "learning_rate": 5.794072624825245e-06, + "loss": 0.5881, + "mean_token_accuracy": 0.8346537619829177, + "num_tokens": 73960504.0, + "step": 61500 + }, + { + "entropy": 1.9002389639616013, + "epoch": 0.19067561818067685, + "grad_norm": 9.786722183227539, + "learning_rate": 5.793601611791305e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8421397104859352, + "num_tokens": 73972536.0, + "step": 61510 + }, + { + "entropy": 1.7949311509728432, + "epoch": 0.19070661730572655, + "grad_norm": 9.29088306427002, + "learning_rate": 5.7931307136077804e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8478746846318245, + "num_tokens": 73985554.0, + "step": 61520 + }, + { + "entropy": 1.8513918846845627, + "epoch": 0.19073761643077625, + "grad_norm": 7.442810535430908, + "learning_rate": 5.792659930228004e-06, + "loss": 0.5175, + "mean_token_accuracy": 0.829763513803482, + "num_tokens": 73997021.0, + "step": 61530 + }, + { + "entropy": 1.8692805111408233, + "epoch": 0.19076861555582594, + "grad_norm": 6.96608304977417, + "learning_rate": 5.792189261605333e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8437046900391578, + "num_tokens": 74009028.0, + "step": 61540 + }, + { + "entropy": 1.9384054720401764, + "epoch": 0.19079961468087564, + "grad_norm": 8.266419410705566, + "learning_rate": 5.791718707693156e-06, + "loss": 0.5769, + "mean_token_accuracy": 0.830648484826088, + "num_tokens": 74020561.0, + "step": 61550 + }, + { + "entropy": 1.869347333908081, + "epoch": 0.19083061380592534, + "grad_norm": 4.296153545379639, + "learning_rate": 5.7912482684448845e-06, + "loss": 0.5328, + "mean_token_accuracy": 0.8308837920427322, + "num_tokens": 74032486.0, + "step": 61560 + }, + { + "entropy": 1.8627609625458716, + "epoch": 0.19086161293097503, + "grad_norm": 8.509102821350098, + "learning_rate": 5.790777943813958e-06, + "loss": 0.5362, + "mean_token_accuracy": 0.8389568880200386, + "num_tokens": 74044672.0, + "step": 61570 + }, + { + "entropy": 1.8437671825289725, + "epoch": 0.19089261205602473, + "grad_norm": 8.853963851928711, + "learning_rate": 5.79030773375384e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.8353106841444969, + "num_tokens": 74057007.0, + "step": 61580 + }, + { + "entropy": 1.9143078982830048, + "epoch": 0.19092361118107443, + "grad_norm": 7.705349445343018, + "learning_rate": 5.789837638218024e-06, + "loss": 0.52, + "mean_token_accuracy": 0.8365962967276573, + "num_tokens": 74068293.0, + "step": 61590 + }, + { + "entropy": 1.8211023643612863, + "epoch": 0.19095461030612412, + "grad_norm": 8.679984092712402, + "learning_rate": 5.78936765716003e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8368862703442573, + "num_tokens": 74080738.0, + "step": 61600 + }, + { + "entropy": 1.9360493808984756, + "epoch": 0.1909856094311738, + "grad_norm": 8.391457557678223, + "learning_rate": 5.788897790533401e-06, + "loss": 0.583, + "mean_token_accuracy": 0.8265061900019646, + "num_tokens": 74091611.0, + "step": 61610 + }, + { + "entropy": 1.8157276138663292, + "epoch": 0.1910166085562235, + "grad_norm": 8.302699089050293, + "learning_rate": 5.788428038291707e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8514835327863693, + "num_tokens": 74104091.0, + "step": 61620 + }, + { + "entropy": 1.9097129538655282, + "epoch": 0.19104760768127318, + "grad_norm": 10.248120307922363, + "learning_rate": 5.787958400388546e-06, + "loss": 0.5394, + "mean_token_accuracy": 0.8318827226758003, + "num_tokens": 74115663.0, + "step": 61630 + }, + { + "entropy": 1.805907167494297, + "epoch": 0.19107860680632288, + "grad_norm": 9.584233283996582, + "learning_rate": 5.787488876777544e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8393665820360183, + "num_tokens": 74127987.0, + "step": 61640 + }, + { + "entropy": 1.772979559749365, + "epoch": 0.19110960593137258, + "grad_norm": 7.392131805419922, + "learning_rate": 5.78701946741235e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8417062953114509, + "num_tokens": 74141723.0, + "step": 61650 + }, + { + "entropy": 1.9116898834705354, + "epoch": 0.19114060505642227, + "grad_norm": 9.686699867248535, + "learning_rate": 5.786550172246639e-06, + "loss": 0.5491, + "mean_token_accuracy": 0.8334579989314079, + "num_tokens": 74152615.0, + "step": 61660 + }, + { + "entropy": 1.8346798852086068, + "epoch": 0.19117160418147197, + "grad_norm": 7.035947799682617, + "learning_rate": 5.786080991234115e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8480042293667793, + "num_tokens": 74164838.0, + "step": 61670 + }, + { + "entropy": 1.822737891972065, + "epoch": 0.19120260330652167, + "grad_norm": 9.475711822509766, + "learning_rate": 5.785611924328507e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8468611136078834, + "num_tokens": 74177191.0, + "step": 61680 + }, + { + "entropy": 1.8375527679920196, + "epoch": 0.19123360243157136, + "grad_norm": 6.906650543212891, + "learning_rate": 5.785142971483572e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8478382468223572, + "num_tokens": 74189816.0, + "step": 61690 + }, + { + "entropy": 1.7865667924284936, + "epoch": 0.19126460155662106, + "grad_norm": 4.482153415679932, + "learning_rate": 5.784674132653088e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8439031273126603, + "num_tokens": 74202820.0, + "step": 61700 + }, + { + "entropy": 1.8899090319871903, + "epoch": 0.19129560068167076, + "grad_norm": 8.343757629394531, + "learning_rate": 5.784205407790866e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8452587991952896, + "num_tokens": 74214375.0, + "step": 61710 + }, + { + "entropy": 1.83342125415802, + "epoch": 0.19132659980672045, + "grad_norm": 8.29771614074707, + "learning_rate": 5.783736796850737e-06, + "loss": 0.5425, + "mean_token_accuracy": 0.8335444629192352, + "num_tokens": 74227751.0, + "step": 61720 + }, + { + "entropy": 1.961284738779068, + "epoch": 0.19135759893177015, + "grad_norm": 9.149755477905273, + "learning_rate": 5.783268299786564e-06, + "loss": 0.544, + "mean_token_accuracy": 0.833545659482479, + "num_tokens": 74238590.0, + "step": 61730 + }, + { + "entropy": 1.8501825496554374, + "epoch": 0.19138859805681985, + "grad_norm": 10.697173118591309, + "learning_rate": 5.782799916552232e-06, + "loss": 0.5199, + "mean_token_accuracy": 0.8350521355867386, + "num_tokens": 74250667.0, + "step": 61740 + }, + { + "entropy": 1.9134044259786607, + "epoch": 0.19141959718186954, + "grad_norm": 6.616395950317383, + "learning_rate": 5.782331647101653e-06, + "loss": 0.5406, + "mean_token_accuracy": 0.8395699933171272, + "num_tokens": 74261958.0, + "step": 61750 + }, + { + "entropy": 1.9204273253679276, + "epoch": 0.19145059630691924, + "grad_norm": 10.518773078918457, + "learning_rate": 5.781863491388767e-06, + "loss": 0.5411, + "mean_token_accuracy": 0.8439541757106781, + "num_tokens": 74272746.0, + "step": 61760 + }, + { + "entropy": 1.8314485549926758, + "epoch": 0.19148159543196894, + "grad_norm": 8.12490463256836, + "learning_rate": 5.781395449367536e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8424828752875329, + "num_tokens": 74285787.0, + "step": 61770 + }, + { + "entropy": 1.894325715303421, + "epoch": 0.19151259455701863, + "grad_norm": 10.066198348999023, + "learning_rate": 5.780927520991953e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8336468815803528, + "num_tokens": 74298311.0, + "step": 61780 + }, + { + "entropy": 1.9529366761445999, + "epoch": 0.19154359368206833, + "grad_norm": 9.384862899780273, + "learning_rate": 5.780459706216036e-06, + "loss": 0.5579, + "mean_token_accuracy": 0.8310287207365036, + "num_tokens": 74309228.0, + "step": 61790 + }, + { + "entropy": 1.962406338751316, + "epoch": 0.19157459280711803, + "grad_norm": 7.662502288818359, + "learning_rate": 5.779992004993824e-06, + "loss": 0.5665, + "mean_token_accuracy": 0.8225008681416511, + "num_tokens": 74320341.0, + "step": 61800 + }, + { + "entropy": 1.9238436996936799, + "epoch": 0.19160559193216772, + "grad_norm": 9.47153091430664, + "learning_rate": 5.779524417279388e-06, + "loss": 0.526, + "mean_token_accuracy": 0.836972689628601, + "num_tokens": 74332180.0, + "step": 61810 + }, + { + "entropy": 1.933986322581768, + "epoch": 0.19163659105721742, + "grad_norm": 7.185315132141113, + "learning_rate": 5.779056943026824e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8494284927845002, + "num_tokens": 74343712.0, + "step": 61820 + }, + { + "entropy": 1.920379176735878, + "epoch": 0.19166759018226712, + "grad_norm": 4.159258842468262, + "learning_rate": 5.778589582190252e-06, + "loss": 0.489, + "mean_token_accuracy": 0.836681205034256, + "num_tokens": 74356004.0, + "step": 61830 + }, + { + "entropy": 1.8799566522240638, + "epoch": 0.1916985893073168, + "grad_norm": 8.231870651245117, + "learning_rate": 5.778122334723817e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.8473984330892563, + "num_tokens": 74368305.0, + "step": 61840 + }, + { + "entropy": 1.893815641105175, + "epoch": 0.1917295884323665, + "grad_norm": 4.571141719818115, + "learning_rate": 5.777655200581693e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8461870953440667, + "num_tokens": 74380533.0, + "step": 61850 + }, + { + "entropy": 1.8599497005343437, + "epoch": 0.19176058755741618, + "grad_norm": 4.433168411254883, + "learning_rate": 5.77718817971808e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8428743481636047, + "num_tokens": 74393564.0, + "step": 61860 + }, + { + "entropy": 1.9357849955558777, + "epoch": 0.19179158668246588, + "grad_norm": 8.343416213989258, + "learning_rate": 5.776721272087201e-06, + "loss": 0.5484, + "mean_token_accuracy": 0.8426366299390793, + "num_tokens": 74405577.0, + "step": 61870 + }, + { + "entropy": 2.0002988666296004, + "epoch": 0.19182258580751557, + "grad_norm": 8.28388500213623, + "learning_rate": 5.776254477643307e-06, + "loss": 0.5856, + "mean_token_accuracy": 0.8197537645697593, + "num_tokens": 74416662.0, + "step": 61880 + }, + { + "entropy": 1.8667000889778138, + "epoch": 0.19185358493256527, + "grad_norm": 10.070082664489746, + "learning_rate": 5.775787796340675e-06, + "loss": 0.5541, + "mean_token_accuracy": 0.8392659932374954, + "num_tokens": 74428842.0, + "step": 61890 + }, + { + "entropy": 1.8394625827670097, + "epoch": 0.19188458405761497, + "grad_norm": 2.4154021739959717, + "learning_rate": 5.775321228133606e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8550085946917534, + "num_tokens": 74441442.0, + "step": 61900 + }, + { + "entropy": 1.8814760237932204, + "epoch": 0.19191558318266466, + "grad_norm": 8.660712242126465, + "learning_rate": 5.77485477297643e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8466302067041397, + "num_tokens": 74453829.0, + "step": 61910 + }, + { + "entropy": 1.8994119971990586, + "epoch": 0.19194658230771436, + "grad_norm": 8.826970100402832, + "learning_rate": 5.774388430823499e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.8399457424879074, + "num_tokens": 74464456.0, + "step": 61920 + }, + { + "entropy": 1.9710954904556275, + "epoch": 0.19197758143276406, + "grad_norm": 7.8489155769348145, + "learning_rate": 5.773922201629193e-06, + "loss": 0.5579, + "mean_token_accuracy": 0.8248024433851242, + "num_tokens": 74475138.0, + "step": 61930 + }, + { + "entropy": 1.8401651561260224, + "epoch": 0.19200858055781375, + "grad_norm": 9.654853820800781, + "learning_rate": 5.7734560853479185e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8436680495738983, + "num_tokens": 74488049.0, + "step": 61940 + }, + { + "entropy": 1.839670716226101, + "epoch": 0.19203957968286345, + "grad_norm": 8.641029357910156, + "learning_rate": 5.772990081934104e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8443516567349434, + "num_tokens": 74500733.0, + "step": 61950 + }, + { + "entropy": 1.8809733077883721, + "epoch": 0.19207057880791314, + "grad_norm": 7.769062519073486, + "learning_rate": 5.772524191342211e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8486372217535972, + "num_tokens": 74512926.0, + "step": 61960 + }, + { + "entropy": 1.8726215928792953, + "epoch": 0.19210157793296284, + "grad_norm": 9.620684623718262, + "learning_rate": 5.77205841352672e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8346716165542603, + "num_tokens": 74524851.0, + "step": 61970 + }, + { + "entropy": 1.8348112776875496, + "epoch": 0.19213257705801254, + "grad_norm": 9.443613052368164, + "learning_rate": 5.771592748442137e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8425130292773246, + "num_tokens": 74537835.0, + "step": 61980 + }, + { + "entropy": 1.871398164331913, + "epoch": 0.19216357618306223, + "grad_norm": 4.7039875984191895, + "learning_rate": 5.771127196042999e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8458441883325577, + "num_tokens": 74549780.0, + "step": 61990 + }, + { + "entropy": 1.8827469125390053, + "epoch": 0.19219457530811193, + "grad_norm": 10.308496475219727, + "learning_rate": 5.770661756283866e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8351684272289276, + "num_tokens": 74561738.0, + "step": 62000 + }, + { + "entropy": 1.9043960571289062, + "epoch": 0.19222557443316163, + "grad_norm": 7.712822914123535, + "learning_rate": 5.7701964291193214e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.8395509481430053, + "num_tokens": 74573112.0, + "step": 62010 + }, + { + "entropy": 1.9056378319859504, + "epoch": 0.19225657355821132, + "grad_norm": 9.439181327819824, + "learning_rate": 5.769731214503978e-06, + "loss": 0.5181, + "mean_token_accuracy": 0.8385317623615265, + "num_tokens": 74583987.0, + "step": 62020 + }, + { + "entropy": 1.8788242667913437, + "epoch": 0.19228757268326102, + "grad_norm": 3.4620392322540283, + "learning_rate": 5.76926611239247e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8428028956055641, + "num_tokens": 74595794.0, + "step": 62030 + }, + { + "entropy": 1.8820303320884704, + "epoch": 0.19231857180831072, + "grad_norm": 10.090309143066406, + "learning_rate": 5.7688011227394625e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8451215922832489, + "num_tokens": 74607298.0, + "step": 62040 + }, + { + "entropy": 1.886990125477314, + "epoch": 0.19234957093336041, + "grad_norm": 8.511155128479004, + "learning_rate": 5.768336245499641e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8422086775302887, + "num_tokens": 74619848.0, + "step": 62050 + }, + { + "entropy": 1.9199146822094917, + "epoch": 0.1923805700584101, + "grad_norm": 8.11715316772461, + "learning_rate": 5.767871480627723e-06, + "loss": 0.5289, + "mean_token_accuracy": 0.8385195031762123, + "num_tokens": 74631619.0, + "step": 62060 + }, + { + "entropy": 1.8858480796217918, + "epoch": 0.1924115691834598, + "grad_norm": 7.366301536560059, + "learning_rate": 5.767406828078441e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8323479920625687, + "num_tokens": 74643673.0, + "step": 62070 + }, + { + "entropy": 1.8879368424415588, + "epoch": 0.1924425683085095, + "grad_norm": 5.003079414367676, + "learning_rate": 5.766942287806564e-06, + "loss": 0.5219, + "mean_token_accuracy": 0.8332347095012664, + "num_tokens": 74656285.0, + "step": 62080 + }, + { + "entropy": 1.8710455060005189, + "epoch": 0.1924735674335592, + "grad_norm": 3.315375804901123, + "learning_rate": 5.766477859766882e-06, + "loss": 0.5307, + "mean_token_accuracy": 0.829591977596283, + "num_tokens": 74668414.0, + "step": 62090 + }, + { + "entropy": 1.8381486520171166, + "epoch": 0.19250456655860887, + "grad_norm": 7.80744743347168, + "learning_rate": 5.766013543914207e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8402635097503662, + "num_tokens": 74680230.0, + "step": 62100 + }, + { + "entropy": 1.9462435692548752, + "epoch": 0.19253556568365857, + "grad_norm": 8.435437202453613, + "learning_rate": 5.7655493402033836e-06, + "loss": 0.5916, + "mean_token_accuracy": 0.8307736337184906, + "num_tokens": 74692003.0, + "step": 62110 + }, + { + "entropy": 1.903138768672943, + "epoch": 0.19256656480870826, + "grad_norm": 8.948243141174316, + "learning_rate": 5.7650852485892765e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8443427816033363, + "num_tokens": 74703411.0, + "step": 62120 + }, + { + "entropy": 1.8868961855769157, + "epoch": 0.19259756393375796, + "grad_norm": 11.374164581298828, + "learning_rate": 5.764621269026775e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8403722077608109, + "num_tokens": 74715521.0, + "step": 62130 + }, + { + "entropy": 1.892562797665596, + "epoch": 0.19262856305880766, + "grad_norm": 7.841557025909424, + "learning_rate": 5.764157401470803e-06, + "loss": 0.5442, + "mean_token_accuracy": 0.8378325685858726, + "num_tokens": 74727068.0, + "step": 62140 + }, + { + "entropy": 1.7926482424139976, + "epoch": 0.19265956218385735, + "grad_norm": 3.997764825820923, + "learning_rate": 5.763693645876296e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8492718085646629, + "num_tokens": 74740269.0, + "step": 62150 + }, + { + "entropy": 1.866290318965912, + "epoch": 0.19269056130890705, + "grad_norm": 4.6976494789123535, + "learning_rate": 5.763230002198225e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8362017720937729, + "num_tokens": 74751597.0, + "step": 62160 + }, + { + "entropy": 1.8112548857927322, + "epoch": 0.19272156043395675, + "grad_norm": 8.123526573181152, + "learning_rate": 5.762766470391583e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8409424394369125, + "num_tokens": 74764989.0, + "step": 62170 + }, + { + "entropy": 1.753453615307808, + "epoch": 0.19275255955900644, + "grad_norm": 4.4287848472595215, + "learning_rate": 5.762303050411388e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8451619282364845, + "num_tokens": 74778762.0, + "step": 62180 + }, + { + "entropy": 1.929175502061844, + "epoch": 0.19278355868405614, + "grad_norm": 9.11049747467041, + "learning_rate": 5.761839742212686e-06, + "loss": 0.5584, + "mean_token_accuracy": 0.841756422817707, + "num_tokens": 74789840.0, + "step": 62190 + }, + { + "entropy": 1.9000461488962173, + "epoch": 0.19281455780910584, + "grad_norm": 4.639410972595215, + "learning_rate": 5.761376545750544e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8386220842599869, + "num_tokens": 74801445.0, + "step": 62200 + }, + { + "entropy": 1.8612179577350616, + "epoch": 0.19284555693415553, + "grad_norm": 9.239517211914062, + "learning_rate": 5.760913460980057e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8468897223472596, + "num_tokens": 74813988.0, + "step": 62210 + }, + { + "entropy": 1.7820834949612618, + "epoch": 0.19287655605920523, + "grad_norm": 3.959775447845459, + "learning_rate": 5.760450487856346e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8533436298370362, + "num_tokens": 74827651.0, + "step": 62220 + }, + { + "entropy": 1.8692509412765503, + "epoch": 0.19290755518425493, + "grad_norm": 9.524608612060547, + "learning_rate": 5.759987626334555e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8503947392106056, + "num_tokens": 74840095.0, + "step": 62230 + }, + { + "entropy": 1.9380140736699105, + "epoch": 0.19293855430930462, + "grad_norm": 8.502176284790039, + "learning_rate": 5.759524876369853e-06, + "loss": 0.5736, + "mean_token_accuracy": 0.8298733696341515, + "num_tokens": 74851878.0, + "step": 62240 + }, + { + "entropy": 1.928398758172989, + "epoch": 0.19296955343435432, + "grad_norm": 9.923539161682129, + "learning_rate": 5.759062237917436e-06, + "loss": 0.523, + "mean_token_accuracy": 0.8408984065055847, + "num_tokens": 74862822.0, + "step": 62250 + }, + { + "entropy": 1.8122706308960914, + "epoch": 0.19300055255940401, + "grad_norm": 8.388425827026367, + "learning_rate": 5.758599710932528e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8447504699230194, + "num_tokens": 74875649.0, + "step": 62260 + }, + { + "entropy": 1.8911856144666672, + "epoch": 0.1930315516844537, + "grad_norm": 8.741740226745605, + "learning_rate": 5.75813729537037e-06, + "loss": 0.5357, + "mean_token_accuracy": 0.8428622677922248, + "num_tokens": 74887627.0, + "step": 62270 + }, + { + "entropy": 1.86591185182333, + "epoch": 0.1930625508095034, + "grad_norm": 8.541808128356934, + "learning_rate": 5.757674991186235e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8314554423093796, + "num_tokens": 74900605.0, + "step": 62280 + }, + { + "entropy": 1.926130199432373, + "epoch": 0.1930935499345531, + "grad_norm": 7.839807510375977, + "learning_rate": 5.75721279833542e-06, + "loss": 0.5659, + "mean_token_accuracy": 0.8310964539647102, + "num_tokens": 74912462.0, + "step": 62290 + }, + { + "entropy": 1.8626934379339217, + "epoch": 0.1931245490596028, + "grad_norm": 11.213129997253418, + "learning_rate": 5.756750716773244e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8312896370887757, + "num_tokens": 74924678.0, + "step": 62300 + }, + { + "entropy": 1.9561289519071579, + "epoch": 0.1931555481846525, + "grad_norm": 9.022578239440918, + "learning_rate": 5.7562887464550565e-06, + "loss": 0.5684, + "mean_token_accuracy": 0.83260398209095, + "num_tokens": 74935522.0, + "step": 62310 + }, + { + "entropy": 1.935487399995327, + "epoch": 0.1931865473097022, + "grad_norm": 8.986858367919922, + "learning_rate": 5.755826887336227e-06, + "loss": 0.5735, + "mean_token_accuracy": 0.8362299337983131, + "num_tokens": 74947411.0, + "step": 62320 + }, + { + "entropy": 1.8337993949651719, + "epoch": 0.1932175464347519, + "grad_norm": 8.892833709716797, + "learning_rate": 5.75536513937215e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.840664692223072, + "num_tokens": 74961058.0, + "step": 62330 + }, + { + "entropy": 1.9785827055573464, + "epoch": 0.1932485455598016, + "grad_norm": 8.412293434143066, + "learning_rate": 5.7549035025182494e-06, + "loss": 0.5438, + "mean_token_accuracy": 0.8272167652845382, + "num_tokens": 74973029.0, + "step": 62340 + }, + { + "entropy": 1.8726650208234787, + "epoch": 0.19327954468485126, + "grad_norm": 3.809394598007202, + "learning_rate": 5.754441976729972e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.846262401342392, + "num_tokens": 74985904.0, + "step": 62350 + }, + { + "entropy": 1.8960716605186463, + "epoch": 0.19331054380990095, + "grad_norm": 4.894566535949707, + "learning_rate": 5.753980561962787e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.846526712179184, + "num_tokens": 74997658.0, + "step": 62360 + }, + { + "entropy": 1.9618462428450585, + "epoch": 0.19334154293495065, + "grad_norm": 7.828151226043701, + "learning_rate": 5.753519258172194e-06, + "loss": 0.5315, + "mean_token_accuracy": 0.8398016721010209, + "num_tokens": 75009078.0, + "step": 62370 + }, + { + "entropy": 1.8819516450166702, + "epoch": 0.19337254206000035, + "grad_norm": 8.260143280029297, + "learning_rate": 5.753058065313714e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.8390106439590455, + "num_tokens": 75021094.0, + "step": 62380 + }, + { + "entropy": 1.8420429840683936, + "epoch": 0.19340354118505004, + "grad_norm": 7.1242475509643555, + "learning_rate": 5.7525969833428895e-06, + "loss": 0.5266, + "mean_token_accuracy": 0.834058178961277, + "num_tokens": 75033633.0, + "step": 62390 + }, + { + "entropy": 1.9346123069524765, + "epoch": 0.19343454031009974, + "grad_norm": 7.526602268218994, + "learning_rate": 5.752136012215297e-06, + "loss": 0.5595, + "mean_token_accuracy": 0.8317095413804054, + "num_tokens": 75045262.0, + "step": 62400 + }, + { + "entropy": 1.9385161980986596, + "epoch": 0.19346553943514944, + "grad_norm": 7.26128625869751, + "learning_rate": 5.751675151886529e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8422947779297829, + "num_tokens": 75057520.0, + "step": 62410 + }, + { + "entropy": 1.8083595961332322, + "epoch": 0.19349653856019913, + "grad_norm": 8.095551490783691, + "learning_rate": 5.751214402312208e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8498411178588867, + "num_tokens": 75070750.0, + "step": 62420 + }, + { + "entropy": 1.8618351459503173, + "epoch": 0.19352753768524883, + "grad_norm": 9.302799224853516, + "learning_rate": 5.750753763447981e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8428505852818489, + "num_tokens": 75083142.0, + "step": 62430 + }, + { + "entropy": 1.874634511768818, + "epoch": 0.19355853681029853, + "grad_norm": 8.7201566696167, + "learning_rate": 5.750293235249518e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8446508884429932, + "num_tokens": 75094939.0, + "step": 62440 + }, + { + "entropy": 1.9367821574211121, + "epoch": 0.19358953593534822, + "grad_norm": 8.614350318908691, + "learning_rate": 5.749832817672515e-06, + "loss": 0.5362, + "mean_token_accuracy": 0.8392156153917313, + "num_tokens": 75106355.0, + "step": 62450 + }, + { + "entropy": 1.873591238260269, + "epoch": 0.19362053506039792, + "grad_norm": 9.169925689697266, + "learning_rate": 5.749372510672692e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8400036081671715, + "num_tokens": 75117711.0, + "step": 62460 + }, + { + "entropy": 1.8877021759748458, + "epoch": 0.19365153418544762, + "grad_norm": 4.023455619812012, + "learning_rate": 5.748912314205795e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8442784741520881, + "num_tokens": 75130177.0, + "step": 62470 + }, + { + "entropy": 1.9278222426772118, + "epoch": 0.1936825333104973, + "grad_norm": 8.48756217956543, + "learning_rate": 5.748452228227594e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8467386439442635, + "num_tokens": 75141376.0, + "step": 62480 + }, + { + "entropy": 1.8919538147747517, + "epoch": 0.193713532435547, + "grad_norm": 8.26032829284668, + "learning_rate": 5.7479922526938844e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8378899380564689, + "num_tokens": 75153872.0, + "step": 62490 + }, + { + "entropy": 1.975018060207367, + "epoch": 0.1937445315605967, + "grad_norm": 12.015787124633789, + "learning_rate": 5.747532387560486e-06, + "loss": 0.5763, + "mean_token_accuracy": 0.8265995383262634, + "num_tokens": 75164449.0, + "step": 62500 + }, + { + "entropy": 1.9173302993178367, + "epoch": 0.1937755306856464, + "grad_norm": 9.031746864318848, + "learning_rate": 5.747072632783242e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8385538429021835, + "num_tokens": 75176299.0, + "step": 62510 + }, + { + "entropy": 1.9593890875577926, + "epoch": 0.1938065298106961, + "grad_norm": 9.437821388244629, + "learning_rate": 5.746612988318023e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.8430341646075249, + "num_tokens": 75187861.0, + "step": 62520 + }, + { + "entropy": 1.8832410350441933, + "epoch": 0.1938375289357458, + "grad_norm": 13.652206420898438, + "learning_rate": 5.7461534541207234e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8412627220153809, + "num_tokens": 75200163.0, + "step": 62530 + }, + { + "entropy": 1.9353638172149659, + "epoch": 0.1938685280607955, + "grad_norm": 7.707444667816162, + "learning_rate": 5.745694030147259e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.839554651081562, + "num_tokens": 75211123.0, + "step": 62540 + }, + { + "entropy": 1.8284053675830365, + "epoch": 0.1938995271858452, + "grad_norm": 8.445575714111328, + "learning_rate": 5.745234716353575e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.84172955006361, + "num_tokens": 75223852.0, + "step": 62550 + }, + { + "entropy": 1.9116384595632554, + "epoch": 0.19393052631089489, + "grad_norm": 8.664936065673828, + "learning_rate": 5.744775512695639e-06, + "loss": 0.5437, + "mean_token_accuracy": 0.8343887582421303, + "num_tokens": 75235614.0, + "step": 62560 + }, + { + "entropy": 1.8100930273532867, + "epoch": 0.19396152543594458, + "grad_norm": 7.181854724884033, + "learning_rate": 5.744316419129445e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8528478279709816, + "num_tokens": 75248586.0, + "step": 62570 + }, + { + "entropy": 1.9202060773968697, + "epoch": 0.19399252456099428, + "grad_norm": 3.0128285884857178, + "learning_rate": 5.743857435611008e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.8501577556133271, + "num_tokens": 75259638.0, + "step": 62580 + }, + { + "entropy": 1.782825130224228, + "epoch": 0.19402352368604397, + "grad_norm": 9.068933486938477, + "learning_rate": 5.743398562096369e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8459088578820229, + "num_tokens": 75273019.0, + "step": 62590 + }, + { + "entropy": 1.912159901857376, + "epoch": 0.19405452281109364, + "grad_norm": 8.978429794311523, + "learning_rate": 5.742939798541598e-06, + "loss": 0.5199, + "mean_token_accuracy": 0.8394264042377472, + "num_tokens": 75284329.0, + "step": 62600 + }, + { + "entropy": 1.9709818661212921, + "epoch": 0.19408552193614334, + "grad_norm": 8.043622970581055, + "learning_rate": 5.742481144902782e-06, + "loss": 0.5889, + "mean_token_accuracy": 0.8323390766978264, + "num_tokens": 75295155.0, + "step": 62610 + }, + { + "entropy": 1.8821393936872481, + "epoch": 0.19411652106119304, + "grad_norm": 7.908102989196777, + "learning_rate": 5.742022601136038e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8384183317422866, + "num_tokens": 75306535.0, + "step": 62620 + }, + { + "entropy": 1.8917057454586028, + "epoch": 0.19414752018624273, + "grad_norm": 4.215620994567871, + "learning_rate": 5.741564167197507e-06, + "loss": 0.514, + "mean_token_accuracy": 0.8402154177427292, + "num_tokens": 75318253.0, + "step": 62630 + }, + { + "entropy": 1.9241933241486548, + "epoch": 0.19417851931129243, + "grad_norm": 9.891353607177734, + "learning_rate": 5.741105843043353e-06, + "loss": 0.5965, + "mean_token_accuracy": 0.818701134622097, + "num_tokens": 75329353.0, + "step": 62640 + }, + { + "entropy": 1.9373375624418259, + "epoch": 0.19420951843634213, + "grad_norm": 9.279706954956055, + "learning_rate": 5.740647628629763e-06, + "loss": 0.525, + "mean_token_accuracy": 0.8327772691845894, + "num_tokens": 75340932.0, + "step": 62650 + }, + { + "entropy": 1.8756248265504838, + "epoch": 0.19424051756139182, + "grad_norm": 5.068475723266602, + "learning_rate": 5.740189523912952e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.8361865937709808, + "num_tokens": 75353705.0, + "step": 62660 + }, + { + "entropy": 1.9200247406959534, + "epoch": 0.19427151668644152, + "grad_norm": 8.971196174621582, + "learning_rate": 5.7397315288491585e-06, + "loss": 0.553, + "mean_token_accuracy": 0.8186924889683723, + "num_tokens": 75365245.0, + "step": 62670 + }, + { + "entropy": 1.87558753490448, + "epoch": 0.19430251581149122, + "grad_norm": 8.281537055969238, + "learning_rate": 5.7392736433946424e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8481769561767578, + "num_tokens": 75377140.0, + "step": 62680 + }, + { + "entropy": 1.904216541349888, + "epoch": 0.1943335149365409, + "grad_norm": 8.079618453979492, + "learning_rate": 5.738815867505695e-06, + "loss": 0.5325, + "mean_token_accuracy": 0.8357766851782799, + "num_tokens": 75389067.0, + "step": 62690 + }, + { + "entropy": 1.8466911643743515, + "epoch": 0.1943645140615906, + "grad_norm": 8.569628715515137, + "learning_rate": 5.7383582011386215e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8424016252160073, + "num_tokens": 75401290.0, + "step": 62700 + }, + { + "entropy": 1.8429038256406785, + "epoch": 0.1943955131866403, + "grad_norm": 4.195886135101318, + "learning_rate": 5.737900644249762e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8370422974228859, + "num_tokens": 75414571.0, + "step": 62710 + }, + { + "entropy": 1.870844414830208, + "epoch": 0.19442651231169, + "grad_norm": 10.276758193969727, + "learning_rate": 5.737443196795474e-06, + "loss": 0.5647, + "mean_token_accuracy": 0.836325615644455, + "num_tokens": 75426778.0, + "step": 62720 + }, + { + "entropy": 1.8725099056959151, + "epoch": 0.1944575114367397, + "grad_norm": 7.546666622161865, + "learning_rate": 5.736985858732143e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8451737076044082, + "num_tokens": 75438333.0, + "step": 62730 + }, + { + "entropy": 1.9273310750722885, + "epoch": 0.1944885105617894, + "grad_norm": 9.644882202148438, + "learning_rate": 5.736528630016177e-06, + "loss": 0.5543, + "mean_token_accuracy": 0.8291450396180153, + "num_tokens": 75450773.0, + "step": 62740 + }, + { + "entropy": 1.9267635852098466, + "epoch": 0.1945195096868391, + "grad_norm": 3.730067729949951, + "learning_rate": 5.7360715106040076e-06, + "loss": 0.5217, + "mean_token_accuracy": 0.8472494944930077, + "num_tokens": 75462100.0, + "step": 62750 + }, + { + "entropy": 1.8266283124685287, + "epoch": 0.1945505088118888, + "grad_norm": 7.616328239440918, + "learning_rate": 5.735614500452095e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8459511280059815, + "num_tokens": 75475416.0, + "step": 62760 + }, + { + "entropy": 1.9178658738732337, + "epoch": 0.19458150793693849, + "grad_norm": 9.340888977050781, + "learning_rate": 5.7351575995169186e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.835166348516941, + "num_tokens": 75487885.0, + "step": 62770 + }, + { + "entropy": 1.8748321622610091, + "epoch": 0.19461250706198818, + "grad_norm": 3.991840124130249, + "learning_rate": 5.734700807754984e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8418037429451942, + "num_tokens": 75499947.0, + "step": 62780 + }, + { + "entropy": 1.8018044173717498, + "epoch": 0.19464350618703788, + "grad_norm": 8.51059341430664, + "learning_rate": 5.734244125122822e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8524055883288384, + "num_tokens": 75513475.0, + "step": 62790 + }, + { + "entropy": 1.9655559062957764, + "epoch": 0.19467450531208758, + "grad_norm": 10.379049301147461, + "learning_rate": 5.733787551576987e-06, + "loss": 0.5602, + "mean_token_accuracy": 0.8376842170953751, + "num_tokens": 75524934.0, + "step": 62800 + }, + { + "entropy": 1.880786569416523, + "epoch": 0.19470550443713727, + "grad_norm": 8.091976165771484, + "learning_rate": 5.7333310870740565e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8456220507621766, + "num_tokens": 75537225.0, + "step": 62810 + }, + { + "entropy": 1.9510624960064888, + "epoch": 0.19473650356218697, + "grad_norm": 9.744572639465332, + "learning_rate": 5.732874731570633e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8450943067669868, + "num_tokens": 75548663.0, + "step": 62820 + }, + { + "entropy": 1.9875933229923248, + "epoch": 0.19476750268723667, + "grad_norm": 8.468138694763184, + "learning_rate": 5.732418485023345e-06, + "loss": 0.5636, + "mean_token_accuracy": 0.8282896682620049, + "num_tokens": 75559697.0, + "step": 62830 + }, + { + "entropy": 1.8670821204781531, + "epoch": 0.19479850181228636, + "grad_norm": 9.108405113220215, + "learning_rate": 5.731962347388841e-06, + "loss": 0.5391, + "mean_token_accuracy": 0.8396436840295791, + "num_tokens": 75571652.0, + "step": 62840 + }, + { + "entropy": 1.8874829977750778, + "epoch": 0.19482950093733603, + "grad_norm": 7.285294055938721, + "learning_rate": 5.731506318623798e-06, + "loss": 0.5263, + "mean_token_accuracy": 0.836583924293518, + "num_tokens": 75583725.0, + "step": 62850 + }, + { + "entropy": 1.8510995037853717, + "epoch": 0.19486050006238573, + "grad_norm": 8.491140365600586, + "learning_rate": 5.731050398684913e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8445952758193016, + "num_tokens": 75596812.0, + "step": 62860 + }, + { + "entropy": 1.9246593803167342, + "epoch": 0.19489149918743542, + "grad_norm": 8.031885147094727, + "learning_rate": 5.7305945875289126e-06, + "loss": 0.5581, + "mean_token_accuracy": 0.8263408780097962, + "num_tokens": 75609185.0, + "step": 62870 + }, + { + "entropy": 1.9077463194727897, + "epoch": 0.19492249831248512, + "grad_norm": 8.196245193481445, + "learning_rate": 5.730138885112542e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.837520606815815, + "num_tokens": 75621376.0, + "step": 62880 + }, + { + "entropy": 1.9066799104213714, + "epoch": 0.19495349743753482, + "grad_norm": 8.017034530639648, + "learning_rate": 5.729683291392573e-06, + "loss": 0.522, + "mean_token_accuracy": 0.8396532908082008, + "num_tokens": 75634042.0, + "step": 62890 + }, + { + "entropy": 1.8118005082011224, + "epoch": 0.19498449656258451, + "grad_norm": 7.996087551116943, + "learning_rate": 5.7292278063258015e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8566027864813804, + "num_tokens": 75647509.0, + "step": 62900 + }, + { + "entropy": 1.9176707819104195, + "epoch": 0.1950154956876342, + "grad_norm": 3.844882011413574, + "learning_rate": 5.728772429869045e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.842312179505825, + "num_tokens": 75659954.0, + "step": 62910 + }, + { + "entropy": 1.8584643453359604, + "epoch": 0.1950464948126839, + "grad_norm": 4.170616149902344, + "learning_rate": 5.72831716197915e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8405888646841049, + "num_tokens": 75672753.0, + "step": 62920 + }, + { + "entropy": 1.8920593187212944, + "epoch": 0.1950774939377336, + "grad_norm": 8.466096878051758, + "learning_rate": 5.727862002612982e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8370663553476334, + "num_tokens": 75685124.0, + "step": 62930 + }, + { + "entropy": 1.9039631336927414, + "epoch": 0.1951084930627833, + "grad_norm": 10.970901489257812, + "learning_rate": 5.727406951727435e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.837016536295414, + "num_tokens": 75696871.0, + "step": 62940 + }, + { + "entropy": 1.8986446857452393, + "epoch": 0.195139492187833, + "grad_norm": 8.603690147399902, + "learning_rate": 5.726952009279424e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.845548364520073, + "num_tokens": 75708850.0, + "step": 62950 + }, + { + "entropy": 1.879063467681408, + "epoch": 0.1951704913128827, + "grad_norm": 8.833934783935547, + "learning_rate": 5.726497175225886e-06, + "loss": 0.5206, + "mean_token_accuracy": 0.8337208881974221, + "num_tokens": 75721064.0, + "step": 62960 + }, + { + "entropy": 1.9283981889486312, + "epoch": 0.1952014904379324, + "grad_norm": 4.0578155517578125, + "learning_rate": 5.726042449523786e-06, + "loss": 0.5236, + "mean_token_accuracy": 0.8367056354880333, + "num_tokens": 75732835.0, + "step": 62970 + }, + { + "entropy": 1.9276798009872436, + "epoch": 0.1952324895629821, + "grad_norm": 7.493443489074707, + "learning_rate": 5.725587832130112e-06, + "loss": 0.515, + "mean_token_accuracy": 0.841886368393898, + "num_tokens": 75743432.0, + "step": 62980 + }, + { + "entropy": 1.7954487293958663, + "epoch": 0.19526348868803178, + "grad_norm": 8.081300735473633, + "learning_rate": 5.725133323001873e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8556428179144859, + "num_tokens": 75757107.0, + "step": 62990 + }, + { + "entropy": 1.8096677586436272, + "epoch": 0.19529448781308148, + "grad_norm": 9.376117706298828, + "learning_rate": 5.724678922096108e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8452500954270363, + "num_tokens": 75769351.0, + "step": 63000 + }, + { + "entropy": 1.8731877133250237, + "epoch": 0.19532548693813118, + "grad_norm": 9.466216087341309, + "learning_rate": 5.724224629369872e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8412355482578278, + "num_tokens": 75781771.0, + "step": 63010 + }, + { + "entropy": 1.8507128790020944, + "epoch": 0.19535648606318087, + "grad_norm": 10.853748321533203, + "learning_rate": 5.72377044478025e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8409447997808457, + "num_tokens": 75794706.0, + "step": 63020 + }, + { + "entropy": 1.9449727565050126, + "epoch": 0.19538748518823057, + "grad_norm": 6.4505085945129395, + "learning_rate": 5.723316368284348e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8548792108893395, + "num_tokens": 75805711.0, + "step": 63030 + }, + { + "entropy": 1.8775279074907303, + "epoch": 0.19541848431328027, + "grad_norm": 7.87474250793457, + "learning_rate": 5.722862399839298e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8526403203606605, + "num_tokens": 75817522.0, + "step": 63040 + }, + { + "entropy": 1.856291452050209, + "epoch": 0.19544948343832996, + "grad_norm": 4.191627025604248, + "learning_rate": 5.722408539402254e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8563273102045059, + "num_tokens": 75829497.0, + "step": 63050 + }, + { + "entropy": 1.9053906798362732, + "epoch": 0.19548048256337966, + "grad_norm": 8.759142875671387, + "learning_rate": 5.72195478693039e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.8432946696877479, + "num_tokens": 75840759.0, + "step": 63060 + }, + { + "entropy": 1.907120332121849, + "epoch": 0.19551148168842936, + "grad_norm": 9.412018775939941, + "learning_rate": 5.721501142380913e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8373003304004669, + "num_tokens": 75852503.0, + "step": 63070 + }, + { + "entropy": 2.0056669265031815, + "epoch": 0.19554248081347905, + "grad_norm": 7.475878715515137, + "learning_rate": 5.721047605711047e-06, + "loss": 0.57, + "mean_token_accuracy": 0.8297986865043641, + "num_tokens": 75863521.0, + "step": 63080 + }, + { + "entropy": 1.922423042356968, + "epoch": 0.19557347993852872, + "grad_norm": 4.381282329559326, + "learning_rate": 5.720594176878039e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8460247412323951, + "num_tokens": 75874691.0, + "step": 63090 + }, + { + "entropy": 1.9274744465947151, + "epoch": 0.19560447906357842, + "grad_norm": 10.508957862854004, + "learning_rate": 5.720140855839166e-06, + "loss": 0.5335, + "mean_token_accuracy": 0.8317728966474534, + "num_tokens": 75887760.0, + "step": 63100 + }, + { + "entropy": 1.9512497708201408, + "epoch": 0.19563547818862811, + "grad_norm": 9.120226860046387, + "learning_rate": 5.719687642551722e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8364842370152473, + "num_tokens": 75899765.0, + "step": 63110 + }, + { + "entropy": 1.925400537252426, + "epoch": 0.1956664773136778, + "grad_norm": 5.570336818695068, + "learning_rate": 5.719234536973028e-06, + "loss": 0.5209, + "mean_token_accuracy": 0.8405450001358986, + "num_tokens": 75911921.0, + "step": 63120 + }, + { + "entropy": 1.836472137272358, + "epoch": 0.1956974764387275, + "grad_norm": 2.734543561935425, + "learning_rate": 5.718781539060429e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.851534178853035, + "num_tokens": 75925915.0, + "step": 63130 + }, + { + "entropy": 1.8467722043395043, + "epoch": 0.1957284755637772, + "grad_norm": 8.128966331481934, + "learning_rate": 5.718328648771291e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.851094801723957, + "num_tokens": 75938929.0, + "step": 63140 + }, + { + "entropy": 1.9085650324821473, + "epoch": 0.1957594746888269, + "grad_norm": 8.325636863708496, + "learning_rate": 5.717875866063008e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8383592694997788, + "num_tokens": 75949645.0, + "step": 63150 + }, + { + "entropy": 1.8661453172564506, + "epoch": 0.1957904738138766, + "grad_norm": 4.130495548248291, + "learning_rate": 5.717423190892991e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8449689701199532, + "num_tokens": 75962507.0, + "step": 63160 + }, + { + "entropy": 1.9401205703616142, + "epoch": 0.1958214729389263, + "grad_norm": 10.611771583557129, + "learning_rate": 5.716970623218681e-06, + "loss": 0.5661, + "mean_token_accuracy": 0.8341074407100677, + "num_tokens": 75974430.0, + "step": 63170 + }, + { + "entropy": 1.834306775033474, + "epoch": 0.195852472063976, + "grad_norm": 8.28214168548584, + "learning_rate": 5.716518162997542e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8515095382928848, + "num_tokens": 75987170.0, + "step": 63180 + }, + { + "entropy": 1.8667117178440094, + "epoch": 0.1958834711890257, + "grad_norm": 4.622128963470459, + "learning_rate": 5.716065810187056e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8383040621876716, + "num_tokens": 75999364.0, + "step": 63190 + }, + { + "entropy": 1.9117874279618263, + "epoch": 0.19591447031407538, + "grad_norm": 7.746886730194092, + "learning_rate": 5.715613564744734e-06, + "loss": 0.5304, + "mean_token_accuracy": 0.8401778340339661, + "num_tokens": 76010533.0, + "step": 63200 + }, + { + "entropy": 1.860894750058651, + "epoch": 0.19594546943912508, + "grad_norm": 3.8454928398132324, + "learning_rate": 5.715161426628111e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.847791762650013, + "num_tokens": 76023243.0, + "step": 63210 + }, + { + "entropy": 1.8731217697262763, + "epoch": 0.19597646856417478, + "grad_norm": 4.6532745361328125, + "learning_rate": 5.71470939579474e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8273180708289146, + "num_tokens": 76035128.0, + "step": 63220 + }, + { + "entropy": 1.892588709294796, + "epoch": 0.19600746768922447, + "grad_norm": 10.033417701721191, + "learning_rate": 5.714257472202201e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8375523954629898, + "num_tokens": 76047088.0, + "step": 63230 + }, + { + "entropy": 1.8054717764258386, + "epoch": 0.19603846681427417, + "grad_norm": 4.947404384613037, + "learning_rate": 5.7138056558080975e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8533313110470772, + "num_tokens": 76060229.0, + "step": 63240 + }, + { + "entropy": 1.9267044529318809, + "epoch": 0.19606946593932387, + "grad_norm": 8.534299850463867, + "learning_rate": 5.713353946570057e-06, + "loss": 0.5239, + "mean_token_accuracy": 0.8383896484971046, + "num_tokens": 76072124.0, + "step": 63250 + }, + { + "entropy": 1.9226173907518387, + "epoch": 0.19610046506437356, + "grad_norm": 8.385056495666504, + "learning_rate": 5.712902344445731e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.8362686946988106, + "num_tokens": 76083932.0, + "step": 63260 + }, + { + "entropy": 1.8576195508241653, + "epoch": 0.19613146418942326, + "grad_norm": 9.054718017578125, + "learning_rate": 5.712450849392791e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8375715285539627, + "num_tokens": 76096111.0, + "step": 63270 + }, + { + "entropy": 1.9397896856069565, + "epoch": 0.19616246331447296, + "grad_norm": 10.121234893798828, + "learning_rate": 5.711999461368935e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.834457078576088, + "num_tokens": 76107744.0, + "step": 63280 + }, + { + "entropy": 1.8755070850253106, + "epoch": 0.19619346243952265, + "grad_norm": 9.817523956298828, + "learning_rate": 5.711548180331882e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8419964432716369, + "num_tokens": 76119347.0, + "step": 63290 + }, + { + "entropy": 1.915390780568123, + "epoch": 0.19622446156457235, + "grad_norm": 7.848161697387695, + "learning_rate": 5.711097006239378e-06, + "loss": 0.56, + "mean_token_accuracy": 0.8367161169648171, + "num_tokens": 76130270.0, + "step": 63300 + }, + { + "entropy": 1.9267538145184517, + "epoch": 0.19625546068962205, + "grad_norm": 8.433647155761719, + "learning_rate": 5.710645939049189e-06, + "loss": 0.5233, + "mean_token_accuracy": 0.8365942299365997, + "num_tokens": 76141625.0, + "step": 63310 + }, + { + "entropy": 1.9020789548754693, + "epoch": 0.19628645981467174, + "grad_norm": 7.910606861114502, + "learning_rate": 5.710194978719105e-06, + "loss": 0.499, + "mean_token_accuracy": 0.838070061802864, + "num_tokens": 76153615.0, + "step": 63320 + }, + { + "entropy": 1.8399833947420121, + "epoch": 0.19631745893972144, + "grad_norm": 9.023702621459961, + "learning_rate": 5.70974412520694e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8374461650848388, + "num_tokens": 76166808.0, + "step": 63330 + }, + { + "entropy": 1.8543090134859086, + "epoch": 0.1963484580647711, + "grad_norm": 4.479185581207275, + "learning_rate": 5.709293378470532e-06, + "loss": 0.5471, + "mean_token_accuracy": 0.8332849636673927, + "num_tokens": 76180101.0, + "step": 63340 + }, + { + "entropy": 1.9013725534081458, + "epoch": 0.1963794571898208, + "grad_norm": 9.714118003845215, + "learning_rate": 5.70884273846774e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.8363057851791382, + "num_tokens": 76191945.0, + "step": 63350 + }, + { + "entropy": 1.764454497396946, + "epoch": 0.1964104563148705, + "grad_norm": 9.242158889770508, + "learning_rate": 5.7083922051564485e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8551331639289856, + "num_tokens": 76205433.0, + "step": 63360 + }, + { + "entropy": 1.8029524132609367, + "epoch": 0.1964414554399202, + "grad_norm": 9.247615814208984, + "learning_rate": 5.707941778494567e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8530771881341934, + "num_tokens": 76218558.0, + "step": 63370 + }, + { + "entropy": 1.8929936558008194, + "epoch": 0.1964724545649699, + "grad_norm": 8.096500396728516, + "learning_rate": 5.707491458440021e-06, + "loss": 0.538, + "mean_token_accuracy": 0.8339035794138908, + "num_tokens": 76229934.0, + "step": 63380 + }, + { + "entropy": 1.92166768014431, + "epoch": 0.1965034536900196, + "grad_norm": 5.686055660247803, + "learning_rate": 5.707041244950767e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.8357293531298637, + "num_tokens": 76241582.0, + "step": 63390 + }, + { + "entropy": 1.9319213822484016, + "epoch": 0.1965344528150693, + "grad_norm": 9.794820785522461, + "learning_rate": 5.706591137984782e-06, + "loss": 0.5742, + "mean_token_accuracy": 0.8362261816859246, + "num_tokens": 76253353.0, + "step": 63400 + }, + { + "entropy": 1.8578762322664262, + "epoch": 0.19656545194011898, + "grad_norm": 7.473123550415039, + "learning_rate": 5.706141137500063e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8495231464505195, + "num_tokens": 76266223.0, + "step": 63410 + }, + { + "entropy": 1.897558230161667, + "epoch": 0.19659645106516868, + "grad_norm": 8.56474494934082, + "learning_rate": 5.705691243454636e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8442750841379165, + "num_tokens": 76278256.0, + "step": 63420 + }, + { + "entropy": 1.8700624421238898, + "epoch": 0.19662745019021838, + "grad_norm": 4.685993194580078, + "learning_rate": 5.705241455806546e-06, + "loss": 0.5335, + "mean_token_accuracy": 0.8440624356269837, + "num_tokens": 76290947.0, + "step": 63430 + }, + { + "entropy": 1.9605732291936875, + "epoch": 0.19665844931526807, + "grad_norm": 9.56114387512207, + "learning_rate": 5.704791774513863e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8460546612739563, + "num_tokens": 76302224.0, + "step": 63440 + }, + { + "entropy": 1.9243500038981438, + "epoch": 0.19668944844031777, + "grad_norm": 4.1643195152282715, + "learning_rate": 5.704342199534677e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.8379947647452355, + "num_tokens": 76314456.0, + "step": 63450 + }, + { + "entropy": 1.9549641892313958, + "epoch": 0.19672044756536747, + "grad_norm": 10.69619083404541, + "learning_rate": 5.703892730827107e-06, + "loss": 0.5515, + "mean_token_accuracy": 0.8345342621207237, + "num_tokens": 76325957.0, + "step": 63460 + }, + { + "entropy": 1.9176968559622765, + "epoch": 0.19675144669041716, + "grad_norm": 12.198366165161133, + "learning_rate": 5.703443368349289e-06, + "loss": 0.5492, + "mean_token_accuracy": 0.8376664772629738, + "num_tokens": 76337257.0, + "step": 63470 + }, + { + "entropy": 1.8076652333140373, + "epoch": 0.19678244581546686, + "grad_norm": 9.941756248474121, + "learning_rate": 5.7029941120593864e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8417710468173027, + "num_tokens": 76350461.0, + "step": 63480 + }, + { + "entropy": 1.8378579676151277, + "epoch": 0.19681344494051656, + "grad_norm": 8.623409271240234, + "learning_rate": 5.702544961915585e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8429977357387543, + "num_tokens": 76363284.0, + "step": 63490 + }, + { + "entropy": 1.9412892490625382, + "epoch": 0.19684444406556625, + "grad_norm": 10.340338706970215, + "learning_rate": 5.702095917876089e-06, + "loss": 0.5472, + "mean_token_accuracy": 0.8276393041014671, + "num_tokens": 76374739.0, + "step": 63500 + }, + { + "entropy": 1.9820778489112854, + "epoch": 0.19687544319061595, + "grad_norm": 9.503996849060059, + "learning_rate": 5.701646979899134e-06, + "loss": 0.5663, + "mean_token_accuracy": 0.8296515002846718, + "num_tokens": 76386166.0, + "step": 63510 + }, + { + "entropy": 1.8864339843392373, + "epoch": 0.19690644231566565, + "grad_norm": 8.490914344787598, + "learning_rate": 5.70119814794297e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8448032855987548, + "num_tokens": 76399048.0, + "step": 63520 + }, + { + "entropy": 1.9335085853934288, + "epoch": 0.19693744144071534, + "grad_norm": 11.991941452026367, + "learning_rate": 5.700749421965877e-06, + "loss": 0.5494, + "mean_token_accuracy": 0.8254455342888832, + "num_tokens": 76410513.0, + "step": 63530 + }, + { + "entropy": 1.9954243630170823, + "epoch": 0.19696844056576504, + "grad_norm": 9.641355514526367, + "learning_rate": 5.700300801926151e-06, + "loss": 0.5682, + "mean_token_accuracy": 0.820628535747528, + "num_tokens": 76422104.0, + "step": 63540 + }, + { + "entropy": 1.879958561062813, + "epoch": 0.19699943969081474, + "grad_norm": 4.384090423583984, + "learning_rate": 5.6998522877821185e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.8393704131245613, + "num_tokens": 76434124.0, + "step": 63550 + }, + { + "entropy": 1.8802961066365242, + "epoch": 0.19703043881586443, + "grad_norm": 6.248841762542725, + "learning_rate": 5.699403879492125e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.8394171461462975, + "num_tokens": 76446867.0, + "step": 63560 + }, + { + "entropy": 1.9431464537978171, + "epoch": 0.19706143794091413, + "grad_norm": 9.1453857421875, + "learning_rate": 5.6989555770145386e-06, + "loss": 0.5201, + "mean_token_accuracy": 0.8342863008379936, + "num_tokens": 76458179.0, + "step": 63570 + }, + { + "entropy": 1.8758263066411018, + "epoch": 0.19709243706596383, + "grad_norm": 8.520499229431152, + "learning_rate": 5.69850738030775e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8425302848219871, + "num_tokens": 76469611.0, + "step": 63580 + }, + { + "entropy": 1.9540351748466491, + "epoch": 0.1971234361910135, + "grad_norm": 8.32021713256836, + "learning_rate": 5.698059289330175e-06, + "loss": 0.5643, + "mean_token_accuracy": 0.8359291762113571, + "num_tokens": 76481274.0, + "step": 63590 + }, + { + "entropy": 1.8006914272904395, + "epoch": 0.1971544353160632, + "grad_norm": 4.276872634887695, + "learning_rate": 5.69761130404025e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8568015277385712, + "num_tokens": 76494249.0, + "step": 63600 + }, + { + "entropy": 1.9028367415070533, + "epoch": 0.1971854344411129, + "grad_norm": 9.324864387512207, + "learning_rate": 5.6971634243964366e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.8430081695318222, + "num_tokens": 76506213.0, + "step": 63610 + }, + { + "entropy": 1.8858824104070664, + "epoch": 0.19721643356616259, + "grad_norm": 10.004111289978027, + "learning_rate": 5.696715650357217e-06, + "loss": 0.5222, + "mean_token_accuracy": 0.8416607335209847, + "num_tokens": 76519093.0, + "step": 63620 + }, + { + "entropy": 1.8651671513915062, + "epoch": 0.19724743269121228, + "grad_norm": 8.351085662841797, + "learning_rate": 5.696267981881099e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8429739147424697, + "num_tokens": 76531349.0, + "step": 63630 + }, + { + "entropy": 1.9040121346712113, + "epoch": 0.19727843181626198, + "grad_norm": 8.7483549118042, + "learning_rate": 5.69582041892661e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8458465442061425, + "num_tokens": 76542921.0, + "step": 63640 + }, + { + "entropy": 1.8771297112107277, + "epoch": 0.19730943094131168, + "grad_norm": 7.3089141845703125, + "learning_rate": 5.695372961452301e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8363812834024429, + "num_tokens": 76555655.0, + "step": 63650 + }, + { + "entropy": 1.8038627937436105, + "epoch": 0.19734043006636137, + "grad_norm": 7.573980808258057, + "learning_rate": 5.694925609416748e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8495232254266739, + "num_tokens": 76569165.0, + "step": 63660 + }, + { + "entropy": 1.8558023154735566, + "epoch": 0.19737142919141107, + "grad_norm": 8.455429077148438, + "learning_rate": 5.694478362778547e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8412219509482384, + "num_tokens": 76581380.0, + "step": 63670 + }, + { + "entropy": 1.8561557665467263, + "epoch": 0.19740242831646077, + "grad_norm": 3.842336416244507, + "learning_rate": 5.69403122149632e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8478426292538643, + "num_tokens": 76594529.0, + "step": 63680 + }, + { + "entropy": 1.85400443226099, + "epoch": 0.19743342744151046, + "grad_norm": 4.739361763000488, + "learning_rate": 5.693584185528707e-06, + "loss": 0.5293, + "mean_token_accuracy": 0.8424937948584557, + "num_tokens": 76607350.0, + "step": 63690 + }, + { + "entropy": 1.8113030225038529, + "epoch": 0.19746442656656016, + "grad_norm": 8.30618953704834, + "learning_rate": 5.693137254834375e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8603976845741272, + "num_tokens": 76620524.0, + "step": 63700 + }, + { + "entropy": 1.853132924437523, + "epoch": 0.19749542569160985, + "grad_norm": 9.655352592468262, + "learning_rate": 5.692690429372012e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8338514223694802, + "num_tokens": 76633444.0, + "step": 63710 + }, + { + "entropy": 1.898238630592823, + "epoch": 0.19752642481665955, + "grad_norm": 4.058250904083252, + "learning_rate": 5.692243709100329e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8403716444969177, + "num_tokens": 76645664.0, + "step": 63720 + }, + { + "entropy": 1.882179079949856, + "epoch": 0.19755742394170925, + "grad_norm": 5.784068584442139, + "learning_rate": 5.691797093978061e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8447945788502693, + "num_tokens": 76657600.0, + "step": 63730 + }, + { + "entropy": 1.8364885538816451, + "epoch": 0.19758842306675894, + "grad_norm": 5.507530689239502, + "learning_rate": 5.69135058396396e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8419953018426896, + "num_tokens": 76671201.0, + "step": 63740 + }, + { + "entropy": 1.8077232390642166, + "epoch": 0.19761942219180864, + "grad_norm": 8.423868179321289, + "learning_rate": 5.690904179016809e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8521018475294113, + "num_tokens": 76684611.0, + "step": 63750 + }, + { + "entropy": 1.8499022141098975, + "epoch": 0.19765042131685834, + "grad_norm": 5.460327625274658, + "learning_rate": 5.690457879095407e-06, + "loss": 0.5245, + "mean_token_accuracy": 0.8433385342359543, + "num_tokens": 76697424.0, + "step": 63760 + }, + { + "entropy": 1.8565186977386474, + "epoch": 0.19768142044190803, + "grad_norm": 10.535595893859863, + "learning_rate": 5.690011684158579e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8477386757731438, + "num_tokens": 76710657.0, + "step": 63770 + }, + { + "entropy": 1.9264386996626854, + "epoch": 0.19771241956695773, + "grad_norm": 8.825230598449707, + "learning_rate": 5.689565594165174e-06, + "loss": 0.5299, + "mean_token_accuracy": 0.8357585594058037, + "num_tokens": 76721904.0, + "step": 63780 + }, + { + "entropy": 1.9114050999283791, + "epoch": 0.19774341869200743, + "grad_norm": 10.269311904907227, + "learning_rate": 5.689119609074057e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8319689840078354, + "num_tokens": 76734403.0, + "step": 63790 + }, + { + "entropy": 1.8821352809667586, + "epoch": 0.19777441781705712, + "grad_norm": 8.149957656860352, + "learning_rate": 5.688673728844121e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8380433395504951, + "num_tokens": 76745913.0, + "step": 63800 + }, + { + "entropy": 1.9270009264349937, + "epoch": 0.19780541694210682, + "grad_norm": 8.53276252746582, + "learning_rate": 5.688227953434282e-06, + "loss": 0.5314, + "mean_token_accuracy": 0.8341132417321205, + "num_tokens": 76758099.0, + "step": 63810 + }, + { + "entropy": 1.8181735575199127, + "epoch": 0.19783641606715652, + "grad_norm": 7.680285930633545, + "learning_rate": 5.687782282803477e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8435733437538147, + "num_tokens": 76771266.0, + "step": 63820 + }, + { + "entropy": 1.8427308067679404, + "epoch": 0.1978674151922062, + "grad_norm": 3.6180124282836914, + "learning_rate": 5.687336716910663e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8437779262661934, + "num_tokens": 76783158.0, + "step": 63830 + }, + { + "entropy": 1.9142847254872322, + "epoch": 0.19789841431725588, + "grad_norm": 4.747416019439697, + "learning_rate": 5.6868912557148245e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.8309885829687118, + "num_tokens": 76795826.0, + "step": 63840 + }, + { + "entropy": 1.8509532332420349, + "epoch": 0.19792941344230558, + "grad_norm": 8.225378036499023, + "learning_rate": 5.686445899174965e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8449263677001, + "num_tokens": 76808003.0, + "step": 63850 + }, + { + "entropy": 1.9634561657905578, + "epoch": 0.19796041256735528, + "grad_norm": 8.663080215454102, + "learning_rate": 5.686000647250109e-06, + "loss": 0.583, + "mean_token_accuracy": 0.8314743667840958, + "num_tokens": 76819491.0, + "step": 63860 + }, + { + "entropy": 1.9227616682648658, + "epoch": 0.19799141169240497, + "grad_norm": 7.894753932952881, + "learning_rate": 5.685555499899311e-06, + "loss": 0.5717, + "mean_token_accuracy": 0.831498672068119, + "num_tokens": 76831514.0, + "step": 63870 + }, + { + "entropy": 1.969376365840435, + "epoch": 0.19802241081745467, + "grad_norm": 9.801876068115234, + "learning_rate": 5.685110457081639e-06, + "loss": 0.5539, + "mean_token_accuracy": 0.8390384584665298, + "num_tokens": 76842781.0, + "step": 63880 + }, + { + "entropy": 1.9508319780230523, + "epoch": 0.19805340994250437, + "grad_norm": 11.13720989227295, + "learning_rate": 5.6846655187561874e-06, + "loss": 0.5257, + "mean_token_accuracy": 0.835049818456173, + "num_tokens": 76854513.0, + "step": 63890 + }, + { + "entropy": 1.9810258507728578, + "epoch": 0.19808440906755406, + "grad_norm": 9.226847648620605, + "learning_rate": 5.684220684882074e-06, + "loss": 0.5617, + "mean_token_accuracy": 0.8322012394666671, + "num_tokens": 76865518.0, + "step": 63900 + }, + { + "entropy": 1.9172653019428254, + "epoch": 0.19811540819260376, + "grad_norm": 4.333598613739014, + "learning_rate": 5.683775955418437e-06, + "loss": 0.5478, + "mean_token_accuracy": 0.8240251198410988, + "num_tokens": 76877792.0, + "step": 63910 + }, + { + "entropy": 1.843831568211317, + "epoch": 0.19814640731765346, + "grad_norm": 9.46835708618164, + "learning_rate": 5.683331330324438e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8351884678006172, + "num_tokens": 76890123.0, + "step": 63920 + }, + { + "entropy": 1.7964137971401215, + "epoch": 0.19817740644270315, + "grad_norm": 8.710184097290039, + "learning_rate": 5.682886809559261e-06, + "loss": 0.5308, + "mean_token_accuracy": 0.8368506744503975, + "num_tokens": 76902945.0, + "step": 63930 + }, + { + "entropy": 1.9574948653578759, + "epoch": 0.19820840556775285, + "grad_norm": 8.956100463867188, + "learning_rate": 5.682442393082113e-06, + "loss": 0.528, + "mean_token_accuracy": 0.8361043930053711, + "num_tokens": 76914195.0, + "step": 63940 + }, + { + "entropy": 1.8992867067456245, + "epoch": 0.19823940469280255, + "grad_norm": 5.019556045532227, + "learning_rate": 5.681998080852219e-06, + "loss": 0.5534, + "mean_token_accuracy": 0.8368826553225517, + "num_tokens": 76925863.0, + "step": 63950 + }, + { + "entropy": 1.9133247062563896, + "epoch": 0.19827040381785224, + "grad_norm": 4.297521591186523, + "learning_rate": 5.681553872828835e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8392513215541839, + "num_tokens": 76937635.0, + "step": 63960 + }, + { + "entropy": 1.9356270998716354, + "epoch": 0.19830140294290194, + "grad_norm": 7.358262062072754, + "learning_rate": 5.681109768971228e-06, + "loss": 0.5431, + "mean_token_accuracy": 0.8359444841742516, + "num_tokens": 76948918.0, + "step": 63970 + }, + { + "entropy": 1.807618674635887, + "epoch": 0.19833240206795164, + "grad_norm": 8.462199211120605, + "learning_rate": 5.680665769238698e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.844212980568409, + "num_tokens": 76962029.0, + "step": 63980 + }, + { + "entropy": 1.8870945870876312, + "epoch": 0.19836340119300133, + "grad_norm": 9.050925254821777, + "learning_rate": 5.68022187359056e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.836103081703186, + "num_tokens": 76973718.0, + "step": 63990 + }, + { + "entropy": 1.8385724887251853, + "epoch": 0.19839440031805103, + "grad_norm": 9.02990436553955, + "learning_rate": 5.6797780819861545e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8344948351383209, + "num_tokens": 76986094.0, + "step": 64000 + }, + { + "entropy": 1.8950312197208405, + "epoch": 0.19842539944310073, + "grad_norm": 17.118181228637695, + "learning_rate": 5.679334394384845e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.846867561340332, + "num_tokens": 76998477.0, + "step": 64010 + }, + { + "entropy": 1.9193229526281357, + "epoch": 0.19845639856815042, + "grad_norm": 10.047476768493652, + "learning_rate": 5.6788908107460135e-06, + "loss": 0.5418, + "mean_token_accuracy": 0.8332524642348289, + "num_tokens": 77009850.0, + "step": 64020 + }, + { + "entropy": 1.8773248583078384, + "epoch": 0.19848739769320012, + "grad_norm": 8.258344650268555, + "learning_rate": 5.678447331029068e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8483355283737183, + "num_tokens": 77022417.0, + "step": 64030 + }, + { + "entropy": 1.9686390221118928, + "epoch": 0.19851839681824981, + "grad_norm": 9.272161483764648, + "learning_rate": 5.678003955193437e-06, + "loss": 0.5287, + "mean_token_accuracy": 0.8388251766562462, + "num_tokens": 77033370.0, + "step": 64040 + }, + { + "entropy": 1.8505666583776474, + "epoch": 0.1985493959432995, + "grad_norm": 8.902565956115723, + "learning_rate": 5.677560683198569e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.840177808701992, + "num_tokens": 77046098.0, + "step": 64050 + }, + { + "entropy": 1.9229148238897324, + "epoch": 0.1985803950683492, + "grad_norm": 7.94978141784668, + "learning_rate": 5.677117515003942e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.8431189730763435, + "num_tokens": 77058145.0, + "step": 64060 + }, + { + "entropy": 1.8256406679749488, + "epoch": 0.1986113941933989, + "grad_norm": 5.319147109985352, + "learning_rate": 5.676674450569045e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8440526649355888, + "num_tokens": 77071645.0, + "step": 64070 + }, + { + "entropy": 1.8413712576031684, + "epoch": 0.19864239331844857, + "grad_norm": 7.502344131469727, + "learning_rate": 5.6762314898534e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8615258172154426, + "num_tokens": 77083570.0, + "step": 64080 + }, + { + "entropy": 1.8211113825440406, + "epoch": 0.19867339244349827, + "grad_norm": 4.350229740142822, + "learning_rate": 5.675788632816544e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8473315939307213, + "num_tokens": 77096128.0, + "step": 64090 + }, + { + "entropy": 1.7832973524928093, + "epoch": 0.19870439156854797, + "grad_norm": 3.9118053913116455, + "learning_rate": 5.67534587941804e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8490145295858383, + "num_tokens": 77109619.0, + "step": 64100 + }, + { + "entropy": 1.8864678606390952, + "epoch": 0.19873539069359766, + "grad_norm": 4.381364345550537, + "learning_rate": 5.674903229617469e-06, + "loss": 0.5189, + "mean_token_accuracy": 0.8414922654628754, + "num_tokens": 77121495.0, + "step": 64110 + }, + { + "entropy": 1.8282597333192825, + "epoch": 0.19876638981864736, + "grad_norm": 8.134101867675781, + "learning_rate": 5.67446068337444e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8438005477190018, + "num_tokens": 77133762.0, + "step": 64120 + }, + { + "entropy": 1.891626113653183, + "epoch": 0.19879738894369706, + "grad_norm": 8.915387153625488, + "learning_rate": 5.674018240648578e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8532876297831535, + "num_tokens": 77145322.0, + "step": 64130 + }, + { + "entropy": 1.8996186777949333, + "epoch": 0.19882838806874675, + "grad_norm": 9.293252944946289, + "learning_rate": 5.673575901399533e-06, + "loss": 0.5231, + "mean_token_accuracy": 0.8366784289479255, + "num_tokens": 77157517.0, + "step": 64140 + }, + { + "entropy": 1.8965847790241241, + "epoch": 0.19885938719379645, + "grad_norm": 4.745813846588135, + "learning_rate": 5.673133665586977e-06, + "loss": 0.52, + "mean_token_accuracy": 0.8319694399833679, + "num_tokens": 77169583.0, + "step": 64150 + }, + { + "entropy": 1.8250271826982498, + "epoch": 0.19889038631884615, + "grad_norm": 9.300322532653809, + "learning_rate": 5.672691533170605e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8459231272339821, + "num_tokens": 77182323.0, + "step": 64160 + }, + { + "entropy": 1.7872568264603614, + "epoch": 0.19892138544389584, + "grad_norm": 9.027938842773438, + "learning_rate": 5.672249504110131e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8547685459256172, + "num_tokens": 77196100.0, + "step": 64170 + }, + { + "entropy": 1.814260269701481, + "epoch": 0.19895238456894554, + "grad_norm": 8.810227394104004, + "learning_rate": 5.671807578365294e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.849295774102211, + "num_tokens": 77208500.0, + "step": 64180 + }, + { + "entropy": 1.8780076310038567, + "epoch": 0.19898338369399524, + "grad_norm": 9.624731063842773, + "learning_rate": 5.671365755895851e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8416895166039466, + "num_tokens": 77220028.0, + "step": 64190 + }, + { + "entropy": 1.9330232828855514, + "epoch": 0.19901438281904493, + "grad_norm": 9.217514038085938, + "learning_rate": 5.670924036661586e-06, + "loss": 0.5643, + "mean_token_accuracy": 0.8271647378802299, + "num_tokens": 77231617.0, + "step": 64200 + }, + { + "entropy": 1.9576894700527192, + "epoch": 0.19904538194409463, + "grad_norm": 7.6577534675598145, + "learning_rate": 5.670482420622302e-06, + "loss": 0.5551, + "mean_token_accuracy": 0.8377937138080597, + "num_tokens": 77243109.0, + "step": 64210 + }, + { + "entropy": 1.9337700963020326, + "epoch": 0.19907638106914433, + "grad_norm": 9.766473770141602, + "learning_rate": 5.6700409077378235e-06, + "loss": 0.5428, + "mean_token_accuracy": 0.8324208498001099, + "num_tokens": 77253457.0, + "step": 64220 + }, + { + "entropy": 1.888157394528389, + "epoch": 0.19910738019419402, + "grad_norm": 8.229586601257324, + "learning_rate": 5.6695994979679995e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8483126178383827, + "num_tokens": 77265767.0, + "step": 64230 + }, + { + "entropy": 1.795005053281784, + "epoch": 0.19913837931924372, + "grad_norm": 8.953361511230469, + "learning_rate": 5.669158191272697e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8460436001420021, + "num_tokens": 77278828.0, + "step": 64240 + }, + { + "entropy": 1.8841128557920457, + "epoch": 0.19916937844429342, + "grad_norm": 9.218796730041504, + "learning_rate": 5.668716987611807e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8368963778018952, + "num_tokens": 77291261.0, + "step": 64250 + }, + { + "entropy": 1.816078770160675, + "epoch": 0.1992003775693431, + "grad_norm": 7.860946178436279, + "learning_rate": 5.668275886945246e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8497811511158944, + "num_tokens": 77303890.0, + "step": 64260 + }, + { + "entropy": 1.9705949038267137, + "epoch": 0.1992313766943928, + "grad_norm": 7.762573719024658, + "learning_rate": 5.667834889232945e-06, + "loss": 0.5301, + "mean_token_accuracy": 0.8459170028567314, + "num_tokens": 77315352.0, + "step": 64270 + }, + { + "entropy": 1.8340299040079118, + "epoch": 0.1992623758194425, + "grad_norm": 5.21130895614624, + "learning_rate": 5.667393994434861e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8440465152263641, + "num_tokens": 77328106.0, + "step": 64280 + }, + { + "entropy": 1.872594805061817, + "epoch": 0.1992933749444922, + "grad_norm": 7.447837829589844, + "learning_rate": 5.666953202510973e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8528081148862838, + "num_tokens": 77340307.0, + "step": 64290 + }, + { + "entropy": 1.9547280743718147, + "epoch": 0.1993243740695419, + "grad_norm": 8.335238456726074, + "learning_rate": 5.666512513421281e-06, + "loss": 0.5324, + "mean_token_accuracy": 0.8396565094590187, + "num_tokens": 77351792.0, + "step": 64300 + }, + { + "entropy": 1.8686283484101296, + "epoch": 0.1993553731945916, + "grad_norm": 10.21964168548584, + "learning_rate": 5.666071927125806e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8395275220274925, + "num_tokens": 77364455.0, + "step": 64310 + }, + { + "entropy": 1.9559142783284187, + "epoch": 0.1993863723196413, + "grad_norm": 6.589775085449219, + "learning_rate": 5.665631443584593e-06, + "loss": 0.5251, + "mean_token_accuracy": 0.8342868536710739, + "num_tokens": 77376312.0, + "step": 64320 + }, + { + "entropy": 1.8677933678030967, + "epoch": 0.19941737144469096, + "grad_norm": 4.30719518661499, + "learning_rate": 5.665191062757705e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8398402318358421, + "num_tokens": 77388805.0, + "step": 64330 + }, + { + "entropy": 2.0073965549468995, + "epoch": 0.19944837056974066, + "grad_norm": 8.787602424621582, + "learning_rate": 5.664750784605233e-06, + "loss": 0.5647, + "mean_token_accuracy": 0.8316746145486832, + "num_tokens": 77399105.0, + "step": 64340 + }, + { + "entropy": 1.9256497889757156, + "epoch": 0.19947936969479035, + "grad_norm": 4.221436023712158, + "learning_rate": 5.664310609087283e-06, + "loss": 0.5362, + "mean_token_accuracy": 0.8293071076273918, + "num_tokens": 77411168.0, + "step": 64350 + }, + { + "entropy": 1.8341520741581916, + "epoch": 0.19951036881984005, + "grad_norm": 7.341567516326904, + "learning_rate": 5.663870536163986e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.835312868654728, + "num_tokens": 77424558.0, + "step": 64360 + }, + { + "entropy": 1.9391889229416848, + "epoch": 0.19954136794488975, + "grad_norm": 10.012337684631348, + "learning_rate": 5.663430565795495e-06, + "loss": 0.5365, + "mean_token_accuracy": 0.8299921050667762, + "num_tokens": 77436576.0, + "step": 64370 + }, + { + "entropy": 1.9387200742959976, + "epoch": 0.19957236706993944, + "grad_norm": 8.041949272155762, + "learning_rate": 5.6629906979419805e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.8453411236405373, + "num_tokens": 77448293.0, + "step": 64380 + }, + { + "entropy": 1.845164766907692, + "epoch": 0.19960336619498914, + "grad_norm": 9.091232299804688, + "learning_rate": 5.662550932563643e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8558371141552925, + "num_tokens": 77461006.0, + "step": 64390 + }, + { + "entropy": 1.9543328523635863, + "epoch": 0.19963436532003884, + "grad_norm": 8.760300636291504, + "learning_rate": 5.662111269620696e-06, + "loss": 0.546, + "mean_token_accuracy": 0.8322904482483864, + "num_tokens": 77472575.0, + "step": 64400 + }, + { + "entropy": 1.8849376797676087, + "epoch": 0.19966536444508853, + "grad_norm": 8.26484489440918, + "learning_rate": 5.661671709073379e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8436549201607704, + "num_tokens": 77484546.0, + "step": 64410 + }, + { + "entropy": 1.9597482338547707, + "epoch": 0.19969636357013823, + "grad_norm": 7.358891487121582, + "learning_rate": 5.661232250881952e-06, + "loss": 0.5204, + "mean_token_accuracy": 0.8410298019647598, + "num_tokens": 77496212.0, + "step": 64420 + }, + { + "entropy": 1.9519696533679962, + "epoch": 0.19972736269518793, + "grad_norm": 8.28138542175293, + "learning_rate": 5.660792895006698e-06, + "loss": 0.527, + "mean_token_accuracy": 0.8325450897216797, + "num_tokens": 77507649.0, + "step": 64430 + }, + { + "entropy": 1.9554157480597496, + "epoch": 0.19975836182023762, + "grad_norm": 8.739425659179688, + "learning_rate": 5.660353641407921e-06, + "loss": 0.5152, + "mean_token_accuracy": 0.8404718577861786, + "num_tokens": 77518910.0, + "step": 64440 + }, + { + "entropy": 1.9898914635181426, + "epoch": 0.19978936094528732, + "grad_norm": 11.325966835021973, + "learning_rate": 5.659914490045944e-06, + "loss": 0.5837, + "mean_token_accuracy": 0.8286274582147598, + "num_tokens": 77529561.0, + "step": 64450 + }, + { + "entropy": 1.8334274768829346, + "epoch": 0.19982036007033702, + "grad_norm": 7.993432998657227, + "learning_rate": 5.659475440881115e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8571745663881302, + "num_tokens": 77542142.0, + "step": 64460 + }, + { + "entropy": 1.864787982404232, + "epoch": 0.1998513591953867, + "grad_norm": 8.196157455444336, + "learning_rate": 5.659036493873802e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8399261072278023, + "num_tokens": 77554999.0, + "step": 64470 + }, + { + "entropy": 1.9031756028532982, + "epoch": 0.1998823583204364, + "grad_norm": 7.0196404457092285, + "learning_rate": 5.658597648984394e-06, + "loss": 0.5216, + "mean_token_accuracy": 0.8386317491531372, + "num_tokens": 77567455.0, + "step": 64480 + }, + { + "entropy": 1.8681576654314995, + "epoch": 0.1999133574454861, + "grad_norm": 8.093351364135742, + "learning_rate": 5.658158906173302e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8515056669712067, + "num_tokens": 77580142.0, + "step": 64490 + }, + { + "entropy": 1.9180188179016113, + "epoch": 0.1999443565705358, + "grad_norm": 9.29157543182373, + "learning_rate": 5.657720265400961e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.8398342624306678, + "num_tokens": 77591543.0, + "step": 64500 + }, + { + "entropy": 1.7902460798621178, + "epoch": 0.1999753556955855, + "grad_norm": 3.0435149669647217, + "learning_rate": 5.657281726627822e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8577824845910073, + "num_tokens": 77604945.0, + "step": 64510 + }, + { + "entropy": 1.9697556897997857, + "epoch": 0.2000063548206352, + "grad_norm": 8.26711654663086, + "learning_rate": 5.656843289814361e-06, + "loss": 0.6193, + "mean_token_accuracy": 0.8271805793046951, + "num_tokens": 77616932.0, + "step": 64520 + }, + { + "entropy": 1.90864490121603, + "epoch": 0.2000373539456849, + "grad_norm": 9.074986457824707, + "learning_rate": 5.656404954921076e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8418331027030945, + "num_tokens": 77629419.0, + "step": 64530 + }, + { + "entropy": 1.9975577175617218, + "epoch": 0.2000683530707346, + "grad_norm": 9.11250114440918, + "learning_rate": 5.655966721908486e-06, + "loss": 0.5851, + "mean_token_accuracy": 0.8237589776515961, + "num_tokens": 77640305.0, + "step": 64540 + }, + { + "entropy": 1.9538366571068764, + "epoch": 0.20009935219578429, + "grad_norm": 8.259376525878906, + "learning_rate": 5.65552859073713e-06, + "loss": 0.5225, + "mean_token_accuracy": 0.8349790692329406, + "num_tokens": 77651850.0, + "step": 64550 + }, + { + "entropy": 1.8756514206528663, + "epoch": 0.20013035132083398, + "grad_norm": 9.364764213562012, + "learning_rate": 5.655090561367568e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8493126779794693, + "num_tokens": 77664419.0, + "step": 64560 + }, + { + "entropy": 1.908227115869522, + "epoch": 0.20016135044588365, + "grad_norm": 8.66217041015625, + "learning_rate": 5.654652633760384e-06, + "loss": 0.5649, + "mean_token_accuracy": 0.8257618814706802, + "num_tokens": 77676594.0, + "step": 64570 + }, + { + "entropy": 1.9556605026125908, + "epoch": 0.20019234957093335, + "grad_norm": 3.5400590896606445, + "learning_rate": 5.654214807876182e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8475314304232597, + "num_tokens": 77687943.0, + "step": 64580 + }, + { + "entropy": 1.883988819271326, + "epoch": 0.20022334869598304, + "grad_norm": 9.24106216430664, + "learning_rate": 5.653777083675587e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8436937823891639, + "num_tokens": 77700896.0, + "step": 64590 + }, + { + "entropy": 1.857344676554203, + "epoch": 0.20025434782103274, + "grad_norm": 9.264991760253906, + "learning_rate": 5.653339461119245e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8411024749279022, + "num_tokens": 77713541.0, + "step": 64600 + }, + { + "entropy": 1.9974335208535194, + "epoch": 0.20028534694608244, + "grad_norm": 9.765063285827637, + "learning_rate": 5.6529019401678256e-06, + "loss": 0.5565, + "mean_token_accuracy": 0.8282542005181313, + "num_tokens": 77724948.0, + "step": 64610 + }, + { + "entropy": 1.9897297486662864, + "epoch": 0.20031634607113213, + "grad_norm": 10.479876518249512, + "learning_rate": 5.652464520782016e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.8397626534104348, + "num_tokens": 77736336.0, + "step": 64620 + }, + { + "entropy": 1.8929444566369056, + "epoch": 0.20034734519618183, + "grad_norm": 9.795084953308105, + "learning_rate": 5.652027202922528e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8592626452445984, + "num_tokens": 77748048.0, + "step": 64630 + }, + { + "entropy": 1.814023308455944, + "epoch": 0.20037834432123153, + "grad_norm": 7.460301399230957, + "learning_rate": 5.651589986550092e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8527435094118119, + "num_tokens": 77761289.0, + "step": 64640 + }, + { + "entropy": 1.8586131647229194, + "epoch": 0.20040934344628122, + "grad_norm": 8.590618133544922, + "learning_rate": 5.6511528716254636e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8501928374171257, + "num_tokens": 77773602.0, + "step": 64650 + }, + { + "entropy": 1.8472480833530427, + "epoch": 0.20044034257133092, + "grad_norm": 3.6471304893493652, + "learning_rate": 5.650715858109416e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8560655087232589, + "num_tokens": 77785559.0, + "step": 64660 + }, + { + "entropy": 2.008794406056404, + "epoch": 0.20047134169638062, + "grad_norm": 10.51722240447998, + "learning_rate": 5.650278945962744e-06, + "loss": 0.5934, + "mean_token_accuracy": 0.8334228664636611, + "num_tokens": 77796525.0, + "step": 64670 + }, + { + "entropy": 1.9870349198579789, + "epoch": 0.2005023408214303, + "grad_norm": 8.583792686462402, + "learning_rate": 5.649842135146264e-06, + "loss": 0.6073, + "mean_token_accuracy": 0.8145452126860618, + "num_tokens": 77807477.0, + "step": 64680 + }, + { + "entropy": 1.8656486958265304, + "epoch": 0.20053333994648, + "grad_norm": 8.072990417480469, + "learning_rate": 5.649405425620815e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8582053184509277, + "num_tokens": 77819825.0, + "step": 64690 + }, + { + "entropy": 1.8668094590306281, + "epoch": 0.2005643390715297, + "grad_norm": 9.32240104675293, + "learning_rate": 5.648968817347257e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8417691692709923, + "num_tokens": 77832238.0, + "step": 64700 + }, + { + "entropy": 1.9135265216231345, + "epoch": 0.2005953381965794, + "grad_norm": 8.36419677734375, + "learning_rate": 5.648532310286469e-06, + "loss": 0.507, + "mean_token_accuracy": 0.8405087172985077, + "num_tokens": 77844237.0, + "step": 64710 + }, + { + "entropy": 2.0349139839410784, + "epoch": 0.2006263373216291, + "grad_norm": 8.855172157287598, + "learning_rate": 5.648095904399352e-06, + "loss": 0.6111, + "mean_token_accuracy": 0.8124338254332543, + "num_tokens": 77854626.0, + "step": 64720 + }, + { + "entropy": 1.838395781815052, + "epoch": 0.2006573364466788, + "grad_norm": 5.492077350616455, + "learning_rate": 5.64765959964683e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8443232774734497, + "num_tokens": 77867844.0, + "step": 64730 + }, + { + "entropy": 1.9272115871310234, + "epoch": 0.2006883355717285, + "grad_norm": 8.975436210632324, + "learning_rate": 5.647223395989846e-06, + "loss": 0.5197, + "mean_token_accuracy": 0.8435587778687477, + "num_tokens": 77880414.0, + "step": 64740 + }, + { + "entropy": 1.8668215185403825, + "epoch": 0.2007193346967782, + "grad_norm": 8.991979598999023, + "learning_rate": 5.646787293389365e-06, + "loss": 0.5237, + "mean_token_accuracy": 0.8380271717905998, + "num_tokens": 77893230.0, + "step": 64750 + }, + { + "entropy": 1.9775980859994888, + "epoch": 0.2007503338218279, + "grad_norm": 9.368738174438477, + "learning_rate": 5.646351291806372e-06, + "loss": 0.5447, + "mean_token_accuracy": 0.8423718631267547, + "num_tokens": 77904698.0, + "step": 64760 + }, + { + "entropy": 1.880776160955429, + "epoch": 0.20078133294687758, + "grad_norm": 4.232335090637207, + "learning_rate": 5.645915391201876e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8433832973241806, + "num_tokens": 77918354.0, + "step": 64770 + }, + { + "entropy": 1.9753024756908417, + "epoch": 0.20081233207192728, + "grad_norm": 9.014158248901367, + "learning_rate": 5.645479591536904e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8433095306158066, + "num_tokens": 77929796.0, + "step": 64780 + }, + { + "entropy": 1.9146323308348656, + "epoch": 0.20084333119697698, + "grad_norm": 8.354053497314453, + "learning_rate": 5.645043892772506e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8488031610846519, + "num_tokens": 77941791.0, + "step": 64790 + }, + { + "entropy": 1.9045788452029229, + "epoch": 0.20087433032202667, + "grad_norm": 9.207497596740723, + "learning_rate": 5.64460829486975e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8390837252140045, + "num_tokens": 77953907.0, + "step": 64800 + }, + { + "entropy": 1.9096369460225104, + "epoch": 0.20090532944707637, + "grad_norm": 3.9759466648101807, + "learning_rate": 5.64417279778973e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.834852209687233, + "num_tokens": 77966476.0, + "step": 64810 + }, + { + "entropy": 1.971180261671543, + "epoch": 0.20093632857212604, + "grad_norm": 7.077384948730469, + "learning_rate": 5.643737401493556e-06, + "loss": 0.5186, + "mean_token_accuracy": 0.8305558815598488, + "num_tokens": 77978231.0, + "step": 64820 + }, + { + "entropy": 1.910263580083847, + "epoch": 0.20096732769717573, + "grad_norm": 5.62261438369751, + "learning_rate": 5.643302105942363e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8370845153927803, + "num_tokens": 77991195.0, + "step": 64830 + }, + { + "entropy": 1.8971102446317674, + "epoch": 0.20099832682222543, + "grad_norm": 14.074230194091797, + "learning_rate": 5.6428669110973035e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8393953680992127, + "num_tokens": 78003545.0, + "step": 64840 + }, + { + "entropy": 1.8818523928523063, + "epoch": 0.20102932594727513, + "grad_norm": 8.744156837463379, + "learning_rate": 5.642431816919555e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8389109507203102, + "num_tokens": 78015484.0, + "step": 64850 + }, + { + "entropy": 1.8921169385313987, + "epoch": 0.20106032507232482, + "grad_norm": 6.810762882232666, + "learning_rate": 5.6419968233703105e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8551272869110107, + "num_tokens": 78027522.0, + "step": 64860 + }, + { + "entropy": 2.0100679606199265, + "epoch": 0.20109132419737452, + "grad_norm": 7.5746378898620605, + "learning_rate": 5.641561930410791e-06, + "loss": 0.5471, + "mean_token_accuracy": 0.8342107847332955, + "num_tokens": 78038669.0, + "step": 64870 + }, + { + "entropy": 1.886759166419506, + "epoch": 0.20112232332242422, + "grad_norm": 8.69468879699707, + "learning_rate": 5.64112713800223e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8427166402339935, + "num_tokens": 78051321.0, + "step": 64880 + }, + { + "entropy": 1.980288290977478, + "epoch": 0.20115332244747391, + "grad_norm": 9.738602638244629, + "learning_rate": 5.64069244610589e-06, + "loss": 0.5282, + "mean_token_accuracy": 0.8373025566339493, + "num_tokens": 78062300.0, + "step": 64890 + }, + { + "entropy": 1.9037230789661408, + "epoch": 0.2011843215725236, + "grad_norm": 7.894903659820557, + "learning_rate": 5.640257854683049e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8482183083891869, + "num_tokens": 78074143.0, + "step": 64900 + }, + { + "entropy": 1.965438848733902, + "epoch": 0.2012153206975733, + "grad_norm": 10.353565216064453, + "learning_rate": 5.639823363695008e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.8452044859528541, + "num_tokens": 78084809.0, + "step": 64910 + }, + { + "entropy": 1.8660222113132476, + "epoch": 0.201246319822623, + "grad_norm": 8.646041870117188, + "learning_rate": 5.639388973103089e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8471137225627899, + "num_tokens": 78097733.0, + "step": 64920 + }, + { + "entropy": 1.92643845975399, + "epoch": 0.2012773189476727, + "grad_norm": 8.62920093536377, + "learning_rate": 5.638954682868635e-06, + "loss": 0.5631, + "mean_token_accuracy": 0.8208620086312294, + "num_tokens": 78110514.0, + "step": 64930 + }, + { + "entropy": 1.8348462983965874, + "epoch": 0.2013083180727224, + "grad_norm": 7.881258964538574, + "learning_rate": 5.638520492953008e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8508792489767074, + "num_tokens": 78123634.0, + "step": 64940 + }, + { + "entropy": 1.9531269282102586, + "epoch": 0.2013393171977721, + "grad_norm": 3.806800603866577, + "learning_rate": 5.638086403317592e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8420859977602959, + "num_tokens": 78134835.0, + "step": 64950 + }, + { + "entropy": 1.9173472926020623, + "epoch": 0.2013703163228218, + "grad_norm": 9.862812995910645, + "learning_rate": 5.637652413923792e-06, + "loss": 0.5778, + "mean_token_accuracy": 0.817606084048748, + "num_tokens": 78146512.0, + "step": 64960 + }, + { + "entropy": 1.9328098177909852, + "epoch": 0.2014013154478715, + "grad_norm": 8.525472640991211, + "learning_rate": 5.637218524733035e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.8353380098938942, + "num_tokens": 78158404.0, + "step": 64970 + }, + { + "entropy": 1.9535952508449554, + "epoch": 0.20143231457292118, + "grad_norm": 7.10647439956665, + "learning_rate": 5.636784735706765e-06, + "loss": 0.5739, + "mean_token_accuracy": 0.8305754721164703, + "num_tokens": 78169749.0, + "step": 64980 + }, + { + "entropy": 1.946779978275299, + "epoch": 0.20146331369797088, + "grad_norm": 8.665600776672363, + "learning_rate": 5.636351046806451e-06, + "loss": 0.5525, + "mean_token_accuracy": 0.835592320561409, + "num_tokens": 78180620.0, + "step": 64990 + }, + { + "entropy": 1.8874027088284493, + "epoch": 0.20149431282302058, + "grad_norm": 10.456498146057129, + "learning_rate": 5.6359174579935805e-06, + "loss": 0.5393, + "mean_token_accuracy": 0.8345045700669289, + "num_tokens": 78192494.0, + "step": 65000 + }, + { + "entropy": 1.9049117237329483, + "epoch": 0.20152531194807027, + "grad_norm": 8.67405891418457, + "learning_rate": 5.635483969229662e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.8426391571760178, + "num_tokens": 78204503.0, + "step": 65010 + }, + { + "entropy": 1.937725681066513, + "epoch": 0.20155631107311997, + "grad_norm": 8.417729377746582, + "learning_rate": 5.635050580476227e-06, + "loss": 0.5326, + "mean_token_accuracy": 0.8371980518102646, + "num_tokens": 78215731.0, + "step": 65020 + }, + { + "entropy": 1.881471572816372, + "epoch": 0.20158731019816967, + "grad_norm": 8.803342819213867, + "learning_rate": 5.6346172916948215e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8541508734226226, + "num_tokens": 78228595.0, + "step": 65030 + }, + { + "entropy": 1.997860112786293, + "epoch": 0.20161830932321936, + "grad_norm": 8.877419471740723, + "learning_rate": 5.634184102847018e-06, + "loss": 0.5516, + "mean_token_accuracy": 0.8311259895563126, + "num_tokens": 78239815.0, + "step": 65040 + }, + { + "entropy": 1.9192116037011147, + "epoch": 0.20164930844826906, + "grad_norm": 8.195626258850098, + "learning_rate": 5.6337510138944094e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8414540275931358, + "num_tokens": 78250962.0, + "step": 65050 + }, + { + "entropy": 1.9783461928367614, + "epoch": 0.20168030757331876, + "grad_norm": 9.960369110107422, + "learning_rate": 5.633318024798608e-06, + "loss": 0.5766, + "mean_token_accuracy": 0.8294192060828209, + "num_tokens": 78261583.0, + "step": 65060 + }, + { + "entropy": 1.999306383728981, + "epoch": 0.20171130669836843, + "grad_norm": 8.405972480773926, + "learning_rate": 5.6328851355212445e-06, + "loss": 0.5814, + "mean_token_accuracy": 0.8255703672766685, + "num_tokens": 78272701.0, + "step": 65070 + }, + { + "entropy": 1.9220364853739738, + "epoch": 0.20174230582341812, + "grad_norm": 8.912109375, + "learning_rate": 5.632452346023972e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8397694423794746, + "num_tokens": 78284100.0, + "step": 65080 + }, + { + "entropy": 1.8529915317893029, + "epoch": 0.20177330494846782, + "grad_norm": 4.4439697265625, + "learning_rate": 5.6320196562684685e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8452705577015877, + "num_tokens": 78296972.0, + "step": 65090 + }, + { + "entropy": 1.8957914143800736, + "epoch": 0.20180430407351752, + "grad_norm": 21.00503921508789, + "learning_rate": 5.6315870662164244e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8400771632790566, + "num_tokens": 78309482.0, + "step": 65100 + }, + { + "entropy": 1.9614157870411872, + "epoch": 0.2018353031985672, + "grad_norm": 9.07087230682373, + "learning_rate": 5.631154575829556e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8453190490603447, + "num_tokens": 78321178.0, + "step": 65110 + }, + { + "entropy": 1.8932098597288132, + "epoch": 0.2018663023236169, + "grad_norm": 2.6463136672973633, + "learning_rate": 5.6307221850696e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8434045687317848, + "num_tokens": 78333684.0, + "step": 65120 + }, + { + "entropy": 1.8674306973814965, + "epoch": 0.2018973014486666, + "grad_norm": 8.476088523864746, + "learning_rate": 5.630289893898313e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8461329773068428, + "num_tokens": 78346646.0, + "step": 65130 + }, + { + "entropy": 1.9643313199281693, + "epoch": 0.2019283005737163, + "grad_norm": 9.779008865356445, + "learning_rate": 5.629857702277471e-06, + "loss": 0.5428, + "mean_token_accuracy": 0.8256931126117706, + "num_tokens": 78358044.0, + "step": 65140 + }, + { + "entropy": 1.9168717786669731, + "epoch": 0.201959299698766, + "grad_norm": 8.545971870422363, + "learning_rate": 5.629425610168872e-06, + "loss": 0.5267, + "mean_token_accuracy": 0.8337204769253731, + "num_tokens": 78370065.0, + "step": 65150 + }, + { + "entropy": 1.8033166334033013, + "epoch": 0.2019902988238157, + "grad_norm": 4.0642991065979, + "learning_rate": 5.628993617534335e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8547035023570061, + "num_tokens": 78384325.0, + "step": 65160 + }, + { + "entropy": 1.8826155215501785, + "epoch": 0.2020212979488654, + "grad_norm": 4.962791442871094, + "learning_rate": 5.628561724335695e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8410342499613762, + "num_tokens": 78396987.0, + "step": 65170 + }, + { + "entropy": 1.8694611176848412, + "epoch": 0.2020522970739151, + "grad_norm": 3.838369131088257, + "learning_rate": 5.628129930534814e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.846546882390976, + "num_tokens": 78410583.0, + "step": 65180 + }, + { + "entropy": 1.8311977073550225, + "epoch": 0.20208329619896478, + "grad_norm": 3.510143518447876, + "learning_rate": 5.627698236093573e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8387721896171569, + "num_tokens": 78423729.0, + "step": 65190 + }, + { + "entropy": 1.9377717927098275, + "epoch": 0.20211429532401448, + "grad_norm": 8.662220001220703, + "learning_rate": 5.627266640973867e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.8386742398142815, + "num_tokens": 78435466.0, + "step": 65200 + }, + { + "entropy": 1.9007938578724861, + "epoch": 0.20214529444906418, + "grad_norm": 4.089953899383545, + "learning_rate": 5.626835145137622e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8495527386665345, + "num_tokens": 78447613.0, + "step": 65210 + }, + { + "entropy": 1.9075167581439019, + "epoch": 0.20217629357411387, + "grad_norm": 8.904956817626953, + "learning_rate": 5.626403748546773e-06, + "loss": 0.5229, + "mean_token_accuracy": 0.8407324820756912, + "num_tokens": 78459460.0, + "step": 65220 + }, + { + "entropy": 1.829827456176281, + "epoch": 0.20220729269916357, + "grad_norm": 8.076254844665527, + "learning_rate": 5.625972451163285e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8538295239210129, + "num_tokens": 78472309.0, + "step": 65230 + }, + { + "entropy": 1.964961776137352, + "epoch": 0.20223829182421327, + "grad_norm": 4.608421325683594, + "learning_rate": 5.625541252949139e-06, + "loss": 0.5394, + "mean_token_accuracy": 0.8294176504015922, + "num_tokens": 78483546.0, + "step": 65240 + }, + { + "entropy": 1.9537744015455245, + "epoch": 0.20226929094926296, + "grad_norm": 8.989550590515137, + "learning_rate": 5.6251101538663364e-06, + "loss": 0.5179, + "mean_token_accuracy": 0.8399158343672752, + "num_tokens": 78493937.0, + "step": 65250 + }, + { + "entropy": 1.8486858293414117, + "epoch": 0.20230029007431266, + "grad_norm": 8.618539810180664, + "learning_rate": 5.624679153876901e-06, + "loss": 0.5205, + "mean_token_accuracy": 0.8347037017345429, + "num_tokens": 78506495.0, + "step": 65260 + }, + { + "entropy": 1.8448414601385594, + "epoch": 0.20233128919936236, + "grad_norm": 1.566992998123169, + "learning_rate": 5.624248252942874e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8431580245494843, + "num_tokens": 78519640.0, + "step": 65270 + }, + { + "entropy": 1.9085783809423447, + "epoch": 0.20236228832441205, + "grad_norm": 8.527741432189941, + "learning_rate": 5.623817451026321e-06, + "loss": 0.5242, + "mean_token_accuracy": 0.8311206966638565, + "num_tokens": 78530969.0, + "step": 65280 + }, + { + "entropy": 1.9010276600718499, + "epoch": 0.20239328744946175, + "grad_norm": 8.006790161132812, + "learning_rate": 5.623386748089322e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8460407719016075, + "num_tokens": 78542383.0, + "step": 65290 + }, + { + "entropy": 1.9630857422947883, + "epoch": 0.20242428657451145, + "grad_norm": 10.483144760131836, + "learning_rate": 5.622956144093983e-06, + "loss": 0.5581, + "mean_token_accuracy": 0.8222698003053666, + "num_tokens": 78553514.0, + "step": 65300 + }, + { + "entropy": 1.9169240906834601, + "epoch": 0.20245528569956112, + "grad_norm": 7.206988334655762, + "learning_rate": 5.622525639002427e-06, + "loss": 0.5551, + "mean_token_accuracy": 0.8317204505205155, + "num_tokens": 78565113.0, + "step": 65310 + }, + { + "entropy": 2.0341182202100754, + "epoch": 0.2024862848246108, + "grad_norm": 9.495631217956543, + "learning_rate": 5.6220952327768e-06, + "loss": 0.5879, + "mean_token_accuracy": 0.8261968463659286, + "num_tokens": 78576309.0, + "step": 65320 + }, + { + "entropy": 1.8233429193496704, + "epoch": 0.2025172839496605, + "grad_norm": 7.392541885375977, + "learning_rate": 5.6216649253792645e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8485372483730316, + "num_tokens": 78589699.0, + "step": 65330 + }, + { + "entropy": 1.9578388512134552, + "epoch": 0.2025482830747102, + "grad_norm": 9.04162883758545, + "learning_rate": 5.6212347167720085e-06, + "loss": 0.575, + "mean_token_accuracy": 0.8360649108886719, + "num_tokens": 78601015.0, + "step": 65340 + }, + { + "entropy": 1.9982189297676087, + "epoch": 0.2025792821997599, + "grad_norm": 8.363797187805176, + "learning_rate": 5.620804606917233e-06, + "loss": 0.5966, + "mean_token_accuracy": 0.8217172041535378, + "num_tokens": 78611845.0, + "step": 65350 + }, + { + "entropy": 1.954882425069809, + "epoch": 0.2026102813248096, + "grad_norm": 3.7723734378814697, + "learning_rate": 5.620374595777169e-06, + "loss": 0.5392, + "mean_token_accuracy": 0.8342722833156586, + "num_tokens": 78622857.0, + "step": 65360 + }, + { + "entropy": 1.9132612109184266, + "epoch": 0.2026412804498593, + "grad_norm": 8.916566848754883, + "learning_rate": 5.619944683314056e-06, + "loss": 0.545, + "mean_token_accuracy": 0.8364124834537506, + "num_tokens": 78635064.0, + "step": 65370 + }, + { + "entropy": 1.9422496438026429, + "epoch": 0.202672279574909, + "grad_norm": 9.716279983520508, + "learning_rate": 5.619514869490165e-06, + "loss": 0.5646, + "mean_token_accuracy": 0.8320507362484932, + "num_tokens": 78646048.0, + "step": 65380 + }, + { + "entropy": 1.8317863315343856, + "epoch": 0.2027032786999587, + "grad_norm": 9.566564559936523, + "learning_rate": 5.619085154267778e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8540365427732468, + "num_tokens": 78658841.0, + "step": 65390 + }, + { + "entropy": 1.8700613364577294, + "epoch": 0.20273427782500839, + "grad_norm": 8.530824661254883, + "learning_rate": 5.618655537609205e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8470278188586235, + "num_tokens": 78671464.0, + "step": 65400 + }, + { + "entropy": 1.9383513778448105, + "epoch": 0.20276527695005808, + "grad_norm": 11.235833168029785, + "learning_rate": 5.61822601947677e-06, + "loss": 0.5219, + "mean_token_accuracy": 0.8409254685044288, + "num_tokens": 78683244.0, + "step": 65410 + }, + { + "entropy": 1.8878853350877762, + "epoch": 0.20279627607510778, + "grad_norm": 4.765620231628418, + "learning_rate": 5.617796599832821e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8444775015115737, + "num_tokens": 78695252.0, + "step": 65420 + }, + { + "entropy": 1.9394584164023398, + "epoch": 0.20282727520015748, + "grad_norm": 9.20919132232666, + "learning_rate": 5.617367278639724e-06, + "loss": 0.5591, + "mean_token_accuracy": 0.8318523272871972, + "num_tokens": 78707014.0, + "step": 65430 + }, + { + "entropy": 1.856240051984787, + "epoch": 0.20285827432520717, + "grad_norm": 8.130507469177246, + "learning_rate": 5.6169380558598655e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8484870880842209, + "num_tokens": 78719124.0, + "step": 65440 + }, + { + "entropy": 1.9641065716743469, + "epoch": 0.20288927345025687, + "grad_norm": 11.00446891784668, + "learning_rate": 5.616508931455653e-06, + "loss": 0.6064, + "mean_token_accuracy": 0.8264747887849808, + "num_tokens": 78730740.0, + "step": 65450 + }, + { + "entropy": 1.906115210056305, + "epoch": 0.20292027257530657, + "grad_norm": 9.167335510253906, + "learning_rate": 5.616079905389513e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8373008102178574, + "num_tokens": 78742187.0, + "step": 65460 + }, + { + "entropy": 1.8616785183548927, + "epoch": 0.20295127170035626, + "grad_norm": 4.638375282287598, + "learning_rate": 5.6156509776238955e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.851144377887249, + "num_tokens": 78754915.0, + "step": 65470 + }, + { + "entropy": 1.9143847838044166, + "epoch": 0.20298227082540596, + "grad_norm": 9.230887413024902, + "learning_rate": 5.615222148121263e-06, + "loss": 0.5505, + "mean_token_accuracy": 0.8377783179283143, + "num_tokens": 78767603.0, + "step": 65480 + }, + { + "entropy": 1.9339503601193428, + "epoch": 0.20301326995045565, + "grad_norm": 7.70396614074707, + "learning_rate": 5.614793416844106e-06, + "loss": 0.5429, + "mean_token_accuracy": 0.8370099574327469, + "num_tokens": 78779568.0, + "step": 65490 + }, + { + "entropy": 1.9379954680800437, + "epoch": 0.20304426907550535, + "grad_norm": 9.033950805664062, + "learning_rate": 5.614364783754932e-06, + "loss": 0.5227, + "mean_token_accuracy": 0.8270322412252427, + "num_tokens": 78791706.0, + "step": 65500 + }, + { + "entropy": 1.8664806455373764, + "epoch": 0.20307526820055505, + "grad_norm": 2.266970634460449, + "learning_rate": 5.613936248816266e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8406695753335953, + "num_tokens": 78804244.0, + "step": 65510 + }, + { + "entropy": 1.8240263767540454, + "epoch": 0.20310626732560474, + "grad_norm": 9.05511474609375, + "learning_rate": 5.613507811990659e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8478545337915421, + "num_tokens": 78817721.0, + "step": 65520 + }, + { + "entropy": 1.8720962792634963, + "epoch": 0.20313726645065444, + "grad_norm": 4.106721878051758, + "learning_rate": 5.613079473240674e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8406256809830666, + "num_tokens": 78829445.0, + "step": 65530 + }, + { + "entropy": 1.9249654412269592, + "epoch": 0.20316826557570414, + "grad_norm": 7.825181007385254, + "learning_rate": 5.612651232528903e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8406661361455917, + "num_tokens": 78841287.0, + "step": 65540 + }, + { + "entropy": 1.9155500084161758, + "epoch": 0.20319926470075383, + "grad_norm": 8.039549827575684, + "learning_rate": 5.61222308981795e-06, + "loss": 0.5308, + "mean_token_accuracy": 0.8323755100369453, + "num_tokens": 78853504.0, + "step": 65550 + }, + { + "entropy": 1.9643807247281075, + "epoch": 0.2032302638258035, + "grad_norm": 7.621034622192383, + "learning_rate": 5.611795045070444e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8504145249724389, + "num_tokens": 78864902.0, + "step": 65560 + }, + { + "entropy": 1.8879443630576134, + "epoch": 0.2032612629508532, + "grad_norm": 10.218156814575195, + "learning_rate": 5.611367098249031e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.8361433282494545, + "num_tokens": 78877179.0, + "step": 65570 + }, + { + "entropy": 1.863333511352539, + "epoch": 0.2032922620759029, + "grad_norm": 8.165308952331543, + "learning_rate": 5.61093924931638e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.836507086455822, + "num_tokens": 78889655.0, + "step": 65580 + }, + { + "entropy": 1.8848236933350564, + "epoch": 0.2033232612009526, + "grad_norm": 4.619635581970215, + "learning_rate": 5.610511498235176e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8372624054551124, + "num_tokens": 78901982.0, + "step": 65590 + }, + { + "entropy": 1.9587459236383438, + "epoch": 0.2033542603260023, + "grad_norm": 9.958028793334961, + "learning_rate": 5.610083844968128e-06, + "loss": 0.5397, + "mean_token_accuracy": 0.8413488537073135, + "num_tokens": 78912795.0, + "step": 65600 + }, + { + "entropy": 1.973938637971878, + "epoch": 0.203385259451052, + "grad_norm": 8.87924861907959, + "learning_rate": 5.609656289477961e-06, + "loss": 0.5612, + "mean_token_accuracy": 0.828423522412777, + "num_tokens": 78923643.0, + "step": 65610 + }, + { + "entropy": 1.9414395466446877, + "epoch": 0.20341625857610168, + "grad_norm": 8.88390064239502, + "learning_rate": 5.609228831727426e-06, + "loss": 0.5457, + "mean_token_accuracy": 0.8232395872473717, + "num_tokens": 78934550.0, + "step": 65620 + }, + { + "entropy": 1.9254730343818665, + "epoch": 0.20344725770115138, + "grad_norm": 9.035659790039062, + "learning_rate": 5.6088014716792835e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8413706630468368, + "num_tokens": 78946328.0, + "step": 65630 + }, + { + "entropy": 1.8993248373270035, + "epoch": 0.20347825682620108, + "grad_norm": 8.599936485290527, + "learning_rate": 5.6083742092963255e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8389776006340981, + "num_tokens": 78958262.0, + "step": 65640 + }, + { + "entropy": 1.8646668374538422, + "epoch": 0.20350925595125077, + "grad_norm": 9.56519603729248, + "learning_rate": 5.607947044541355e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8369742676615715, + "num_tokens": 78971613.0, + "step": 65650 + }, + { + "entropy": 1.9807878568768502, + "epoch": 0.20354025507630047, + "grad_norm": 7.997987747192383, + "learning_rate": 5.607519977377199e-06, + "loss": 0.535, + "mean_token_accuracy": 0.8315387591719627, + "num_tokens": 78983244.0, + "step": 65660 + }, + { + "entropy": 1.8589694291353225, + "epoch": 0.20357125420135017, + "grad_norm": 8.80921459197998, + "learning_rate": 5.607093007766705e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8467966377735138, + "num_tokens": 78997102.0, + "step": 65670 + }, + { + "entropy": 1.8324778914451598, + "epoch": 0.20360225332639986, + "grad_norm": 4.456134796142578, + "learning_rate": 5.606666135672738e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8528119862079621, + "num_tokens": 79011102.0, + "step": 65680 + }, + { + "entropy": 1.94354690015316, + "epoch": 0.20363325245144956, + "grad_norm": 8.482820510864258, + "learning_rate": 5.6062393610581824e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8464238449931145, + "num_tokens": 79022653.0, + "step": 65690 + }, + { + "entropy": 1.8365743920207023, + "epoch": 0.20366425157649926, + "grad_norm": 3.94012188911438, + "learning_rate": 5.605812683885945e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8460432440042496, + "num_tokens": 79035953.0, + "step": 65700 + }, + { + "entropy": 1.9237422615289688, + "epoch": 0.20369525070154895, + "grad_norm": 8.296760559082031, + "learning_rate": 5.6053861041189515e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8385673075914383, + "num_tokens": 79049233.0, + "step": 65710 + }, + { + "entropy": 1.9227632522583007, + "epoch": 0.20372624982659865, + "grad_norm": 4.054368019104004, + "learning_rate": 5.604959621720145e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.8334377571940422, + "num_tokens": 79061362.0, + "step": 65720 + }, + { + "entropy": 1.9160942152142524, + "epoch": 0.20375724895164835, + "grad_norm": 3.290419340133667, + "learning_rate": 5.604533236652492e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.840754111111164, + "num_tokens": 79073746.0, + "step": 65730 + }, + { + "entropy": 1.9141247898340226, + "epoch": 0.20378824807669804, + "grad_norm": 8.813379287719727, + "learning_rate": 5.604106948878974e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.846279302239418, + "num_tokens": 79086069.0, + "step": 65740 + }, + { + "entropy": 1.8690850347280503, + "epoch": 0.20381924720174774, + "grad_norm": 3.4831085205078125, + "learning_rate": 5.603680758362599e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.844321160018444, + "num_tokens": 79098143.0, + "step": 65750 + }, + { + "entropy": 1.9780056357383728, + "epoch": 0.20385024632679744, + "grad_norm": 8.82219409942627, + "learning_rate": 5.603254665066387e-06, + "loss": 0.5811, + "mean_token_accuracy": 0.8181054502725601, + "num_tokens": 79109063.0, + "step": 65760 + }, + { + "entropy": 1.9268645226955414, + "epoch": 0.20388124545184713, + "grad_norm": 7.107331275939941, + "learning_rate": 5.602828668953384e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8484486505389214, + "num_tokens": 79120352.0, + "step": 65770 + }, + { + "entropy": 1.9007486268877982, + "epoch": 0.20391224457689683, + "grad_norm": 8.633251190185547, + "learning_rate": 5.602402769986652e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8409877792000771, + "num_tokens": 79131530.0, + "step": 65780 + }, + { + "entropy": 1.873155763745308, + "epoch": 0.20394324370194652, + "grad_norm": 4.75627326965332, + "learning_rate": 5.601976968129274e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8587671235203743, + "num_tokens": 79143461.0, + "step": 65790 + }, + { + "entropy": 1.895801869034767, + "epoch": 0.20397424282699622, + "grad_norm": 6.479430675506592, + "learning_rate": 5.6015512633443526e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.8410785019397735, + "num_tokens": 79155115.0, + "step": 65800 + }, + { + "entropy": 1.8817868903279305, + "epoch": 0.2040052419520459, + "grad_norm": 7.321788787841797, + "learning_rate": 5.60112565559501e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8473842695355416, + "num_tokens": 79168062.0, + "step": 65810 + }, + { + "entropy": 1.9196514919400216, + "epoch": 0.2040362410770956, + "grad_norm": 4.99709415435791, + "learning_rate": 5.600700144844387e-06, + "loss": 0.5157, + "mean_token_accuracy": 0.8285318657755851, + "num_tokens": 79180238.0, + "step": 65820 + }, + { + "entropy": 1.880447769165039, + "epoch": 0.20406724020214528, + "grad_norm": 10.029952049255371, + "learning_rate": 5.600274731055645e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.8361893489956855, + "num_tokens": 79193071.0, + "step": 65830 + }, + { + "entropy": 1.9641305297613143, + "epoch": 0.20409823932719498, + "grad_norm": 8.848322868347168, + "learning_rate": 5.599849414191965e-06, + "loss": 0.5383, + "mean_token_accuracy": 0.8405619546771049, + "num_tokens": 79204235.0, + "step": 65840 + }, + { + "entropy": 1.896867436170578, + "epoch": 0.20412923845224468, + "grad_norm": 9.602326393127441, + "learning_rate": 5.599424194216547e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8406928956508637, + "num_tokens": 79216424.0, + "step": 65850 + }, + { + "entropy": 1.9198857069015502, + "epoch": 0.20416023757729437, + "grad_norm": 10.032066345214844, + "learning_rate": 5.598999071092613e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8465869203209877, + "num_tokens": 79227957.0, + "step": 65860 + }, + { + "entropy": 1.8796373263001442, + "epoch": 0.20419123670234407, + "grad_norm": 4.095681190490723, + "learning_rate": 5.598574044783399e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8457603216171264, + "num_tokens": 79241020.0, + "step": 65870 + }, + { + "entropy": 1.890330995619297, + "epoch": 0.20422223582739377, + "grad_norm": 8.238532066345215, + "learning_rate": 5.598149115252166e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8491918399930001, + "num_tokens": 79254022.0, + "step": 65880 + }, + { + "entropy": 1.960501140356064, + "epoch": 0.20425323495244346, + "grad_norm": 9.54404354095459, + "learning_rate": 5.597724282462193e-06, + "loss": 0.5882, + "mean_token_accuracy": 0.8301891297101974, + "num_tokens": 79265375.0, + "step": 65890 + }, + { + "entropy": 1.8832382291555405, + "epoch": 0.20428423407749316, + "grad_norm": 10.813145637512207, + "learning_rate": 5.597299546376778e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.8315230071544647, + "num_tokens": 79277983.0, + "step": 65900 + }, + { + "entropy": 1.9647821724414825, + "epoch": 0.20431523320254286, + "grad_norm": 8.538866996765137, + "learning_rate": 5.596874906959238e-06, + "loss": 0.5566, + "mean_token_accuracy": 0.8316359147429466, + "num_tokens": 79289116.0, + "step": 65910 + }, + { + "entropy": 1.8251458272337913, + "epoch": 0.20434623232759255, + "grad_norm": 8.377923965454102, + "learning_rate": 5.596450364172909e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.86036896109581, + "num_tokens": 79301925.0, + "step": 65920 + }, + { + "entropy": 1.9117643371224404, + "epoch": 0.20437723145264225, + "grad_norm": 8.313860893249512, + "learning_rate": 5.596025917981147e-06, + "loss": 0.5422, + "mean_token_accuracy": 0.8350561752915382, + "num_tokens": 79313308.0, + "step": 65930 + }, + { + "entropy": 1.9248536437749864, + "epoch": 0.20440823057769195, + "grad_norm": 7.247808933258057, + "learning_rate": 5.595601568347332e-06, + "loss": 0.5714, + "mean_token_accuracy": 0.8272919088602066, + "num_tokens": 79325093.0, + "step": 65940 + }, + { + "entropy": 1.9249488562345505, + "epoch": 0.20443922970274164, + "grad_norm": 4.61583137512207, + "learning_rate": 5.5951773152348545e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8503524556756019, + "num_tokens": 79336001.0, + "step": 65950 + }, + { + "entropy": 1.8732093065977096, + "epoch": 0.20447022882779134, + "grad_norm": 8.46263313293457, + "learning_rate": 5.594753158607133e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8438627734780312, + "num_tokens": 79347710.0, + "step": 65960 + }, + { + "entropy": 1.8675259336829186, + "epoch": 0.20450122795284104, + "grad_norm": 9.65857219696045, + "learning_rate": 5.594329098427599e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8433469668030739, + "num_tokens": 79359872.0, + "step": 65970 + }, + { + "entropy": 1.8642318069934845, + "epoch": 0.20453222707789073, + "grad_norm": 3.6823036670684814, + "learning_rate": 5.5939051346597075e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.853775417804718, + "num_tokens": 79372088.0, + "step": 65980 + }, + { + "entropy": 1.7361238494515419, + "epoch": 0.20456322620294043, + "grad_norm": 3.253519296646118, + "learning_rate": 5.59348126726693e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8488474145531655, + "num_tokens": 79386133.0, + "step": 65990 + }, + { + "entropy": 1.9137817591428756, + "epoch": 0.20459422532799013, + "grad_norm": 9.194711685180664, + "learning_rate": 5.593057496212762e-06, + "loss": 0.5272, + "mean_token_accuracy": 0.8341950923204422, + "num_tokens": 79397436.0, + "step": 66000 + }, + { + "entropy": 1.8905989840626716, + "epoch": 0.20462522445303982, + "grad_norm": 9.221059799194336, + "learning_rate": 5.592633821460712e-06, + "loss": 0.5225, + "mean_token_accuracy": 0.8302080318331718, + "num_tokens": 79409957.0, + "step": 66010 + }, + { + "entropy": 1.9771162524819375, + "epoch": 0.20465622357808952, + "grad_norm": 8.793451309204102, + "learning_rate": 5.592210242974312e-06, + "loss": 0.5649, + "mean_token_accuracy": 0.8244690522551537, + "num_tokens": 79421218.0, + "step": 66020 + }, + { + "entropy": 1.9063599698245526, + "epoch": 0.20468722270313922, + "grad_norm": 8.209443092346191, + "learning_rate": 5.5917867607171115e-06, + "loss": 0.5213, + "mean_token_accuracy": 0.8356138646602631, + "num_tokens": 79432683.0, + "step": 66030 + }, + { + "entropy": 1.9731371477246284, + "epoch": 0.2047182218281889, + "grad_norm": 8.800919532775879, + "learning_rate": 5.5913633746526845e-06, + "loss": 0.5424, + "mean_token_accuracy": 0.8384016111493111, + "num_tokens": 79443817.0, + "step": 66040 + }, + { + "entropy": 1.9695898175239563, + "epoch": 0.2047492209532386, + "grad_norm": 5.338846206665039, + "learning_rate": 5.590940084744614e-06, + "loss": 0.556, + "mean_token_accuracy": 0.8361503854393959, + "num_tokens": 79455128.0, + "step": 66050 + }, + { + "entropy": 1.9504301056265831, + "epoch": 0.20478022007828828, + "grad_norm": 6.454843997955322, + "learning_rate": 5.590516890956512e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8300734832882881, + "num_tokens": 79466545.0, + "step": 66060 + }, + { + "entropy": 1.916282233595848, + "epoch": 0.20481121920333797, + "grad_norm": 10.762775421142578, + "learning_rate": 5.590093793252005e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.850112085044384, + "num_tokens": 79477559.0, + "step": 66070 + }, + { + "entropy": 1.932596181333065, + "epoch": 0.20484221832838767, + "grad_norm": 8.139491081237793, + "learning_rate": 5.5896707915947404e-06, + "loss": 0.5233, + "mean_token_accuracy": 0.8297857508063317, + "num_tokens": 79489215.0, + "step": 66080 + }, + { + "entropy": 1.7636524528264999, + "epoch": 0.20487321745343737, + "grad_norm": 3.8502655029296875, + "learning_rate": 5.5892478859483836e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8482681661844254, + "num_tokens": 79503087.0, + "step": 66090 + }, + { + "entropy": 1.9150663495063782, + "epoch": 0.20490421657848706, + "grad_norm": 8.005017280578613, + "learning_rate": 5.588825076276619e-06, + "loss": 0.5327, + "mean_token_accuracy": 0.8414066791534424, + "num_tokens": 79514977.0, + "step": 66100 + }, + { + "entropy": 1.884339025616646, + "epoch": 0.20493521570353676, + "grad_norm": 12.069209098815918, + "learning_rate": 5.5884023625431536e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8346866846084595, + "num_tokens": 79527912.0, + "step": 66110 + }, + { + "entropy": 1.9233988001942635, + "epoch": 0.20496621482858646, + "grad_norm": 7.263315200805664, + "learning_rate": 5.58797974471171e-06, + "loss": 0.5187, + "mean_token_accuracy": 0.8440830901265144, + "num_tokens": 79539486.0, + "step": 66120 + }, + { + "entropy": 1.9001603171229362, + "epoch": 0.20499721395363615, + "grad_norm": 8.054461479187012, + "learning_rate": 5.587557222746031e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8448784217238426, + "num_tokens": 79551307.0, + "step": 66130 + }, + { + "entropy": 1.966906487941742, + "epoch": 0.20502821307868585, + "grad_norm": 7.843715190887451, + "learning_rate": 5.587134796609878e-06, + "loss": 0.534, + "mean_token_accuracy": 0.8330809384584427, + "num_tokens": 79563106.0, + "step": 66140 + }, + { + "entropy": 1.9580473832786083, + "epoch": 0.20505921220373555, + "grad_norm": 8.565571784973145, + "learning_rate": 5.586712466267033e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8391212552785874, + "num_tokens": 79574679.0, + "step": 66150 + }, + { + "entropy": 1.9535856321454048, + "epoch": 0.20509021132878524, + "grad_norm": 7.142053604125977, + "learning_rate": 5.586290231681297e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8499238818883896, + "num_tokens": 79586243.0, + "step": 66160 + }, + { + "entropy": 1.832335925102234, + "epoch": 0.20512121045383494, + "grad_norm": 4.120788097381592, + "learning_rate": 5.58586809281649e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8358307898044586, + "num_tokens": 79599760.0, + "step": 66170 + }, + { + "entropy": 2.0018374592065813, + "epoch": 0.20515220957888464, + "grad_norm": 9.204635620117188, + "learning_rate": 5.585446049636449e-06, + "loss": 0.5362, + "mean_token_accuracy": 0.8410599693655968, + "num_tokens": 79610302.0, + "step": 66180 + }, + { + "entropy": 1.914768175780773, + "epoch": 0.20518320870393433, + "grad_norm": 9.314388275146484, + "learning_rate": 5.585024102105034e-06, + "loss": 0.5387, + "mean_token_accuracy": 0.8398109778761864, + "num_tokens": 79621486.0, + "step": 66190 + }, + { + "entropy": 1.8635256588459015, + "epoch": 0.20521420782898403, + "grad_norm": 8.0902738571167, + "learning_rate": 5.5846022501861204e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8411575838923454, + "num_tokens": 79634109.0, + "step": 66200 + }, + { + "entropy": 1.9430198609828948, + "epoch": 0.20524520695403373, + "grad_norm": 9.754541397094727, + "learning_rate": 5.584180493843605e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.8356720983982087, + "num_tokens": 79645040.0, + "step": 66210 + }, + { + "entropy": 1.9141881585121154, + "epoch": 0.20527620607908342, + "grad_norm": 4.26265287399292, + "learning_rate": 5.583758833041404e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8484968930482865, + "num_tokens": 79656732.0, + "step": 66220 + }, + { + "entropy": 1.9716264665126801, + "epoch": 0.20530720520413312, + "grad_norm": 8.999820709228516, + "learning_rate": 5.583337267743449e-06, + "loss": 0.5258, + "mean_token_accuracy": 0.8347686797380447, + "num_tokens": 79667904.0, + "step": 66230 + }, + { + "entropy": 1.9396614864468575, + "epoch": 0.20533820432918282, + "grad_norm": 8.992365837097168, + "learning_rate": 5.582915797913695e-06, + "loss": 0.5204, + "mean_token_accuracy": 0.8403194323182106, + "num_tokens": 79679499.0, + "step": 66240 + }, + { + "entropy": 1.9468914091587066, + "epoch": 0.2053692034542325, + "grad_norm": 3.764883279800415, + "learning_rate": 5.582494423516115e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.8384948015213013, + "num_tokens": 79690852.0, + "step": 66250 + }, + { + "entropy": 1.9295331597328187, + "epoch": 0.2054002025792822, + "grad_norm": 7.999744415283203, + "learning_rate": 5.582073144514698e-06, + "loss": 0.5274, + "mean_token_accuracy": 0.8378969594836235, + "num_tokens": 79701855.0, + "step": 66260 + }, + { + "entropy": 1.9138659819960595, + "epoch": 0.2054312017043319, + "grad_norm": 7.380083084106445, + "learning_rate": 5.5816519608734575e-06, + "loss": 0.5216, + "mean_token_accuracy": 0.8312686800956726, + "num_tokens": 79713471.0, + "step": 66270 + }, + { + "entropy": 1.8375635713338851, + "epoch": 0.2054622008293816, + "grad_norm": 6.883208751678467, + "learning_rate": 5.58123087255642e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8480865985155106, + "num_tokens": 79726767.0, + "step": 66280 + }, + { + "entropy": 1.8265572801232337, + "epoch": 0.2054931999544313, + "grad_norm": 10.417244911193848, + "learning_rate": 5.580809879527636e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8491293832659721, + "num_tokens": 79739638.0, + "step": 66290 + }, + { + "entropy": 1.8894486725330353, + "epoch": 0.20552419907948097, + "grad_norm": 6.471380710601807, + "learning_rate": 5.580388981751174e-06, + "loss": 0.5406, + "mean_token_accuracy": 0.8371762230992317, + "num_tokens": 79752223.0, + "step": 66300 + }, + { + "entropy": 1.7594513550400734, + "epoch": 0.20555519820453066, + "grad_norm": 3.132899522781372, + "learning_rate": 5.579968179191117e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8502689436078071, + "num_tokens": 79766341.0, + "step": 66310 + }, + { + "entropy": 1.7812666043639183, + "epoch": 0.20558619732958036, + "grad_norm": 3.3629016876220703, + "learning_rate": 5.579547471811571e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8472658976912498, + "num_tokens": 79779942.0, + "step": 66320 + }, + { + "entropy": 1.908031238615513, + "epoch": 0.20561719645463006, + "grad_norm": 9.27261734008789, + "learning_rate": 5.579126859576662e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8606662392616272, + "num_tokens": 79791530.0, + "step": 66330 + }, + { + "entropy": 1.912331785261631, + "epoch": 0.20564819557967975, + "grad_norm": 6.716431140899658, + "learning_rate": 5.578706342450532e-06, + "loss": 0.5208, + "mean_token_accuracy": 0.8352584257721901, + "num_tokens": 79804036.0, + "step": 66340 + }, + { + "entropy": 1.886544433236122, + "epoch": 0.20567919470472945, + "grad_norm": 7.406538963317871, + "learning_rate": 5.578285920397344e-06, + "loss": 0.469, + "mean_token_accuracy": 0.839650048315525, + "num_tokens": 79816050.0, + "step": 66350 + }, + { + "entropy": 1.9246870696544647, + "epoch": 0.20571019382977915, + "grad_norm": 9.786649703979492, + "learning_rate": 5.577865593381278e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8357265755534172, + "num_tokens": 79827947.0, + "step": 66360 + }, + { + "entropy": 1.9001474544405936, + "epoch": 0.20574119295482884, + "grad_norm": 5.080446243286133, + "learning_rate": 5.577445361366534e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8487439841032028, + "num_tokens": 79839935.0, + "step": 66370 + }, + { + "entropy": 1.9511905193328858, + "epoch": 0.20577219207987854, + "grad_norm": 9.604957580566406, + "learning_rate": 5.577025224317332e-06, + "loss": 0.536, + "mean_token_accuracy": 0.8385162249207496, + "num_tokens": 79851151.0, + "step": 66380 + }, + { + "entropy": 1.9839515179395675, + "epoch": 0.20580319120492824, + "grad_norm": 8.998725891113281, + "learning_rate": 5.576605182197907e-06, + "loss": 0.5523, + "mean_token_accuracy": 0.8333553358912468, + "num_tokens": 79862474.0, + "step": 66390 + }, + { + "entropy": 2.031079703569412, + "epoch": 0.20583419032997793, + "grad_norm": 9.277356147766113, + "learning_rate": 5.57618523497252e-06, + "loss": 0.5982, + "mean_token_accuracy": 0.822320033609867, + "num_tokens": 79873184.0, + "step": 66400 + }, + { + "entropy": 1.959704938530922, + "epoch": 0.20586518945502763, + "grad_norm": 9.140758514404297, + "learning_rate": 5.575765382605441e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8415651440620422, + "num_tokens": 79884713.0, + "step": 66410 + }, + { + "entropy": 1.9794253259897232, + "epoch": 0.20589618858007733, + "grad_norm": 10.414609909057617, + "learning_rate": 5.575345625060967e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.844949309527874, + "num_tokens": 79895411.0, + "step": 66420 + }, + { + "entropy": 1.9424155369400977, + "epoch": 0.20592718770512702, + "grad_norm": 7.093960762023926, + "learning_rate": 5.574925962303411e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8516281858086586, + "num_tokens": 79906896.0, + "step": 66430 + }, + { + "entropy": 1.966086308658123, + "epoch": 0.20595818683017672, + "grad_norm": 9.383204460144043, + "learning_rate": 5.574506394297104e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.8342742830514908, + "num_tokens": 79918533.0, + "step": 66440 + }, + { + "entropy": 1.9549981564283372, + "epoch": 0.20598918595522642, + "grad_norm": 7.823967456817627, + "learning_rate": 5.574086921006398e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.8452614173293114, + "num_tokens": 79930153.0, + "step": 66450 + }, + { + "entropy": 1.9196519732475281, + "epoch": 0.2060201850802761, + "grad_norm": 10.632098197937012, + "learning_rate": 5.57366754239566e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8417575985193253, + "num_tokens": 79941908.0, + "step": 66460 + }, + { + "entropy": 1.9567938655614854, + "epoch": 0.2060511842053258, + "grad_norm": 7.4412150382995605, + "learning_rate": 5.57324825842928e-06, + "loss": 0.5404, + "mean_token_accuracy": 0.8311595633625984, + "num_tokens": 79953034.0, + "step": 66470 + }, + { + "entropy": 1.9112473011016846, + "epoch": 0.2060821833303755, + "grad_norm": 10.366971015930176, + "learning_rate": 5.572829069071665e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8476077273488045, + "num_tokens": 79964657.0, + "step": 66480 + }, + { + "entropy": 1.9210296481847764, + "epoch": 0.2061131824554252, + "grad_norm": 9.77088451385498, + "learning_rate": 5.572409974287238e-06, + "loss": 0.525, + "mean_token_accuracy": 0.8393374785780907, + "num_tokens": 79976836.0, + "step": 66490 + }, + { + "entropy": 1.9088515147566796, + "epoch": 0.2061441815804749, + "grad_norm": 8.623429298400879, + "learning_rate": 5.5719909740404465e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.8390779867768288, + "num_tokens": 79988995.0, + "step": 66500 + }, + { + "entropy": 1.8673071622848512, + "epoch": 0.2061751807055246, + "grad_norm": 4.687022686004639, + "learning_rate": 5.571572068295751e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8348147094249725, + "num_tokens": 80001572.0, + "step": 66510 + }, + { + "entropy": 1.9063550010323524, + "epoch": 0.2062061798305743, + "grad_norm": 7.440145015716553, + "learning_rate": 5.571153257017634e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8437089160084724, + "num_tokens": 80013203.0, + "step": 66520 + }, + { + "entropy": 1.8806561693549155, + "epoch": 0.206237178955624, + "grad_norm": 3.6028239727020264, + "learning_rate": 5.570734540170597e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8492167726159096, + "num_tokens": 80025389.0, + "step": 66530 + }, + { + "entropy": 2.0046221882104875, + "epoch": 0.2062681780806737, + "grad_norm": 10.231752395629883, + "learning_rate": 5.570315917719158e-06, + "loss": 0.5636, + "mean_token_accuracy": 0.8313092350959778, + "num_tokens": 80036574.0, + "step": 66540 + }, + { + "entropy": 1.9903879404067992, + "epoch": 0.20629917720572336, + "grad_norm": 9.108248710632324, + "learning_rate": 5.569897389627855e-06, + "loss": 0.6043, + "mean_token_accuracy": 0.8205847591161728, + "num_tokens": 80048154.0, + "step": 66550 + }, + { + "entropy": 1.9472715437412262, + "epoch": 0.20633017633077305, + "grad_norm": 8.83901596069336, + "learning_rate": 5.569478955861244e-06, + "loss": 0.5332, + "mean_token_accuracy": 0.8277617216110229, + "num_tokens": 80060061.0, + "step": 66560 + }, + { + "entropy": 1.9148873046040535, + "epoch": 0.20636117545582275, + "grad_norm": 8.391643524169922, + "learning_rate": 5.5690606163839e-06, + "loss": 0.5329, + "mean_token_accuracy": 0.8314318493008613, + "num_tokens": 80071978.0, + "step": 66570 + }, + { + "entropy": 1.9878364622592926, + "epoch": 0.20639217458087245, + "grad_norm": 9.095196723937988, + "learning_rate": 5.568642371160418e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.8388659507036209, + "num_tokens": 80083500.0, + "step": 66580 + }, + { + "entropy": 1.941962979733944, + "epoch": 0.20642317370592214, + "grad_norm": 10.697854995727539, + "learning_rate": 5.568224220155408e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.833092825114727, + "num_tokens": 80095609.0, + "step": 66590 + }, + { + "entropy": 1.8618491291999817, + "epoch": 0.20645417283097184, + "grad_norm": 3.9840543270111084, + "learning_rate": 5.567806163333503e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8554950803518295, + "num_tokens": 80108181.0, + "step": 66600 + }, + { + "entropy": 1.8243538379669189, + "epoch": 0.20648517195602153, + "grad_norm": 3.919654130935669, + "learning_rate": 5.5673882006593514e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8437583655118942, + "num_tokens": 80121901.0, + "step": 66610 + }, + { + "entropy": 1.950028759241104, + "epoch": 0.20651617108107123, + "grad_norm": 9.593600273132324, + "learning_rate": 5.566970332097621e-06, + "loss": 0.5336, + "mean_token_accuracy": 0.8381760567426682, + "num_tokens": 80132818.0, + "step": 66620 + }, + { + "entropy": 1.9005480587482453, + "epoch": 0.20654717020612093, + "grad_norm": 8.083364486694336, + "learning_rate": 5.5665525576129985e-06, + "loss": 0.5259, + "mean_token_accuracy": 0.837388077378273, + "num_tokens": 80145296.0, + "step": 66630 + }, + { + "entropy": 1.9510286539793014, + "epoch": 0.20657816933117062, + "grad_norm": 9.924178123474121, + "learning_rate": 5.566134877170189e-06, + "loss": 0.5598, + "mean_token_accuracy": 0.8308894857764244, + "num_tokens": 80156187.0, + "step": 66640 + }, + { + "entropy": 1.9527115762233733, + "epoch": 0.20660916845622032, + "grad_norm": 8.096648216247559, + "learning_rate": 5.565717290733918e-06, + "loss": 0.5243, + "mean_token_accuracy": 0.8337409555912018, + "num_tokens": 80167814.0, + "step": 66650 + }, + { + "entropy": 2.028332456946373, + "epoch": 0.20664016758127002, + "grad_norm": 8.106879234313965, + "learning_rate": 5.565299798268925e-06, + "loss": 0.5553, + "mean_token_accuracy": 0.8302205935120582, + "num_tokens": 80178500.0, + "step": 66660 + }, + { + "entropy": 2.0136495500802996, + "epoch": 0.20667116670631971, + "grad_norm": 10.762166023254395, + "learning_rate": 5.5648823997399714e-06, + "loss": 0.5889, + "mean_token_accuracy": 0.8241096153855324, + "num_tokens": 80189113.0, + "step": 66670 + }, + { + "entropy": 1.8538544356822968, + "epoch": 0.2067021658313694, + "grad_norm": 9.980530738830566, + "learning_rate": 5.564465095111836e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.854385307431221, + "num_tokens": 80201832.0, + "step": 66680 + }, + { + "entropy": 1.9106570437550545, + "epoch": 0.2067331649564191, + "grad_norm": 8.356368064880371, + "learning_rate": 5.564047884349318e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8357516199350357, + "num_tokens": 80213809.0, + "step": 66690 + }, + { + "entropy": 1.9118531972169877, + "epoch": 0.2067641640814688, + "grad_norm": 7.672004699707031, + "learning_rate": 5.563630767417233e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8388531014323235, + "num_tokens": 80226123.0, + "step": 66700 + }, + { + "entropy": 1.8637908115983008, + "epoch": 0.2067951632065185, + "grad_norm": 4.457185745239258, + "learning_rate": 5.563213744280416e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.849481725692749, + "num_tokens": 80238328.0, + "step": 66710 + }, + { + "entropy": 2.0010482162237166, + "epoch": 0.2068261623315682, + "grad_norm": 10.969298362731934, + "learning_rate": 5.562796814903717e-06, + "loss": 0.5952, + "mean_token_accuracy": 0.8272617772221565, + "num_tokens": 80249249.0, + "step": 66720 + }, + { + "entropy": 1.8944606065750123, + "epoch": 0.2068571614566179, + "grad_norm": 8.652742385864258, + "learning_rate": 5.562379979252011e-06, + "loss": 0.5298, + "mean_token_accuracy": 0.8307804465293884, + "num_tokens": 80262484.0, + "step": 66730 + }, + { + "entropy": 1.8551345482468604, + "epoch": 0.2068881605816676, + "grad_norm": 9.101856231689453, + "learning_rate": 5.5619632372901865e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8466040581464768, + "num_tokens": 80275457.0, + "step": 66740 + }, + { + "entropy": 1.8982827827334403, + "epoch": 0.2069191597067173, + "grad_norm": 7.065288543701172, + "learning_rate": 5.561546588983153e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.851127202808857, + "num_tokens": 80287822.0, + "step": 66750 + }, + { + "entropy": 1.9204147845506667, + "epoch": 0.20695015883176698, + "grad_norm": 3.726187229156494, + "learning_rate": 5.561130034295834e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8382663875818253, + "num_tokens": 80299151.0, + "step": 66760 + }, + { + "entropy": 1.8413376584649086, + "epoch": 0.20698115795681668, + "grad_norm": 3.96382737159729, + "learning_rate": 5.560713573193179e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8472777932882309, + "num_tokens": 80311811.0, + "step": 66770 + }, + { + "entropy": 1.8229566425085069, + "epoch": 0.20701215708186638, + "grad_norm": 9.275156021118164, + "learning_rate": 5.560297205640148e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8521854087710381, + "num_tokens": 80324515.0, + "step": 66780 + }, + { + "entropy": 1.9010821655392647, + "epoch": 0.20704315620691607, + "grad_norm": 3.5451276302337646, + "learning_rate": 5.559880931601726e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8351982876658439, + "num_tokens": 80337231.0, + "step": 66790 + }, + { + "entropy": 1.9652031093835831, + "epoch": 0.20707415533196574, + "grad_norm": 9.789170265197754, + "learning_rate": 5.559464751042909e-06, + "loss": 0.5863, + "mean_token_accuracy": 0.8287955492734909, + "num_tokens": 80348415.0, + "step": 66800 + }, + { + "entropy": 1.9459122210741042, + "epoch": 0.20710515445701544, + "grad_norm": 8.680326461791992, + "learning_rate": 5.559048663928719e-06, + "loss": 0.5252, + "mean_token_accuracy": 0.837482500076294, + "num_tokens": 80360317.0, + "step": 66810 + }, + { + "entropy": 1.882257905602455, + "epoch": 0.20713615358206514, + "grad_norm": 9.22433090209961, + "learning_rate": 5.558632670224192e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8451365128159523, + "num_tokens": 80372302.0, + "step": 66820 + }, + { + "entropy": 1.9027293175458908, + "epoch": 0.20716715270711483, + "grad_norm": 10.963581085205078, + "learning_rate": 5.558216769894383e-06, + "loss": 0.5326, + "mean_token_accuracy": 0.8382361069321632, + "num_tokens": 80384407.0, + "step": 66830 + }, + { + "entropy": 1.9402805969119072, + "epoch": 0.20719815183216453, + "grad_norm": 10.329010009765625, + "learning_rate": 5.557800962904364e-06, + "loss": 0.5305, + "mean_token_accuracy": 0.8304101303219795, + "num_tokens": 80396463.0, + "step": 66840 + }, + { + "entropy": 1.9431497290730477, + "epoch": 0.20722915095721423, + "grad_norm": 6.116969108581543, + "learning_rate": 5.557385249219228e-06, + "loss": 0.5488, + "mean_token_accuracy": 0.8325564071536065, + "num_tokens": 80407622.0, + "step": 66850 + }, + { + "entropy": 1.9404701754450797, + "epoch": 0.20726015008226392, + "grad_norm": 8.709739685058594, + "learning_rate": 5.5569696288040865e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8362139865756035, + "num_tokens": 80418746.0, + "step": 66860 + }, + { + "entropy": 1.833868359029293, + "epoch": 0.20729114920731362, + "grad_norm": 3.70329213142395, + "learning_rate": 5.5565541016240665e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8538075044751168, + "num_tokens": 80431962.0, + "step": 66870 + }, + { + "entropy": 1.9629965022206306, + "epoch": 0.20732214833236332, + "grad_norm": 7.854671478271484, + "learning_rate": 5.556138667644313e-06, + "loss": 0.5174, + "mean_token_accuracy": 0.8370448827743531, + "num_tokens": 80443654.0, + "step": 66880 + }, + { + "entropy": 1.8818984359502793, + "epoch": 0.207353147457413, + "grad_norm": 9.3052396774292, + "learning_rate": 5.5557233268299925e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8465585172176361, + "num_tokens": 80456312.0, + "step": 66890 + }, + { + "entropy": 1.925614383816719, + "epoch": 0.2073841465824627, + "grad_norm": 8.081002235412598, + "learning_rate": 5.555308079146288e-06, + "loss": 0.5944, + "mean_token_accuracy": 0.8237112745642662, + "num_tokens": 80468324.0, + "step": 66900 + }, + { + "entropy": 1.9605086162686347, + "epoch": 0.2074151457075124, + "grad_norm": 9.929429054260254, + "learning_rate": 5.554892924558401e-06, + "loss": 0.5574, + "mean_token_accuracy": 0.838320504128933, + "num_tokens": 80480134.0, + "step": 66910 + }, + { + "entropy": 1.9550656154751778, + "epoch": 0.2074461448325621, + "grad_norm": 9.695679664611816, + "learning_rate": 5.554477863031548e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.8479965060949326, + "num_tokens": 80491369.0, + "step": 66920 + }, + { + "entropy": 1.7863646060228349, + "epoch": 0.2074771439576118, + "grad_norm": 8.903475761413574, + "learning_rate": 5.55406289453097e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8561421141028405, + "num_tokens": 80504627.0, + "step": 66930 + }, + { + "entropy": 1.9312346950173378, + "epoch": 0.2075081430826615, + "grad_norm": 8.315851211547852, + "learning_rate": 5.553648019021922e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8424394950270653, + "num_tokens": 80515858.0, + "step": 66940 + }, + { + "entropy": 1.9585540309548377, + "epoch": 0.2075391422077112, + "grad_norm": 8.037333488464355, + "learning_rate": 5.553233236469678e-06, + "loss": 0.5528, + "mean_token_accuracy": 0.8312124446034431, + "num_tokens": 80527650.0, + "step": 66950 + }, + { + "entropy": 1.885070489346981, + "epoch": 0.2075701413327609, + "grad_norm": 3.955148458480835, + "learning_rate": 5.552818546839529e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8447909742593765, + "num_tokens": 80539892.0, + "step": 66960 + }, + { + "entropy": 1.9014841109514236, + "epoch": 0.20760114045781058, + "grad_norm": 8.854087829589844, + "learning_rate": 5.552403950096787e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8488608747720718, + "num_tokens": 80551351.0, + "step": 66970 + }, + { + "entropy": 1.9420479103922843, + "epoch": 0.20763213958286028, + "grad_norm": 8.27338981628418, + "learning_rate": 5.551989446206778e-06, + "loss": 0.5369, + "mean_token_accuracy": 0.8347111612558364, + "num_tokens": 80563303.0, + "step": 66980 + }, + { + "entropy": 1.830499567091465, + "epoch": 0.20766313870790998, + "grad_norm": 8.519128799438477, + "learning_rate": 5.55157503513485e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8499986290931701, + "num_tokens": 80575761.0, + "step": 66990 + }, + { + "entropy": 1.8168928176164627, + "epoch": 0.20769413783295967, + "grad_norm": 3.4133358001708984, + "learning_rate": 5.551160716846368e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8549173071980476, + "num_tokens": 80588988.0, + "step": 67000 + }, + { + "entropy": 1.7608814522624017, + "epoch": 0.20772513695800937, + "grad_norm": 9.151144981384277, + "learning_rate": 5.550746491306713e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8523845434188843, + "num_tokens": 80602468.0, + "step": 67010 + }, + { + "entropy": 1.8375635772943497, + "epoch": 0.20775613608305907, + "grad_norm": 9.777922630310059, + "learning_rate": 5.5503323584812866e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8414626881480217, + "num_tokens": 80615406.0, + "step": 67020 + }, + { + "entropy": 1.7990097239613534, + "epoch": 0.20778713520810876, + "grad_norm": 5.260679244995117, + "learning_rate": 5.549918318335509e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8451185315847397, + "num_tokens": 80628306.0, + "step": 67030 + }, + { + "entropy": 1.8393166303634643, + "epoch": 0.20781813433315843, + "grad_norm": 9.089051246643066, + "learning_rate": 5.549504370834814e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8503473341464997, + "num_tokens": 80640534.0, + "step": 67040 + }, + { + "entropy": 1.9743037328124047, + "epoch": 0.20784913345820813, + "grad_norm": 7.8871684074401855, + "learning_rate": 5.54909051594466e-06, + "loss": 0.5803, + "mean_token_accuracy": 0.8254914477467536, + "num_tokens": 80652287.0, + "step": 67050 + }, + { + "entropy": 1.9053780257701873, + "epoch": 0.20788013258325783, + "grad_norm": 9.635024070739746, + "learning_rate": 5.5486767536305175e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8367680624127388, + "num_tokens": 80664802.0, + "step": 67060 + }, + { + "entropy": 1.9582928448915482, + "epoch": 0.20791113170830752, + "grad_norm": 9.71713924407959, + "learning_rate": 5.548263083857879e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.8391944229602813, + "num_tokens": 80676644.0, + "step": 67070 + }, + { + "entropy": 1.914687879383564, + "epoch": 0.20794213083335722, + "grad_norm": 9.875877380371094, + "learning_rate": 5.547849506592251e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8417564824223518, + "num_tokens": 80688640.0, + "step": 67080 + }, + { + "entropy": 1.883899413049221, + "epoch": 0.20797312995840692, + "grad_norm": 4.134374618530273, + "learning_rate": 5.547436021799163e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8355391338467598, + "num_tokens": 80701658.0, + "step": 67090 + }, + { + "entropy": 1.892640021443367, + "epoch": 0.2080041290834566, + "grad_norm": 7.394375801086426, + "learning_rate": 5.54702262944416e-06, + "loss": 0.5235, + "mean_token_accuracy": 0.8335898667573929, + "num_tokens": 80713327.0, + "step": 67100 + }, + { + "entropy": 1.8861460164189339, + "epoch": 0.2080351282085063, + "grad_norm": 7.844824314117432, + "learning_rate": 5.546609329492804e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8451134487986565, + "num_tokens": 80724788.0, + "step": 67110 + }, + { + "entropy": 1.9601202994585036, + "epoch": 0.208066127333556, + "grad_norm": 9.103307723999023, + "learning_rate": 5.546196121910674e-06, + "loss": 0.5232, + "mean_token_accuracy": 0.8401788577437401, + "num_tokens": 80735373.0, + "step": 67120 + }, + { + "entropy": 1.9305698484182359, + "epoch": 0.2080971264586057, + "grad_norm": 10.515917778015137, + "learning_rate": 5.545783006663372e-06, + "loss": 0.506, + "mean_token_accuracy": 0.8503312930464745, + "num_tokens": 80745599.0, + "step": 67130 + }, + { + "entropy": 1.8946173369884491, + "epoch": 0.2081281255836554, + "grad_norm": 8.005970001220703, + "learning_rate": 5.545369983716514e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8377565070986748, + "num_tokens": 80757413.0, + "step": 67140 + }, + { + "entropy": 1.9489420622587204, + "epoch": 0.2081591247087051, + "grad_norm": 9.35682487487793, + "learning_rate": 5.544957053035733e-06, + "loss": 0.5294, + "mean_token_accuracy": 0.8386912405490875, + "num_tokens": 80767952.0, + "step": 67150 + }, + { + "entropy": 1.8574773401021958, + "epoch": 0.2081901238337548, + "grad_norm": 4.570268154144287, + "learning_rate": 5.5445442145866835e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8455320358276367, + "num_tokens": 80780046.0, + "step": 67160 + }, + { + "entropy": 1.871639384329319, + "epoch": 0.2082211229588045, + "grad_norm": 7.216861724853516, + "learning_rate": 5.544131468335036e-06, + "loss": 0.453, + "mean_token_accuracy": 0.846157830953598, + "num_tokens": 80792621.0, + "step": 67170 + }, + { + "entropy": 1.84230377972126, + "epoch": 0.20825212208385419, + "grad_norm": 3.1845858097076416, + "learning_rate": 5.543718814246477e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8477763459086418, + "num_tokens": 80805344.0, + "step": 67180 + }, + { + "entropy": 1.8911578252911567, + "epoch": 0.20828312120890388, + "grad_norm": 10.065200805664062, + "learning_rate": 5.543306252286714e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.8389113351702691, + "num_tokens": 80817758.0, + "step": 67190 + }, + { + "entropy": 1.9382609099149704, + "epoch": 0.20831412033395358, + "grad_norm": 9.62885570526123, + "learning_rate": 5.542893782421471e-06, + "loss": 0.5609, + "mean_token_accuracy": 0.8365870863199234, + "num_tokens": 80828548.0, + "step": 67200 + }, + { + "entropy": 1.9892377644777297, + "epoch": 0.20834511945900328, + "grad_norm": 7.691858768463135, + "learning_rate": 5.54248140461649e-06, + "loss": 0.5937, + "mean_token_accuracy": 0.8110725492238998, + "num_tokens": 80840477.0, + "step": 67210 + }, + { + "entropy": 1.9411058470606803, + "epoch": 0.20837611858405297, + "grad_norm": 12.743127822875977, + "learning_rate": 5.54206911883753e-06, + "loss": 0.5645, + "mean_token_accuracy": 0.8270158648490906, + "num_tokens": 80851801.0, + "step": 67220 + }, + { + "entropy": 1.9056400299072265, + "epoch": 0.20840711770910267, + "grad_norm": 8.924535751342773, + "learning_rate": 5.541656925050371e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8432095557451248, + "num_tokens": 80864251.0, + "step": 67230 + }, + { + "entropy": 1.8577035881578923, + "epoch": 0.20843811683415236, + "grad_norm": 6.489585876464844, + "learning_rate": 5.541244823220805e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8534018874168396, + "num_tokens": 80877496.0, + "step": 67240 + }, + { + "entropy": 1.7855018101632596, + "epoch": 0.20846911595920206, + "grad_norm": 2.9368503093719482, + "learning_rate": 5.540832813314648e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8677582457661629, + "num_tokens": 80891030.0, + "step": 67250 + }, + { + "entropy": 1.9163383916020393, + "epoch": 0.20850011508425176, + "grad_norm": 8.91242504119873, + "learning_rate": 5.54042089529773e-06, + "loss": 0.5228, + "mean_token_accuracy": 0.8421859487891197, + "num_tokens": 80902336.0, + "step": 67260 + }, + { + "entropy": 1.9107780367136002, + "epoch": 0.20853111420930145, + "grad_norm": 8.577934265136719, + "learning_rate": 5.5400090691359e-06, + "loss": 0.499, + "mean_token_accuracy": 0.837343692779541, + "num_tokens": 80912858.0, + "step": 67270 + }, + { + "entropy": 1.868037761747837, + "epoch": 0.20856211333435115, + "grad_norm": 9.165842056274414, + "learning_rate": 5.539597334795024e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8567280262708664, + "num_tokens": 80924871.0, + "step": 67280 + }, + { + "entropy": 1.897133542597294, + "epoch": 0.20859311245940082, + "grad_norm": 3.6761436462402344, + "learning_rate": 5.539185692240987e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8450244545936585, + "num_tokens": 80936908.0, + "step": 67290 + }, + { + "entropy": 1.852649959921837, + "epoch": 0.20862411158445052, + "grad_norm": 5.1975483894348145, + "learning_rate": 5.538774141439691e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8447863146662712, + "num_tokens": 80949637.0, + "step": 67300 + }, + { + "entropy": 1.9235988169908524, + "epoch": 0.2086551107095002, + "grad_norm": 9.723345756530762, + "learning_rate": 5.538362682357055e-06, + "loss": 0.5537, + "mean_token_accuracy": 0.8258920639753342, + "num_tokens": 80960775.0, + "step": 67310 + }, + { + "entropy": 1.9941146105527878, + "epoch": 0.2086861098345499, + "grad_norm": 7.907135963439941, + "learning_rate": 5.537951314959018e-06, + "loss": 0.5285, + "mean_token_accuracy": 0.8398841202259064, + "num_tokens": 80971994.0, + "step": 67320 + }, + { + "entropy": 1.8793514996767045, + "epoch": 0.2087171089595996, + "grad_norm": 8.207256317138672, + "learning_rate": 5.537540039211534e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8463095262646675, + "num_tokens": 80984309.0, + "step": 67330 + }, + { + "entropy": 1.798450830578804, + "epoch": 0.2087481080846493, + "grad_norm": 2.7568373680114746, + "learning_rate": 5.537128855080577e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8558945968747139, + "num_tokens": 80997276.0, + "step": 67340 + }, + { + "entropy": 1.8368099942803382, + "epoch": 0.208779107209699, + "grad_norm": 8.600449562072754, + "learning_rate": 5.5367177625321355e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8459367111325264, + "num_tokens": 81010351.0, + "step": 67350 + }, + { + "entropy": 1.7894717290997506, + "epoch": 0.2088101063347487, + "grad_norm": 2.465794086456299, + "learning_rate": 5.5363067615322206e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8549363747239113, + "num_tokens": 81024645.0, + "step": 67360 + }, + { + "entropy": 1.80710818618536, + "epoch": 0.2088411054597984, + "grad_norm": 5.430171012878418, + "learning_rate": 5.535895852046857e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8437926679849624, + "num_tokens": 81038048.0, + "step": 67370 + }, + { + "entropy": 1.9083049342036247, + "epoch": 0.2088721045848481, + "grad_norm": 9.347772598266602, + "learning_rate": 5.535485034042086e-06, + "loss": 0.5258, + "mean_token_accuracy": 0.833581855893135, + "num_tokens": 81050463.0, + "step": 67380 + }, + { + "entropy": 1.9086160019040108, + "epoch": 0.20890310370989779, + "grad_norm": 8.73170280456543, + "learning_rate": 5.535074307483974e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8540397524833679, + "num_tokens": 81062467.0, + "step": 67390 + }, + { + "entropy": 1.8362308278679849, + "epoch": 0.20893410283494748, + "grad_norm": 6.898866176605225, + "learning_rate": 5.534663672338595e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8507218867540359, + "num_tokens": 81075477.0, + "step": 67400 + }, + { + "entropy": 1.964953315258026, + "epoch": 0.20896510195999718, + "grad_norm": 10.953664779663086, + "learning_rate": 5.534253128572048e-06, + "loss": 0.515, + "mean_token_accuracy": 0.845398873090744, + "num_tokens": 81086245.0, + "step": 67410 + }, + { + "entropy": 1.8964574694633485, + "epoch": 0.20899610108504688, + "grad_norm": 10.055764198303223, + "learning_rate": 5.533842676150446e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8452073886990548, + "num_tokens": 81098248.0, + "step": 67420 + }, + { + "entropy": 1.9074363961815834, + "epoch": 0.20902710021009657, + "grad_norm": 8.339896202087402, + "learning_rate": 5.533432315039921e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8502192839980125, + "num_tokens": 81109121.0, + "step": 67430 + }, + { + "entropy": 1.9613432750105857, + "epoch": 0.20905809933514627, + "grad_norm": 10.081904411315918, + "learning_rate": 5.533022045206623e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8488780468702316, + "num_tokens": 81120269.0, + "step": 67440 + }, + { + "entropy": 1.951788181066513, + "epoch": 0.20908909846019597, + "grad_norm": 8.335789680480957, + "learning_rate": 5.532611866616719e-06, + "loss": 0.5574, + "mean_token_accuracy": 0.8319823101162911, + "num_tokens": 81131197.0, + "step": 67450 + }, + { + "entropy": 1.8895439878106117, + "epoch": 0.20912009758524566, + "grad_norm": 9.064699172973633, + "learning_rate": 5.53220177923639e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8434992015361786, + "num_tokens": 81142524.0, + "step": 67460 + }, + { + "entropy": 1.8257933855056763, + "epoch": 0.20915109671029536, + "grad_norm": 8.583352088928223, + "learning_rate": 5.531791783031842e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.850779265165329, + "num_tokens": 81154674.0, + "step": 67470 + }, + { + "entropy": 1.7744839206337928, + "epoch": 0.20918209583534506, + "grad_norm": 7.863668918609619, + "learning_rate": 5.5313818779692915e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8497899547219276, + "num_tokens": 81168137.0, + "step": 67480 + }, + { + "entropy": 1.9520211279392243, + "epoch": 0.20921309496039475, + "grad_norm": 12.411447525024414, + "learning_rate": 5.5309720640149785e-06, + "loss": 0.5196, + "mean_token_accuracy": 0.8483351454138756, + "num_tokens": 81178936.0, + "step": 67490 + }, + { + "entropy": 1.8984515577554704, + "epoch": 0.20924409408544445, + "grad_norm": 7.144548416137695, + "learning_rate": 5.530562341135155e-06, + "loss": 0.5423, + "mean_token_accuracy": 0.837235240638256, + "num_tokens": 81191279.0, + "step": 67500 + }, + { + "entropy": 1.8845224693417548, + "epoch": 0.20927509321049415, + "grad_norm": 9.875595092773438, + "learning_rate": 5.5301527092960925e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8476980268955231, + "num_tokens": 81203433.0, + "step": 67510 + }, + { + "entropy": 1.9049016535282135, + "epoch": 0.20930609233554384, + "grad_norm": 7.915565490722656, + "learning_rate": 5.529743168464083e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8503067553043365, + "num_tokens": 81215534.0, + "step": 67520 + }, + { + "entropy": 1.8581149563193322, + "epoch": 0.20933709146059354, + "grad_norm": 3.4274792671203613, + "learning_rate": 5.5293337186054315e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8372601106762886, + "num_tokens": 81227450.0, + "step": 67530 + }, + { + "entropy": 1.8535752549767495, + "epoch": 0.2093680905856432, + "grad_norm": 10.018072128295898, + "learning_rate": 5.528924359686464e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.845684327185154, + "num_tokens": 81239765.0, + "step": 67540 + }, + { + "entropy": 1.8562732204794883, + "epoch": 0.2093990897106929, + "grad_norm": 7.851308822631836, + "learning_rate": 5.528515091673519e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8401390001177788, + "num_tokens": 81252593.0, + "step": 67550 + }, + { + "entropy": 1.9241983875632287, + "epoch": 0.2094300888357426, + "grad_norm": 9.050371170043945, + "learning_rate": 5.5281059145329605e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8380835622549057, + "num_tokens": 81264640.0, + "step": 67560 + }, + { + "entropy": 1.9892011910676957, + "epoch": 0.2094610879607923, + "grad_norm": 7.706739902496338, + "learning_rate": 5.527696828231161e-06, + "loss": 0.5774, + "mean_token_accuracy": 0.8230189695954323, + "num_tokens": 81275373.0, + "step": 67570 + }, + { + "entropy": 1.9130545258522034, + "epoch": 0.209492087085842, + "grad_norm": 8.775412559509277, + "learning_rate": 5.527287832734517e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8499484866857528, + "num_tokens": 81287424.0, + "step": 67580 + }, + { + "entropy": 1.9063115805387496, + "epoch": 0.2095230862108917, + "grad_norm": 9.373629570007324, + "learning_rate": 5.526878928009438e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8434836745262146, + "num_tokens": 81300644.0, + "step": 67590 + }, + { + "entropy": 2.0209019780158997, + "epoch": 0.2095540853359414, + "grad_norm": 8.37852954864502, + "learning_rate": 5.526470114022357e-06, + "loss": 0.5427, + "mean_token_accuracy": 0.830120287835598, + "num_tokens": 81311779.0, + "step": 67600 + }, + { + "entropy": 1.944344201683998, + "epoch": 0.20958508446099108, + "grad_norm": 8.276811599731445, + "learning_rate": 5.526061390739714e-06, + "loss": 0.5441, + "mean_token_accuracy": 0.8304149687290192, + "num_tokens": 81323352.0, + "step": 67610 + }, + { + "entropy": 1.9719750136137009, + "epoch": 0.20961608358604078, + "grad_norm": 7.648212909698486, + "learning_rate": 5.5256527581279785e-06, + "loss": 0.5287, + "mean_token_accuracy": 0.8441421076655388, + "num_tokens": 81335024.0, + "step": 67620 + }, + { + "entropy": 1.9229772925376891, + "epoch": 0.20964708271109048, + "grad_norm": 4.550539970397949, + "learning_rate": 5.5252442161536276e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8464221879839897, + "num_tokens": 81346392.0, + "step": 67630 + }, + { + "entropy": 1.9195913657546044, + "epoch": 0.20967808183614017, + "grad_norm": 5.1315999031066895, + "learning_rate": 5.524835764783162e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8396693095564842, + "num_tokens": 81358537.0, + "step": 67640 + }, + { + "entropy": 1.7798305720090866, + "epoch": 0.20970908096118987, + "grad_norm": 8.729194641113281, + "learning_rate": 5.524427403983096e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8598864421248436, + "num_tokens": 81372176.0, + "step": 67650 + }, + { + "entropy": 1.8792505249381066, + "epoch": 0.20974008008623957, + "grad_norm": 2.838498592376709, + "learning_rate": 5.524019133719963e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8509874641895294, + "num_tokens": 81384748.0, + "step": 67660 + }, + { + "entropy": 1.9109538584947585, + "epoch": 0.20977107921128926, + "grad_norm": 9.723373413085938, + "learning_rate": 5.5236109539603145e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8392190888524056, + "num_tokens": 81396571.0, + "step": 67670 + }, + { + "entropy": 1.9938452377915383, + "epoch": 0.20980207833633896, + "grad_norm": 9.84363079071045, + "learning_rate": 5.523202864670717e-06, + "loss": 0.5627, + "mean_token_accuracy": 0.8240502581000329, + "num_tokens": 81408124.0, + "step": 67680 + }, + { + "entropy": 1.944431021809578, + "epoch": 0.20983307746138866, + "grad_norm": 7.82313871383667, + "learning_rate": 5.522794865817755e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8497413724660874, + "num_tokens": 81418940.0, + "step": 67690 + }, + { + "entropy": 1.866161908209324, + "epoch": 0.20986407658643835, + "grad_norm": 4.241913318634033, + "learning_rate": 5.52238695736803e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.838020247220993, + "num_tokens": 81431920.0, + "step": 67700 + }, + { + "entropy": 1.9558416873216629, + "epoch": 0.20989507571148805, + "grad_norm": 6.742011547088623, + "learning_rate": 5.521979139288163e-06, + "loss": 0.5383, + "mean_token_accuracy": 0.8325184598565102, + "num_tokens": 81443316.0, + "step": 67710 + }, + { + "entropy": 1.9829859271645547, + "epoch": 0.20992607483653775, + "grad_norm": 7.284268856048584, + "learning_rate": 5.521571411544792e-06, + "loss": 0.5344, + "mean_token_accuracy": 0.8363759860396385, + "num_tokens": 81454728.0, + "step": 67720 + }, + { + "entropy": 1.9214450731873511, + "epoch": 0.20995707396158744, + "grad_norm": 8.096468925476074, + "learning_rate": 5.521163774104567e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.83761525452137, + "num_tokens": 81467284.0, + "step": 67730 + }, + { + "entropy": 1.8891347169876098, + "epoch": 0.20998807308663714, + "grad_norm": 8.036181449890137, + "learning_rate": 5.520756226934162e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8447176307439804, + "num_tokens": 81480629.0, + "step": 67740 + }, + { + "entropy": 1.8999253660440445, + "epoch": 0.21001907221168684, + "grad_norm": 10.522799491882324, + "learning_rate": 5.520348770000264e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.8358193069696427, + "num_tokens": 81492287.0, + "step": 67750 + }, + { + "entropy": 1.8602621525526046, + "epoch": 0.21005007133673653, + "grad_norm": 3.4755728244781494, + "learning_rate": 5.519941403269581e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8537157669663429, + "num_tokens": 81505125.0, + "step": 67760 + }, + { + "entropy": 1.9202613174915313, + "epoch": 0.21008107046178623, + "grad_norm": 8.444540977478027, + "learning_rate": 5.519534126708833e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.842594002187252, + "num_tokens": 81517173.0, + "step": 67770 + }, + { + "entropy": 1.8747940637171268, + "epoch": 0.2101120695868359, + "grad_norm": 2.7482457160949707, + "learning_rate": 5.519126940284762e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.847461374104023, + "num_tokens": 81530328.0, + "step": 67780 + }, + { + "entropy": 1.904065564274788, + "epoch": 0.2101430687118856, + "grad_norm": 4.310403347015381, + "learning_rate": 5.518719843964123e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8386267200112343, + "num_tokens": 81542313.0, + "step": 67790 + }, + { + "entropy": 1.9362899020314217, + "epoch": 0.2101740678369353, + "grad_norm": 8.252490043640137, + "learning_rate": 5.518312837713692e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8409799665212632, + "num_tokens": 81554270.0, + "step": 67800 + }, + { + "entropy": 1.9585260882973672, + "epoch": 0.210205066961985, + "grad_norm": 9.089271545410156, + "learning_rate": 5.517905921500259e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8473243802785874, + "num_tokens": 81565971.0, + "step": 67810 + }, + { + "entropy": 1.9076432511210442, + "epoch": 0.21023606608703468, + "grad_norm": 8.114571571350098, + "learning_rate": 5.517499095290636e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8429806470870972, + "num_tokens": 81578195.0, + "step": 67820 + }, + { + "entropy": 1.876829606294632, + "epoch": 0.21026706521208438, + "grad_norm": 10.027071952819824, + "learning_rate": 5.5170923590516444e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.8376459792256356, + "num_tokens": 81591671.0, + "step": 67830 + }, + { + "entropy": 1.9631253123283385, + "epoch": 0.21029806433713408, + "grad_norm": 6.608702659606934, + "learning_rate": 5.51668571275013e-06, + "loss": 0.5715, + "mean_token_accuracy": 0.8352919310331345, + "num_tokens": 81603027.0, + "step": 67840 + }, + { + "entropy": 1.9352620527148248, + "epoch": 0.21032906346218377, + "grad_norm": 3.764373302459717, + "learning_rate": 5.51627915635295e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8549694180488586, + "num_tokens": 81615161.0, + "step": 67850 + }, + { + "entropy": 1.9944947317242623, + "epoch": 0.21036006258723347, + "grad_norm": 9.673376083374023, + "learning_rate": 5.515872689826984e-06, + "loss": 0.5357, + "mean_token_accuracy": 0.8339143067598342, + "num_tokens": 81627143.0, + "step": 67860 + }, + { + "entropy": 2.035477635264397, + "epoch": 0.21039106171228317, + "grad_norm": 9.949082374572754, + "learning_rate": 5.515466313139126e-06, + "loss": 0.5937, + "mean_token_accuracy": 0.8266064122319221, + "num_tokens": 81637833.0, + "step": 67870 + }, + { + "entropy": 1.9482318013906479, + "epoch": 0.21042206083733286, + "grad_norm": 8.834207534790039, + "learning_rate": 5.5150600262562855e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8446396380662918, + "num_tokens": 81649773.0, + "step": 67880 + }, + { + "entropy": 2.023282551765442, + "epoch": 0.21045305996238256, + "grad_norm": 8.027750015258789, + "learning_rate": 5.514653829145392e-06, + "loss": 0.5464, + "mean_token_accuracy": 0.8356869459152222, + "num_tokens": 81660767.0, + "step": 67890 + }, + { + "entropy": 1.8931564077734948, + "epoch": 0.21048405908743226, + "grad_norm": 4.419806003570557, + "learning_rate": 5.5142477217733905e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8490284129977226, + "num_tokens": 81673309.0, + "step": 67900 + }, + { + "entropy": 1.9538974300026895, + "epoch": 0.21051505821248195, + "grad_norm": 9.901854515075684, + "learning_rate": 5.513841704107242e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8569442689418793, + "num_tokens": 81684556.0, + "step": 67910 + }, + { + "entropy": 1.9160713866353034, + "epoch": 0.21054605733753165, + "grad_norm": 7.857386112213135, + "learning_rate": 5.513435776113929e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8500816449522972, + "num_tokens": 81696905.0, + "step": 67920 + }, + { + "entropy": 1.864570914208889, + "epoch": 0.21057705646258135, + "grad_norm": 9.086345672607422, + "learning_rate": 5.513029937760446e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8444380149245262, + "num_tokens": 81709959.0, + "step": 67930 + }, + { + "entropy": 2.0148605525493624, + "epoch": 0.21060805558763104, + "grad_norm": 8.19638442993164, + "learning_rate": 5.512624189013806e-06, + "loss": 0.549, + "mean_token_accuracy": 0.8306930497288704, + "num_tokens": 81721588.0, + "step": 67940 + }, + { + "entropy": 1.9208003774285316, + "epoch": 0.21063905471268074, + "grad_norm": 8.312312126159668, + "learning_rate": 5.512218529841038e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8352445542812348, + "num_tokens": 81733933.0, + "step": 67950 + }, + { + "entropy": 1.8529336631298066, + "epoch": 0.21067005383773044, + "grad_norm": 3.6399710178375244, + "learning_rate": 5.511812960209193e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8411158919334412, + "num_tokens": 81746778.0, + "step": 67960 + }, + { + "entropy": 1.9720004379749299, + "epoch": 0.21070105296278013, + "grad_norm": 8.460240364074707, + "learning_rate": 5.511407480085334e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8386137381196022, + "num_tokens": 81758395.0, + "step": 67970 + }, + { + "entropy": 2.007098397612572, + "epoch": 0.21073205208782983, + "grad_norm": 9.23631763458252, + "learning_rate": 5.51100208943654e-06, + "loss": 0.5553, + "mean_token_accuracy": 0.835183584690094, + "num_tokens": 81769530.0, + "step": 67980 + }, + { + "entropy": 1.8748371377587318, + "epoch": 0.21076305121287953, + "grad_norm": 8.400230407714844, + "learning_rate": 5.510596788229912e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8293806165456772, + "num_tokens": 81782455.0, + "step": 67990 + }, + { + "entropy": 1.9910854011774064, + "epoch": 0.21079405033792922, + "grad_norm": 8.415918350219727, + "learning_rate": 5.510191576432563e-06, + "loss": 0.5531, + "mean_token_accuracy": 0.8300189360976219, + "num_tokens": 81793212.0, + "step": 68000 + }, + { + "entropy": 1.8886552080512047, + "epoch": 0.21082504946297892, + "grad_norm": 7.983949661254883, + "learning_rate": 5.509786454011627e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8474817335605621, + "num_tokens": 81804680.0, + "step": 68010 + }, + { + "entropy": 1.929812017083168, + "epoch": 0.21085604858802862, + "grad_norm": 7.43207311630249, + "learning_rate": 5.509381420934252e-06, + "loss": 0.5576, + "mean_token_accuracy": 0.8385087087750435, + "num_tokens": 81816705.0, + "step": 68020 + }, + { + "entropy": 1.9719702288508416, + "epoch": 0.21088704771307829, + "grad_norm": 7.93623685836792, + "learning_rate": 5.5089764771676035e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8436658188700676, + "num_tokens": 81828323.0, + "step": 68030 + }, + { + "entropy": 1.9078006014227866, + "epoch": 0.21091804683812798, + "grad_norm": 10.205698013305664, + "learning_rate": 5.508571622678865e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8465578466653824, + "num_tokens": 81840696.0, + "step": 68040 + }, + { + "entropy": 1.9035088881850242, + "epoch": 0.21094904596317768, + "grad_norm": 8.092962265014648, + "learning_rate": 5.5081668574352364e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8459865510463714, + "num_tokens": 81853045.0, + "step": 68050 + }, + { + "entropy": 2.0051578521728515, + "epoch": 0.21098004508822737, + "grad_norm": 7.893548488616943, + "learning_rate": 5.507762181403934e-06, + "loss": 0.5796, + "mean_token_accuracy": 0.8307501718401908, + "num_tokens": 81863610.0, + "step": 68060 + }, + { + "entropy": 1.862001748383045, + "epoch": 0.21101104421327707, + "grad_norm": 9.222723960876465, + "learning_rate": 5.507357594552191e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8515830934047699, + "num_tokens": 81876722.0, + "step": 68070 + }, + { + "entropy": 1.9945030003786086, + "epoch": 0.21104204333832677, + "grad_norm": 9.973723411560059, + "learning_rate": 5.5069530968472575e-06, + "loss": 0.5548, + "mean_token_accuracy": 0.830809174478054, + "num_tokens": 81887391.0, + "step": 68080 + }, + { + "entropy": 1.9193598270416259, + "epoch": 0.21107304246337646, + "grad_norm": 8.859430313110352, + "learning_rate": 5.506548688256401e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.851340101659298, + "num_tokens": 81899366.0, + "step": 68090 + }, + { + "entropy": 1.9339387387037277, + "epoch": 0.21110404158842616, + "grad_norm": 4.928318500518799, + "learning_rate": 5.506144368746905e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.8325812682509423, + "num_tokens": 81911145.0, + "step": 68100 + }, + { + "entropy": 1.9344223082065581, + "epoch": 0.21113504071347586, + "grad_norm": 2.5606257915496826, + "learning_rate": 5.505740138286071e-06, + "loss": 0.5272, + "mean_token_accuracy": 0.8293151870369911, + "num_tokens": 81923550.0, + "step": 68110 + }, + { + "entropy": 1.9258287683129311, + "epoch": 0.21116603983852555, + "grad_norm": 7.782974720001221, + "learning_rate": 5.505335996841215e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8488619342446327, + "num_tokens": 81935832.0, + "step": 68120 + }, + { + "entropy": 1.8819763243198395, + "epoch": 0.21119703896357525, + "grad_norm": 2.672853708267212, + "learning_rate": 5.504931944379673e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8511589944362641, + "num_tokens": 81949301.0, + "step": 68130 + }, + { + "entropy": 2.001712107658386, + "epoch": 0.21122803808862495, + "grad_norm": 9.238136291503906, + "learning_rate": 5.504527980868795e-06, + "loss": 0.5271, + "mean_token_accuracy": 0.8439046397805214, + "num_tokens": 81960491.0, + "step": 68140 + }, + { + "entropy": 1.9117625072598456, + "epoch": 0.21125903721367464, + "grad_norm": 8.495841026306152, + "learning_rate": 5.504124106275948e-06, + "loss": 0.5269, + "mean_token_accuracy": 0.8380966350436211, + "num_tokens": 81972833.0, + "step": 68150 + }, + { + "entropy": 1.9697169050574304, + "epoch": 0.21129003633872434, + "grad_norm": 9.759078025817871, + "learning_rate": 5.5037203205685196e-06, + "loss": 0.5163, + "mean_token_accuracy": 0.8402854591608048, + "num_tokens": 81984232.0, + "step": 68160 + }, + { + "entropy": 1.9297288402915, + "epoch": 0.21132103546377404, + "grad_norm": 4.200081825256348, + "learning_rate": 5.503316623713908e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8306440845131874, + "num_tokens": 81995830.0, + "step": 68170 + }, + { + "entropy": 1.8650834277272224, + "epoch": 0.21135203458882373, + "grad_norm": 7.745209693908691, + "learning_rate": 5.502913015679533e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8491313457489014, + "num_tokens": 82008180.0, + "step": 68180 + }, + { + "entropy": 1.9364105448126794, + "epoch": 0.21138303371387343, + "grad_norm": 9.66666030883789, + "learning_rate": 5.502509496432829e-06, + "loss": 0.5228, + "mean_token_accuracy": 0.8386658191680908, + "num_tokens": 82019778.0, + "step": 68190 + }, + { + "entropy": 1.9314407289028168, + "epoch": 0.21141403283892313, + "grad_norm": 9.512344360351562, + "learning_rate": 5.502106065941247e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.8394078284502029, + "num_tokens": 82031972.0, + "step": 68200 + }, + { + "entropy": 1.9816076889634133, + "epoch": 0.21144503196397282, + "grad_norm": 7.756664276123047, + "learning_rate": 5.501702724172256e-06, + "loss": 0.5672, + "mean_token_accuracy": 0.8252545759081841, + "num_tokens": 82043777.0, + "step": 68210 + }, + { + "entropy": 1.9324266403913497, + "epoch": 0.21147603108902252, + "grad_norm": 4.227046966552734, + "learning_rate": 5.501299471093341e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8418770223855973, + "num_tokens": 82056125.0, + "step": 68220 + }, + { + "entropy": 1.8676659151911736, + "epoch": 0.21150703021407222, + "grad_norm": 9.944682121276855, + "learning_rate": 5.500896306672003e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8422658532857895, + "num_tokens": 82068630.0, + "step": 68230 + }, + { + "entropy": 1.9347829192876815, + "epoch": 0.2115380293391219, + "grad_norm": 8.809101104736328, + "learning_rate": 5.500493230875762e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8409867346286773, + "num_tokens": 82080910.0, + "step": 68240 + }, + { + "entropy": 1.9162681803107262, + "epoch": 0.2115690284641716, + "grad_norm": 8.236538887023926, + "learning_rate": 5.500090243672151e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8516815707087517, + "num_tokens": 82092398.0, + "step": 68250 + }, + { + "entropy": 1.9335265710949898, + "epoch": 0.2116000275892213, + "grad_norm": 8.672457695007324, + "learning_rate": 5.4996873450287205e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8431479528546333, + "num_tokens": 82104602.0, + "step": 68260 + }, + { + "entropy": 1.8349942237138748, + "epoch": 0.211631026714271, + "grad_norm": 7.4427924156188965, + "learning_rate": 5.4992845349130406e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.843446071445942, + "num_tokens": 82117625.0, + "step": 68270 + }, + { + "entropy": 1.9474259793758393, + "epoch": 0.21166202583932067, + "grad_norm": 4.666007041931152, + "learning_rate": 5.498881813292697e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8499509572982789, + "num_tokens": 82129574.0, + "step": 68280 + }, + { + "entropy": 1.8501393646001816, + "epoch": 0.21169302496437037, + "grad_norm": 8.344110488891602, + "learning_rate": 5.498479180135289e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8498427078127861, + "num_tokens": 82143041.0, + "step": 68290 + }, + { + "entropy": 1.8913217276334762, + "epoch": 0.21172402408942007, + "grad_norm": 11.081100463867188, + "learning_rate": 5.498076635408436e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8393001139163971, + "num_tokens": 82155572.0, + "step": 68300 + }, + { + "entropy": 1.8870702683925629, + "epoch": 0.21175502321446976, + "grad_norm": 3.9212942123413086, + "learning_rate": 5.497674179079771e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8416898652911187, + "num_tokens": 82167720.0, + "step": 68310 + }, + { + "entropy": 1.8655384734272957, + "epoch": 0.21178602233951946, + "grad_norm": 9.342215538024902, + "learning_rate": 5.497271811116948e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8492528066039086, + "num_tokens": 82180629.0, + "step": 68320 + }, + { + "entropy": 1.8181511342525483, + "epoch": 0.21181702146456916, + "grad_norm": 7.617997169494629, + "learning_rate": 5.496869531487634e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8530010193586349, + "num_tokens": 82194518.0, + "step": 68330 + }, + { + "entropy": 1.939797979593277, + "epoch": 0.21184802058961885, + "grad_norm": 6.760346412658691, + "learning_rate": 5.496467340159511e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8504227936267853, + "num_tokens": 82205732.0, + "step": 68340 + }, + { + "entropy": 1.9736777380108834, + "epoch": 0.21187901971466855, + "grad_norm": 10.192756652832031, + "learning_rate": 5.496065237100283e-06, + "loss": 0.5461, + "mean_token_accuracy": 0.8265336826443672, + "num_tokens": 82217338.0, + "step": 68350 + }, + { + "entropy": 1.9774869233369827, + "epoch": 0.21191001883971824, + "grad_norm": 8.361988067626953, + "learning_rate": 5.495663222277665e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.8427802279591561, + "num_tokens": 82228528.0, + "step": 68360 + }, + { + "entropy": 1.941089576482773, + "epoch": 0.21194101796476794, + "grad_norm": 6.703628063201904, + "learning_rate": 5.495261295659393e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8476432636380196, + "num_tokens": 82239902.0, + "step": 68370 + }, + { + "entropy": 1.9076963618397713, + "epoch": 0.21197201708981764, + "grad_norm": 7.75342321395874, + "learning_rate": 5.494859457213216e-06, + "loss": 0.5217, + "mean_token_accuracy": 0.8372241660952568, + "num_tokens": 82251693.0, + "step": 68380 + }, + { + "entropy": 1.877877102792263, + "epoch": 0.21200301621486733, + "grad_norm": 4.743284225463867, + "learning_rate": 5.494457706906901e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.854452121257782, + "num_tokens": 82264357.0, + "step": 68390 + }, + { + "entropy": 1.927871122956276, + "epoch": 0.21203401533991703, + "grad_norm": 8.312472343444824, + "learning_rate": 5.494056044708233e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.8458049356937408, + "num_tokens": 82275361.0, + "step": 68400 + }, + { + "entropy": 1.8819461211562156, + "epoch": 0.21206501446496673, + "grad_norm": 8.821391105651855, + "learning_rate": 5.493654470585011e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8411281630396843, + "num_tokens": 82287079.0, + "step": 68410 + }, + { + "entropy": 1.9309892505407333, + "epoch": 0.21209601359001642, + "grad_norm": 7.936349391937256, + "learning_rate": 5.4932529845050494e-06, + "loss": 0.5459, + "mean_token_accuracy": 0.833111310005188, + "num_tokens": 82298592.0, + "step": 68420 + }, + { + "entropy": 1.9115046963095665, + "epoch": 0.21212701271506612, + "grad_norm": 7.75383996963501, + "learning_rate": 5.492851586436185e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8484810963273048, + "num_tokens": 82310414.0, + "step": 68430 + }, + { + "entropy": 1.8966820895671845, + "epoch": 0.21215801184011582, + "grad_norm": 9.623336791992188, + "learning_rate": 5.492450276346264e-06, + "loss": 0.5298, + "mean_token_accuracy": 0.824579867720604, + "num_tokens": 82323527.0, + "step": 68440 + }, + { + "entropy": 1.929227152466774, + "epoch": 0.21218901096516551, + "grad_norm": 8.437525749206543, + "learning_rate": 5.4920490542031545e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8451452344655991, + "num_tokens": 82335454.0, + "step": 68450 + }, + { + "entropy": 1.8906142652034759, + "epoch": 0.2122200100902152, + "grad_norm": 8.768442153930664, + "learning_rate": 5.4916479199747375e-06, + "loss": 0.5266, + "mean_token_accuracy": 0.8401165828108788, + "num_tokens": 82347531.0, + "step": 68460 + }, + { + "entropy": 1.9358887538313865, + "epoch": 0.2122510092152649, + "grad_norm": 8.354040145874023, + "learning_rate": 5.491246873628911e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.8335384339094162, + "num_tokens": 82358918.0, + "step": 68470 + }, + { + "entropy": 1.722612802684307, + "epoch": 0.2122820083403146, + "grad_norm": 4.17601203918457, + "learning_rate": 5.490845915133592e-06, + "loss": 0.418, + "mean_token_accuracy": 0.852487288415432, + "num_tokens": 82373282.0, + "step": 68480 + }, + { + "entropy": 1.9600424468517303, + "epoch": 0.2123130074653643, + "grad_norm": 8.103271484375, + "learning_rate": 5.49044504445671e-06, + "loss": 0.5425, + "mean_token_accuracy": 0.8350730538368225, + "num_tokens": 82384254.0, + "step": 68490 + }, + { + "entropy": 1.9285910725593567, + "epoch": 0.212344006590414, + "grad_norm": 7.515426158905029, + "learning_rate": 5.490044261566214e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.8370998069643975, + "num_tokens": 82395748.0, + "step": 68500 + }, + { + "entropy": 1.7850094467401505, + "epoch": 0.2123750057154637, + "grad_norm": 2.562181234359741, + "learning_rate": 5.489643566430068e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8605840176343917, + "num_tokens": 82408632.0, + "step": 68510 + }, + { + "entropy": 1.8428252264857292, + "epoch": 0.21240600484051336, + "grad_norm": 10.207568168640137, + "learning_rate": 5.489242959016253e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8491317898035049, + "num_tokens": 82421317.0, + "step": 68520 + }, + { + "entropy": 1.9188319817185402, + "epoch": 0.21243700396556306, + "grad_norm": 8.114725112915039, + "learning_rate": 5.488842439292764e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8487970292568207, + "num_tokens": 82432817.0, + "step": 68530 + }, + { + "entropy": 1.8276264518499374, + "epoch": 0.21246800309061276, + "grad_norm": 3.9016194343566895, + "learning_rate": 5.4884420072276175e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8536558151245117, + "num_tokens": 82445275.0, + "step": 68540 + }, + { + "entropy": 1.8365425869822503, + "epoch": 0.21249900221566245, + "grad_norm": 10.570717811584473, + "learning_rate": 5.48804166278884e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8519185066223145, + "num_tokens": 82457785.0, + "step": 68550 + }, + { + "entropy": 1.9167245119810103, + "epoch": 0.21253000134071215, + "grad_norm": 6.744617938995361, + "learning_rate": 5.487641405944478e-06, + "loss": 0.5499, + "mean_token_accuracy": 0.8354932352900505, + "num_tokens": 82469125.0, + "step": 68560 + }, + { + "entropy": 1.908539716899395, + "epoch": 0.21256100046576185, + "grad_norm": 8.550407409667969, + "learning_rate": 5.487241236662596e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8480324164032936, + "num_tokens": 82481644.0, + "step": 68570 + }, + { + "entropy": 1.8927012383937836, + "epoch": 0.21259199959081154, + "grad_norm": 8.421663284301758, + "learning_rate": 5.486841154911271e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8398829996585846, + "num_tokens": 82493695.0, + "step": 68580 + }, + { + "entropy": 1.9059516102075578, + "epoch": 0.21262299871586124, + "grad_norm": 4.052833080291748, + "learning_rate": 5.486441160658598e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8456622689962388, + "num_tokens": 82505268.0, + "step": 68590 + }, + { + "entropy": 1.9685225948691367, + "epoch": 0.21265399784091094, + "grad_norm": 12.194074630737305, + "learning_rate": 5.486041253872687e-06, + "loss": 0.5623, + "mean_token_accuracy": 0.8278161734342575, + "num_tokens": 82516596.0, + "step": 68600 + }, + { + "entropy": 1.84447330981493, + "epoch": 0.21268499696596063, + "grad_norm": 8.463014602661133, + "learning_rate": 5.485641434521665e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8529325991868972, + "num_tokens": 82529400.0, + "step": 68610 + }, + { + "entropy": 1.8710281908512116, + "epoch": 0.21271599609101033, + "grad_norm": 7.990627765655518, + "learning_rate": 5.48524170257368e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8378486022353172, + "num_tokens": 82541818.0, + "step": 68620 + }, + { + "entropy": 1.8808648347854615, + "epoch": 0.21274699521606003, + "grad_norm": 4.200010776519775, + "learning_rate": 5.484842057996887e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8450862690806389, + "num_tokens": 82554516.0, + "step": 68630 + }, + { + "entropy": 1.8946150675415994, + "epoch": 0.21277799434110972, + "grad_norm": 7.040407657623291, + "learning_rate": 5.484442500759464e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8425398871302605, + "num_tokens": 82566421.0, + "step": 68640 + }, + { + "entropy": 1.9813665717840194, + "epoch": 0.21280899346615942, + "grad_norm": 7.405858516693115, + "learning_rate": 5.4840430308296035e-06, + "loss": 0.5383, + "mean_token_accuracy": 0.8323499038815498, + "num_tokens": 82578195.0, + "step": 68650 + }, + { + "entropy": 1.9478058412671089, + "epoch": 0.21283999259120912, + "grad_norm": 4.5492329597473145, + "learning_rate": 5.483643648175514e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.8383038908243179, + "num_tokens": 82590022.0, + "step": 68660 + }, + { + "entropy": 1.838891714811325, + "epoch": 0.2128709917162588, + "grad_norm": 7.715846061706543, + "learning_rate": 5.48324435276542e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8427498295903206, + "num_tokens": 82603321.0, + "step": 68670 + }, + { + "entropy": 1.8843644648790359, + "epoch": 0.2129019908413085, + "grad_norm": 9.188892364501953, + "learning_rate": 5.482845144567561e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8428171291947365, + "num_tokens": 82615976.0, + "step": 68680 + }, + { + "entropy": 1.9740795180201531, + "epoch": 0.2129329899663582, + "grad_norm": 4.1340765953063965, + "learning_rate": 5.482446023550199e-06, + "loss": 0.5321, + "mean_token_accuracy": 0.8349632441997528, + "num_tokens": 82627312.0, + "step": 68690 + }, + { + "entropy": 1.9550517603754998, + "epoch": 0.2129639890914079, + "grad_norm": 8.036059379577637, + "learning_rate": 5.482046989681602e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.8462382450699806, + "num_tokens": 82638817.0, + "step": 68700 + }, + { + "entropy": 1.8374310091137886, + "epoch": 0.2129949882164576, + "grad_norm": 9.189522743225098, + "learning_rate": 5.481648042930061e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.849276214838028, + "num_tokens": 82651779.0, + "step": 68710 + }, + { + "entropy": 1.9555922120809555, + "epoch": 0.2130259873415073, + "grad_norm": 8.850881576538086, + "learning_rate": 5.481249183263883e-06, + "loss": 0.5377, + "mean_token_accuracy": 0.8374005898833274, + "num_tokens": 82663394.0, + "step": 68720 + }, + { + "entropy": 1.9426356315612794, + "epoch": 0.213056986466557, + "grad_norm": 9.274955749511719, + "learning_rate": 5.480850410651389e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.8367856085300446, + "num_tokens": 82674791.0, + "step": 68730 + }, + { + "entropy": 1.815965899825096, + "epoch": 0.2130879855916067, + "grad_norm": 8.32228946685791, + "learning_rate": 5.4804517250609165e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8415889486670494, + "num_tokens": 82688146.0, + "step": 68740 + }, + { + "entropy": 1.8485229358077049, + "epoch": 0.21311898471665638, + "grad_norm": 4.761613845825195, + "learning_rate": 5.4800531264608205e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8439338058233261, + "num_tokens": 82701247.0, + "step": 68750 + }, + { + "entropy": 1.9277915149927138, + "epoch": 0.21314998384170608, + "grad_norm": 9.579967498779297, + "learning_rate": 5.47965461481947e-06, + "loss": 0.5294, + "mean_token_accuracy": 0.8259765520691872, + "num_tokens": 82713022.0, + "step": 68760 + }, + { + "entropy": 1.950098218023777, + "epoch": 0.21318098296675575, + "grad_norm": 10.044625282287598, + "learning_rate": 5.47925619010525e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.8383822947740555, + "num_tokens": 82723810.0, + "step": 68770 + }, + { + "entropy": 1.9772588819265366, + "epoch": 0.21321198209180545, + "grad_norm": 3.088735342025757, + "learning_rate": 5.478857852286567e-06, + "loss": 0.5824, + "mean_token_accuracy": 0.8151338204741478, + "num_tokens": 82735793.0, + "step": 68780 + }, + { + "entropy": 1.8972803056240082, + "epoch": 0.21324298121685514, + "grad_norm": 7.994344711303711, + "learning_rate": 5.478459601331835e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8478777214884758, + "num_tokens": 82747857.0, + "step": 68790 + }, + { + "entropy": 1.9425535961985587, + "epoch": 0.21327398034190484, + "grad_norm": 9.188566207885742, + "learning_rate": 5.478061437209491e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8370546743273735, + "num_tokens": 82759441.0, + "step": 68800 + }, + { + "entropy": 1.9374151572585105, + "epoch": 0.21330497946695454, + "grad_norm": 9.73183822631836, + "learning_rate": 5.477663359887986e-06, + "loss": 0.5835, + "mean_token_accuracy": 0.8232653871178627, + "num_tokens": 82771750.0, + "step": 68810 + }, + { + "entropy": 1.8426042452454567, + "epoch": 0.21333597859200423, + "grad_norm": 8.94770622253418, + "learning_rate": 5.4772653693357835e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8506119534373283, + "num_tokens": 82785172.0, + "step": 68820 + }, + { + "entropy": 1.9215306863188744, + "epoch": 0.21336697771705393, + "grad_norm": 8.768072128295898, + "learning_rate": 5.476867465521369e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8429464474320412, + "num_tokens": 82796191.0, + "step": 68830 + }, + { + "entropy": 1.864244209229946, + "epoch": 0.21339797684210363, + "grad_norm": 8.412409782409668, + "learning_rate": 5.4764696484132394e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8489756211638451, + "num_tokens": 82808552.0, + "step": 68840 + }, + { + "entropy": 1.942490178346634, + "epoch": 0.21342897596715332, + "grad_norm": 7.980142116546631, + "learning_rate": 5.47607191797991e-06, + "loss": 0.5294, + "mean_token_accuracy": 0.8287267431616783, + "num_tokens": 82820097.0, + "step": 68850 + }, + { + "entropy": 1.9282173097133637, + "epoch": 0.21345997509220302, + "grad_norm": 8.005396842956543, + "learning_rate": 5.475674274189913e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.8375453725457191, + "num_tokens": 82831508.0, + "step": 68860 + }, + { + "entropy": 1.9350904941558837, + "epoch": 0.21349097421725272, + "grad_norm": 9.574058532714844, + "learning_rate": 5.4752767170117924e-06, + "loss": 0.5194, + "mean_token_accuracy": 0.8478696927428245, + "num_tokens": 82842649.0, + "step": 68870 + }, + { + "entropy": 1.9676428467035294, + "epoch": 0.2135219733423024, + "grad_norm": 8.351263046264648, + "learning_rate": 5.474879246414112e-06, + "loss": 0.526, + "mean_token_accuracy": 0.8345917791128159, + "num_tokens": 82854006.0, + "step": 68880 + }, + { + "entropy": 1.8507450267672538, + "epoch": 0.2135529724673521, + "grad_norm": 9.128543853759766, + "learning_rate": 5.47448186236545e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.843813742697239, + "num_tokens": 82866178.0, + "step": 68890 + }, + { + "entropy": 1.9041711300611497, + "epoch": 0.2135839715924018, + "grad_norm": 8.370091438293457, + "learning_rate": 5.474084564834402e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8432056456804276, + "num_tokens": 82878638.0, + "step": 68900 + }, + { + "entropy": 1.7404463455080985, + "epoch": 0.2136149707174515, + "grad_norm": 9.250161170959473, + "learning_rate": 5.473687353789579e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8636480793356895, + "num_tokens": 82892733.0, + "step": 68910 + }, + { + "entropy": 1.8433457240462303, + "epoch": 0.2136459698425012, + "grad_norm": 8.090189933776855, + "learning_rate": 5.473290229199604e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8539926618337631, + "num_tokens": 82905596.0, + "step": 68920 + }, + { + "entropy": 1.9230886220932006, + "epoch": 0.2136769689675509, + "grad_norm": 9.134618759155273, + "learning_rate": 5.472893191033122e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8437715753912925, + "num_tokens": 82917687.0, + "step": 68930 + }, + { + "entropy": 1.9287979647517204, + "epoch": 0.2137079680926006, + "grad_norm": 9.821601867675781, + "learning_rate": 5.472496239258791e-06, + "loss": 0.5274, + "mean_token_accuracy": 0.839262530207634, + "num_tokens": 82928862.0, + "step": 68940 + }, + { + "entropy": 1.8872993856668472, + "epoch": 0.2137389672176503, + "grad_norm": 8.534150123596191, + "learning_rate": 5.472099373845285e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8436684221029281, + "num_tokens": 82940484.0, + "step": 68950 + }, + { + "entropy": 1.8906041860580445, + "epoch": 0.21376996634269999, + "grad_norm": 10.668354988098145, + "learning_rate": 5.471702594761294e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8390226155519486, + "num_tokens": 82952870.0, + "step": 68960 + }, + { + "entropy": 1.808730572462082, + "epoch": 0.21380096546774968, + "grad_norm": 4.311106204986572, + "learning_rate": 5.471305901975526e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8523236215114594, + "num_tokens": 82965433.0, + "step": 68970 + }, + { + "entropy": 1.9497309267520904, + "epoch": 0.21383196459279938, + "grad_norm": 7.436934471130371, + "learning_rate": 5.470909295456699e-06, + "loss": 0.515, + "mean_token_accuracy": 0.843020536005497, + "num_tokens": 82976459.0, + "step": 68980 + }, + { + "entropy": 1.9538449883460998, + "epoch": 0.21386296371784907, + "grad_norm": 7.01023530960083, + "learning_rate": 5.470512775173554e-06, + "loss": 0.5377, + "mean_token_accuracy": 0.8403704330325127, + "num_tokens": 82987717.0, + "step": 68990 + }, + { + "entropy": 1.879340572655201, + "epoch": 0.21389396284289877, + "grad_norm": 7.733237266540527, + "learning_rate": 5.470116341094843e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8418584942817688, + "num_tokens": 83000323.0, + "step": 69000 + }, + { + "entropy": 1.9619764864444733, + "epoch": 0.21392496196794847, + "grad_norm": 9.5713472366333, + "learning_rate": 5.469719993189336e-06, + "loss": 0.5458, + "mean_token_accuracy": 0.8402963668107987, + "num_tokens": 83011596.0, + "step": 69010 + }, + { + "entropy": 1.9356929183006286, + "epoch": 0.21395596109299814, + "grad_norm": 7.573556900024414, + "learning_rate": 5.469323731425817e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8439344793558121, + "num_tokens": 83022881.0, + "step": 69020 + }, + { + "entropy": 1.8428806871175767, + "epoch": 0.21398696021804783, + "grad_norm": 4.41666841506958, + "learning_rate": 5.46892755577309e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.845615790784359, + "num_tokens": 83035756.0, + "step": 69030 + }, + { + "entropy": 1.8916326105594634, + "epoch": 0.21401795934309753, + "grad_norm": 8.813542366027832, + "learning_rate": 5.46853146619997e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8480862900614738, + "num_tokens": 83046777.0, + "step": 69040 + }, + { + "entropy": 1.9366579607129097, + "epoch": 0.21404895846814723, + "grad_norm": 10.812383651733398, + "learning_rate": 5.468135462675289e-06, + "loss": 0.5598, + "mean_token_accuracy": 0.8317944586277009, + "num_tokens": 83058798.0, + "step": 69050 + }, + { + "entropy": 1.875098566710949, + "epoch": 0.21407995759319692, + "grad_norm": 7.257170677185059, + "learning_rate": 5.467739545167898e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8402672663331032, + "num_tokens": 83071399.0, + "step": 69060 + }, + { + "entropy": 1.9152754202485085, + "epoch": 0.21411095671824662, + "grad_norm": 8.708121299743652, + "learning_rate": 5.467343713646658e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8351624384522438, + "num_tokens": 83083904.0, + "step": 69070 + }, + { + "entropy": 1.938556207716465, + "epoch": 0.21414195584329632, + "grad_norm": 9.21479606628418, + "learning_rate": 5.466947968080452e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8361433446407318, + "num_tokens": 83095844.0, + "step": 69080 + }, + { + "entropy": 1.9366083085536956, + "epoch": 0.214172954968346, + "grad_norm": 8.12380313873291, + "learning_rate": 5.466552308438176e-06, + "loss": 0.5251, + "mean_token_accuracy": 0.8430330127477645, + "num_tokens": 83107044.0, + "step": 69090 + }, + { + "entropy": 1.9676083683967591, + "epoch": 0.2142039540933957, + "grad_norm": 8.887046813964844, + "learning_rate": 5.466156734688738e-06, + "loss": 0.588, + "mean_token_accuracy": 0.8293441072106361, + "num_tokens": 83118055.0, + "step": 69100 + }, + { + "entropy": 1.8269154354929924, + "epoch": 0.2142349532184454, + "grad_norm": 4.919051647186279, + "learning_rate": 5.4657612468010675e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8531086966395378, + "num_tokens": 83130970.0, + "step": 69110 + }, + { + "entropy": 2.0073640048503876, + "epoch": 0.2142659523434951, + "grad_norm": 9.67685317993164, + "learning_rate": 5.46536584474411e-06, + "loss": 0.5912, + "mean_token_accuracy": 0.8317721515893937, + "num_tokens": 83142153.0, + "step": 69120 + }, + { + "entropy": 1.9201458156108857, + "epoch": 0.2142969514685448, + "grad_norm": 11.463045120239258, + "learning_rate": 5.464970528486821e-06, + "loss": 0.5906, + "mean_token_accuracy": 0.8264217585325241, + "num_tokens": 83153550.0, + "step": 69130 + }, + { + "entropy": 2.0055788159370422, + "epoch": 0.2143279505935945, + "grad_norm": 9.573265075683594, + "learning_rate": 5.464575297998177e-06, + "loss": 0.5833, + "mean_token_accuracy": 0.8264138385653496, + "num_tokens": 83164548.0, + "step": 69140 + }, + { + "entropy": 1.8320914067327976, + "epoch": 0.2143589497186442, + "grad_norm": 7.352808952331543, + "learning_rate": 5.464180153247167e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8488354295492172, + "num_tokens": 83177984.0, + "step": 69150 + }, + { + "entropy": 1.8535144045948981, + "epoch": 0.2143899488436939, + "grad_norm": 8.093338012695312, + "learning_rate": 5.463785094202798e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8474163025617599, + "num_tokens": 83190325.0, + "step": 69160 + }, + { + "entropy": 1.9603209257125855, + "epoch": 0.21442094796874359, + "grad_norm": 7.552434921264648, + "learning_rate": 5.46339012083409e-06, + "loss": 0.549, + "mean_token_accuracy": 0.8280288457870484, + "num_tokens": 83201141.0, + "step": 69170 + }, + { + "entropy": 1.9304186806082726, + "epoch": 0.21445194709379328, + "grad_norm": 4.334521770477295, + "learning_rate": 5.462995233110082e-06, + "loss": 0.5573, + "mean_token_accuracy": 0.8300809517502785, + "num_tokens": 83212796.0, + "step": 69180 + }, + { + "entropy": 1.9205600872635842, + "epoch": 0.21448294621884298, + "grad_norm": 7.847184658050537, + "learning_rate": 5.462600430999826e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.8355577915906907, + "num_tokens": 83224491.0, + "step": 69190 + }, + { + "entropy": 1.9560988396406174, + "epoch": 0.21451394534389268, + "grad_norm": 8.206576347351074, + "learning_rate": 5.462205714472391e-06, + "loss": 0.5586, + "mean_token_accuracy": 0.8278788521885871, + "num_tokens": 83235792.0, + "step": 69200 + }, + { + "entropy": 1.8794623762369156, + "epoch": 0.21454494446894237, + "grad_norm": 8.750676155090332, + "learning_rate": 5.4618110834968615e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8406390935182572, + "num_tokens": 83247555.0, + "step": 69210 + }, + { + "entropy": 1.8585446387529374, + "epoch": 0.21457594359399207, + "grad_norm": 3.854595422744751, + "learning_rate": 5.461416538042337e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8608892947435379, + "num_tokens": 83259237.0, + "step": 69220 + }, + { + "entropy": 1.7895995572209358, + "epoch": 0.21460694271904177, + "grad_norm": 4.206018447875977, + "learning_rate": 5.4610220780779335e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8465156257152557, + "num_tokens": 83272545.0, + "step": 69230 + }, + { + "entropy": 1.7522881373763084, + "epoch": 0.21463794184409146, + "grad_norm": 5.796809673309326, + "learning_rate": 5.46062770357278e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8636647120118142, + "num_tokens": 83285897.0, + "step": 69240 + }, + { + "entropy": 1.8799547150731086, + "epoch": 0.21466894096914116, + "grad_norm": 4.128879070281982, + "learning_rate": 5.460233414496026e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8423994764685631, + "num_tokens": 83297294.0, + "step": 69250 + }, + { + "entropy": 1.9117163017392158, + "epoch": 0.21469994009419083, + "grad_norm": 8.076613426208496, + "learning_rate": 5.459839210816833e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8484107598662376, + "num_tokens": 83308283.0, + "step": 69260 + }, + { + "entropy": 1.8744409173727035, + "epoch": 0.21473093921924052, + "grad_norm": 8.670746803283691, + "learning_rate": 5.459445092504379e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.840450718998909, + "num_tokens": 83319959.0, + "step": 69270 + }, + { + "entropy": 1.83160520195961, + "epoch": 0.21476193834429022, + "grad_norm": 9.35455322265625, + "learning_rate": 5.459051059527855e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8386999174952507, + "num_tokens": 83332935.0, + "step": 69280 + }, + { + "entropy": 1.9142531588673593, + "epoch": 0.21479293746933992, + "grad_norm": 9.220640182495117, + "learning_rate": 5.458657111856474e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8410219177603722, + "num_tokens": 83343839.0, + "step": 69290 + }, + { + "entropy": 1.859879168868065, + "epoch": 0.21482393659438961, + "grad_norm": 8.967870712280273, + "learning_rate": 5.458263249459458e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8425408810377121, + "num_tokens": 83356149.0, + "step": 69300 + }, + { + "entropy": 1.9017312452197075, + "epoch": 0.2148549357194393, + "grad_norm": 8.242757797241211, + "learning_rate": 5.4578694723060475e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8415887534618378, + "num_tokens": 83367770.0, + "step": 69310 + }, + { + "entropy": 1.8691633999347688, + "epoch": 0.214885934844489, + "grad_norm": 8.582977294921875, + "learning_rate": 5.4574757803654994e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.850647197663784, + "num_tokens": 83378887.0, + "step": 69320 + }, + { + "entropy": 1.908175478875637, + "epoch": 0.2149169339695387, + "grad_norm": 7.830326080322266, + "learning_rate": 5.457082173607083e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.8378239721059799, + "num_tokens": 83390350.0, + "step": 69330 + }, + { + "entropy": 1.9030898958444595, + "epoch": 0.2149479330945884, + "grad_norm": 6.693634510040283, + "learning_rate": 5.456688652000087e-06, + "loss": 0.5441, + "mean_token_accuracy": 0.8313490644097328, + "num_tokens": 83402282.0, + "step": 69340 + }, + { + "entropy": 1.9227802157402039, + "epoch": 0.2149789322196381, + "grad_norm": 13.45190143585205, + "learning_rate": 5.456295215513813e-06, + "loss": 0.514, + "mean_token_accuracy": 0.8335293993353844, + "num_tokens": 83413989.0, + "step": 69350 + }, + { + "entropy": 1.957323005795479, + "epoch": 0.2150099313446878, + "grad_norm": 12.361275672912598, + "learning_rate": 5.455901864117576e-06, + "loss": 0.5202, + "mean_token_accuracy": 0.837789298593998, + "num_tokens": 83424597.0, + "step": 69360 + }, + { + "entropy": 1.9297591596841812, + "epoch": 0.2150409304697375, + "grad_norm": 12.451163291931152, + "learning_rate": 5.455508597780713e-06, + "loss": 0.5219, + "mean_token_accuracy": 0.831664165854454, + "num_tokens": 83436096.0, + "step": 69370 + }, + { + "entropy": 1.9417940333485604, + "epoch": 0.2150719295947872, + "grad_norm": 8.612032890319824, + "learning_rate": 5.455115416472572e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8476990014314651, + "num_tokens": 83447255.0, + "step": 69380 + }, + { + "entropy": 1.9677758157253264, + "epoch": 0.21510292871983688, + "grad_norm": 7.140786170959473, + "learning_rate": 5.454722320162514e-06, + "loss": 0.5591, + "mean_token_accuracy": 0.8349123015999794, + "num_tokens": 83458140.0, + "step": 69390 + }, + { + "entropy": 1.8533436045050622, + "epoch": 0.21513392784488658, + "grad_norm": 8.395837783813477, + "learning_rate": 5.454329308819922e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.835472346842289, + "num_tokens": 83470694.0, + "step": 69400 + }, + { + "entropy": 1.9218518555164337, + "epoch": 0.21516492696993628, + "grad_norm": Infinity, + "learning_rate": 5.45393638241419e-06, + "loss": 0.518, + "mean_token_accuracy": 0.8393091827630996, + "num_tokens": 83482363.0, + "step": 69410 + }, + { + "entropy": 1.9040866911411285, + "epoch": 0.21519592609498597, + "grad_norm": 7.9769673347473145, + "learning_rate": 5.4535435409147265e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.842051412165165, + "num_tokens": 83494631.0, + "step": 69420 + }, + { + "entropy": 1.9231672808527946, + "epoch": 0.21522692522003567, + "grad_norm": 9.245956420898438, + "learning_rate": 5.45315078429096e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.847654415667057, + "num_tokens": 83506941.0, + "step": 69430 + }, + { + "entropy": 1.9047624886035919, + "epoch": 0.21525792434508537, + "grad_norm": 9.397164344787598, + "learning_rate": 5.452758112512331e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8385005071759224, + "num_tokens": 83518932.0, + "step": 69440 + }, + { + "entropy": 1.8809319645166398, + "epoch": 0.21528892347013506, + "grad_norm": 10.352240562438965, + "learning_rate": 5.452365525548295e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8362722516059875, + "num_tokens": 83531350.0, + "step": 69450 + }, + { + "entropy": 1.8688334867358207, + "epoch": 0.21531992259518476, + "grad_norm": 4.476759433746338, + "learning_rate": 5.451973023368325e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8403964400291443, + "num_tokens": 83543584.0, + "step": 69460 + }, + { + "entropy": 1.9141950756311417, + "epoch": 0.21535092172023446, + "grad_norm": 8.979772567749023, + "learning_rate": 5.451580605941909e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.8465558111667633, + "num_tokens": 83555457.0, + "step": 69470 + }, + { + "entropy": 1.9037456959486008, + "epoch": 0.21538192084528415, + "grad_norm": 12.75918197631836, + "learning_rate": 5.451188273238549e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8453539073467254, + "num_tokens": 83567575.0, + "step": 69480 + }, + { + "entropy": 1.886037102341652, + "epoch": 0.21541291997033385, + "grad_norm": 2.314100503921509, + "learning_rate": 5.450796025227764e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8452310040593147, + "num_tokens": 83580210.0, + "step": 69490 + }, + { + "entropy": 1.9223955109715463, + "epoch": 0.21544391909538355, + "grad_norm": 7.768220901489258, + "learning_rate": 5.450403861879085e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8553515776991845, + "num_tokens": 83592037.0, + "step": 69500 + }, + { + "entropy": 1.8389130905270576, + "epoch": 0.21547491822043321, + "grad_norm": 3.468709945678711, + "learning_rate": 5.450011783162063e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8565122127532959, + "num_tokens": 83604963.0, + "step": 69510 + }, + { + "entropy": 1.9391876861453057, + "epoch": 0.2155059173454829, + "grad_norm": 7.677327632904053, + "learning_rate": 5.449619789046263e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8431411743164062, + "num_tokens": 83616136.0, + "step": 69520 + }, + { + "entropy": 1.923940536379814, + "epoch": 0.2155369164705326, + "grad_norm": 9.04365348815918, + "learning_rate": 5.449227879501263e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8554372638463974, + "num_tokens": 83627773.0, + "step": 69530 + }, + { + "entropy": 1.8013005346059798, + "epoch": 0.2155679155955823, + "grad_norm": 7.584324359893799, + "learning_rate": 5.448836054496658e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8366676792502403, + "num_tokens": 83642089.0, + "step": 69540 + }, + { + "entropy": 1.9511119529604912, + "epoch": 0.215598914720632, + "grad_norm": 8.205597877502441, + "learning_rate": 5.448444314002058e-06, + "loss": 0.561, + "mean_token_accuracy": 0.8263240188360215, + "num_tokens": 83653309.0, + "step": 69550 + }, + { + "entropy": 1.8489666640758515, + "epoch": 0.2156299138456817, + "grad_norm": 9.638209342956543, + "learning_rate": 5.448052657987088e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8453381776809692, + "num_tokens": 83666503.0, + "step": 69560 + }, + { + "entropy": 1.9822421237826346, + "epoch": 0.2156609129707314, + "grad_norm": 7.235997676849365, + "learning_rate": 5.4476610864213905e-06, + "loss": 0.5225, + "mean_token_accuracy": 0.8368374884128571, + "num_tokens": 83677549.0, + "step": 69570 + }, + { + "entropy": 1.9172227129340171, + "epoch": 0.2156919120957811, + "grad_norm": 9.961241722106934, + "learning_rate": 5.447269599274621e-06, + "loss": 0.5732, + "mean_token_accuracy": 0.8320469737052918, + "num_tokens": 83689740.0, + "step": 69580 + }, + { + "entropy": 1.8688659638166427, + "epoch": 0.2157229112208308, + "grad_norm": 4.677713394165039, + "learning_rate": 5.446878196516448e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8490154132246971, + "num_tokens": 83701923.0, + "step": 69590 + }, + { + "entropy": 1.9590770199894905, + "epoch": 0.21575391034588048, + "grad_norm": 4.404328346252441, + "learning_rate": 5.446486878116561e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8431831479072571, + "num_tokens": 83713981.0, + "step": 69600 + }, + { + "entropy": 1.9376450017094613, + "epoch": 0.21578490947093018, + "grad_norm": 9.207627296447754, + "learning_rate": 5.446095644044661e-06, + "loss": 0.5232, + "mean_token_accuracy": 0.8345242127776146, + "num_tokens": 83725681.0, + "step": 69610 + }, + { + "entropy": 1.9425865828990936, + "epoch": 0.21581590859597988, + "grad_norm": 9.27120304107666, + "learning_rate": 5.445704494270465e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8417545929551125, + "num_tokens": 83737057.0, + "step": 69620 + }, + { + "entropy": 1.794182512164116, + "epoch": 0.21584690772102957, + "grad_norm": 9.344676971435547, + "learning_rate": 5.445313428763705e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8520261868834496, + "num_tokens": 83750518.0, + "step": 69630 + }, + { + "entropy": 1.8970528110861777, + "epoch": 0.21587790684607927, + "grad_norm": 10.588910102844238, + "learning_rate": 5.444922447494128e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8415069609880448, + "num_tokens": 83763123.0, + "step": 69640 + }, + { + "entropy": 1.9143267586827277, + "epoch": 0.21590890597112897, + "grad_norm": 8.116543769836426, + "learning_rate": 5.444531550431497e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8432750076055526, + "num_tokens": 83774896.0, + "step": 69650 + }, + { + "entropy": 1.8820920512080193, + "epoch": 0.21593990509617866, + "grad_norm": 9.194504737854004, + "learning_rate": 5.444140737545589e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8378720536828042, + "num_tokens": 83786830.0, + "step": 69660 + }, + { + "entropy": 1.8950974389910697, + "epoch": 0.21597090422122836, + "grad_norm": 9.50033187866211, + "learning_rate": 5.443750008806198e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.840063288807869, + "num_tokens": 83798450.0, + "step": 69670 + }, + { + "entropy": 1.9445108637213706, + "epoch": 0.21600190334627806, + "grad_norm": 7.2820892333984375, + "learning_rate": 5.443359364183132e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8457172334194183, + "num_tokens": 83809795.0, + "step": 69680 + }, + { + "entropy": 1.8929253578186036, + "epoch": 0.21603290247132775, + "grad_norm": 8.967961311340332, + "learning_rate": 5.442968803646214e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8517738983035088, + "num_tokens": 83821954.0, + "step": 69690 + }, + { + "entropy": 1.8933438271284104, + "epoch": 0.21606390159637745, + "grad_norm": 8.333928108215332, + "learning_rate": 5.4425783271652824e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8390064716339112, + "num_tokens": 83834653.0, + "step": 69700 + }, + { + "entropy": 1.8290221512317657, + "epoch": 0.21609490072142715, + "grad_norm": 4.229883670806885, + "learning_rate": 5.44218793471019e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8560205176472664, + "num_tokens": 83847132.0, + "step": 69710 + }, + { + "entropy": 1.9552450522780418, + "epoch": 0.21612589984647684, + "grad_norm": 8.421517372131348, + "learning_rate": 5.441797626250808e-06, + "loss": 0.5362, + "mean_token_accuracy": 0.8295035645365715, + "num_tokens": 83858171.0, + "step": 69720 + }, + { + "entropy": 1.912724708020687, + "epoch": 0.21615689897152654, + "grad_norm": 7.742305278778076, + "learning_rate": 5.441407401757017e-06, + "loss": 0.5841, + "mean_token_accuracy": 0.8269262716174126, + "num_tokens": 83870397.0, + "step": 69730 + }, + { + "entropy": 1.9444334492087365, + "epoch": 0.21618789809657624, + "grad_norm": 4.496399402618408, + "learning_rate": 5.44101726119872e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8388503283262253, + "num_tokens": 83882469.0, + "step": 69740 + }, + { + "entropy": 1.8968336150050162, + "epoch": 0.21621889722162593, + "grad_norm": 7.71511697769165, + "learning_rate": 5.440627204545827e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8507803946733474, + "num_tokens": 83894493.0, + "step": 69750 + }, + { + "entropy": 1.9784729063510895, + "epoch": 0.2162498963466756, + "grad_norm": 10.393377304077148, + "learning_rate": 5.440237231768271e-06, + "loss": 0.5628, + "mean_token_accuracy": 0.8280197530984879, + "num_tokens": 83905959.0, + "step": 69760 + }, + { + "entropy": 1.9202745750546455, + "epoch": 0.2162808954717253, + "grad_norm": 7.838612079620361, + "learning_rate": 5.439847342835992e-06, + "loss": 0.517, + "mean_token_accuracy": 0.8408389419317246, + "num_tokens": 83917828.0, + "step": 69770 + }, + { + "entropy": 1.9422619119286537, + "epoch": 0.216311894596775, + "grad_norm": 3.9136011600494385, + "learning_rate": 5.439457537718953e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8431269928812981, + "num_tokens": 83929517.0, + "step": 69780 + }, + { + "entropy": 2.040539500117302, + "epoch": 0.2163428937218247, + "grad_norm": 10.292856216430664, + "learning_rate": 5.4390678163871265e-06, + "loss": 0.585, + "mean_token_accuracy": 0.8160558506846428, + "num_tokens": 83940294.0, + "step": 69790 + }, + { + "entropy": 2.000251492857933, + "epoch": 0.2163738928468744, + "grad_norm": 7.672717571258545, + "learning_rate": 5.438678178810503e-06, + "loss": 0.5194, + "mean_token_accuracy": 0.8404903680086135, + "num_tokens": 83951400.0, + "step": 69800 + }, + { + "entropy": 1.9795056134462357, + "epoch": 0.21640489197192408, + "grad_norm": 9.820237159729004, + "learning_rate": 5.438288624959086e-06, + "loss": 0.5402, + "mean_token_accuracy": 0.8368617355823517, + "num_tokens": 83963042.0, + "step": 69810 + }, + { + "entropy": 1.9222991615533829, + "epoch": 0.21643589109697378, + "grad_norm": 9.5084810256958, + "learning_rate": 5.437899154802895e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8351835623383522, + "num_tokens": 83974409.0, + "step": 69820 + }, + { + "entropy": 1.945703822374344, + "epoch": 0.21646689022202348, + "grad_norm": 11.023735046386719, + "learning_rate": 5.437509768311967e-06, + "loss": 0.5189, + "mean_token_accuracy": 0.8369563966989517, + "num_tokens": 83985791.0, + "step": 69830 + }, + { + "entropy": 1.8587691098451615, + "epoch": 0.21649788934707317, + "grad_norm": 3.4747183322906494, + "learning_rate": 5.437120465456348e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8473239108920098, + "num_tokens": 83998427.0, + "step": 69840 + }, + { + "entropy": 1.891299197077751, + "epoch": 0.21652888847212287, + "grad_norm": 7.577733993530273, + "learning_rate": 5.436731246206105e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.846482339501381, + "num_tokens": 84010579.0, + "step": 69850 + }, + { + "entropy": 1.9225863426923753, + "epoch": 0.21655988759717257, + "grad_norm": 6.7725324630737305, + "learning_rate": 5.436342110531316e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.8383674845099449, + "num_tokens": 84022493.0, + "step": 69860 + }, + { + "entropy": 1.955282860994339, + "epoch": 0.21659088672222226, + "grad_norm": 9.114567756652832, + "learning_rate": 5.435953058402078e-06, + "loss": 0.5296, + "mean_token_accuracy": 0.8373220950365067, + "num_tokens": 84033749.0, + "step": 69870 + }, + { + "entropy": 2.0027921319007875, + "epoch": 0.21662188584727196, + "grad_norm": 8.859573364257812, + "learning_rate": 5.435564089788498e-06, + "loss": 0.5998, + "mean_token_accuracy": 0.8251512601971627, + "num_tokens": 84044757.0, + "step": 69880 + }, + { + "entropy": 1.8767832040786743, + "epoch": 0.21665288497232166, + "grad_norm": 8.931615829467773, + "learning_rate": 5.435175204660702e-06, + "loss": 0.516, + "mean_token_accuracy": 0.8378856316208839, + "num_tokens": 84056798.0, + "step": 69890 + }, + { + "entropy": 1.8918719656765461, + "epoch": 0.21668388409737135, + "grad_norm": 9.630752563476562, + "learning_rate": 5.4347864029888284e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8510532274842262, + "num_tokens": 84068455.0, + "step": 69900 + }, + { + "entropy": 1.9158081158995628, + "epoch": 0.21671488322242105, + "grad_norm": 4.970503807067871, + "learning_rate": 5.434397684743034e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8565353244543076, + "num_tokens": 84080433.0, + "step": 69910 + }, + { + "entropy": 1.8488872632384301, + "epoch": 0.21674588234747075, + "grad_norm": 8.853670120239258, + "learning_rate": 5.434009049893485e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8482698664069176, + "num_tokens": 84093473.0, + "step": 69920 + }, + { + "entropy": 1.941627450287342, + "epoch": 0.21677688147252044, + "grad_norm": 8.593602180480957, + "learning_rate": 5.433620498410368e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.8356386333703995, + "num_tokens": 84105334.0, + "step": 69930 + }, + { + "entropy": 1.9388030782341956, + "epoch": 0.21680788059757014, + "grad_norm": 8.22298526763916, + "learning_rate": 5.433232030263881e-06, + "loss": 0.5375, + "mean_token_accuracy": 0.8294572129845619, + "num_tokens": 84117279.0, + "step": 69940 + }, + { + "entropy": 1.844594794511795, + "epoch": 0.21683887972261984, + "grad_norm": 9.116217613220215, + "learning_rate": 5.432843645424239e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8563014373183251, + "num_tokens": 84129513.0, + "step": 69950 + }, + { + "entropy": 1.860558421909809, + "epoch": 0.21686987884766953, + "grad_norm": 8.21207332611084, + "learning_rate": 5.43245534386167e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8442764699459075, + "num_tokens": 84142071.0, + "step": 69960 + }, + { + "entropy": 1.9477815553545952, + "epoch": 0.21690087797271923, + "grad_norm": 9.044229507446289, + "learning_rate": 5.432067125546419e-06, + "loss": 0.507, + "mean_token_accuracy": 0.8379213094711304, + "num_tokens": 84153180.0, + "step": 69970 + }, + { + "entropy": 1.9056160926818848, + "epoch": 0.21693187709776893, + "grad_norm": 8.381840705871582, + "learning_rate": 5.431678990448746e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8412555903196335, + "num_tokens": 84165277.0, + "step": 69980 + }, + { + "entropy": 1.9368507653474807, + "epoch": 0.21696287622281862, + "grad_norm": 9.226876258850098, + "learning_rate": 5.431290938538921e-06, + "loss": 0.5589, + "mean_token_accuracy": 0.8354489773511886, + "num_tokens": 84176763.0, + "step": 69990 + }, + { + "entropy": 1.9172518044710158, + "epoch": 0.21699387534786832, + "grad_norm": 4.2976603507995605, + "learning_rate": 5.430902969787236e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8291812673211097, + "num_tokens": 84189241.0, + "step": 70000 + }, + { + "entropy": 1.8809144780039788, + "epoch": 0.217024874472918, + "grad_norm": 7.371922016143799, + "learning_rate": 5.430515084163993e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8422490224242211, + "num_tokens": 84201347.0, + "step": 70010 + }, + { + "entropy": 1.951662066578865, + "epoch": 0.21705587359796769, + "grad_norm": 10.154802322387695, + "learning_rate": 5.43012728163951e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8398729935288429, + "num_tokens": 84213095.0, + "step": 70020 + }, + { + "entropy": 1.8219065442681313, + "epoch": 0.21708687272301738, + "grad_norm": 8.22426700592041, + "learning_rate": 5.429739562184121e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8590575277805328, + "num_tokens": 84225978.0, + "step": 70030 + }, + { + "entropy": 1.8591913282871246, + "epoch": 0.21711787184806708, + "grad_norm": 9.920334815979004, + "learning_rate": 5.429351925768173e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8464103743433953, + "num_tokens": 84239200.0, + "step": 70040 + }, + { + "entropy": 1.88853762447834, + "epoch": 0.21714887097311678, + "grad_norm": 4.088785171508789, + "learning_rate": 5.428964372362031e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8548205360770226, + "num_tokens": 84252272.0, + "step": 70050 + }, + { + "entropy": 1.8458364441990853, + "epoch": 0.21717987009816647, + "grad_norm": 9.360186576843262, + "learning_rate": 5.428576901936069e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.852430385351181, + "num_tokens": 84265230.0, + "step": 70060 + }, + { + "entropy": 1.9471904829144477, + "epoch": 0.21721086922321617, + "grad_norm": 3.6564393043518066, + "learning_rate": 5.428189514460681e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8501205831766129, + "num_tokens": 84276446.0, + "step": 70070 + }, + { + "entropy": 1.8922279462218285, + "epoch": 0.21724186834826587, + "grad_norm": 3.798719644546509, + "learning_rate": 5.427802209906274e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.838223971426487, + "num_tokens": 84288449.0, + "step": 70080 + }, + { + "entropy": 1.9325299307703971, + "epoch": 0.21727286747331556, + "grad_norm": 9.839792251586914, + "learning_rate": 5.427414988243273e-06, + "loss": 0.5621, + "mean_token_accuracy": 0.83007542937994, + "num_tokens": 84299853.0, + "step": 70090 + }, + { + "entropy": 1.8988226309418679, + "epoch": 0.21730386659836526, + "grad_norm": 7.719738960266113, + "learning_rate": 5.427027849442109e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8383999258279801, + "num_tokens": 84312190.0, + "step": 70100 + }, + { + "entropy": 1.8617843106389045, + "epoch": 0.21733486572341496, + "grad_norm": 8.315159797668457, + "learning_rate": 5.426640793473237e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8574797883629799, + "num_tokens": 84324677.0, + "step": 70110 + }, + { + "entropy": 1.9673965632915498, + "epoch": 0.21736586484846465, + "grad_norm": 8.235828399658203, + "learning_rate": 5.426253820307122e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8495687767863274, + "num_tokens": 84335718.0, + "step": 70120 + }, + { + "entropy": 1.9207556203007699, + "epoch": 0.21739686397351435, + "grad_norm": 8.374682426452637, + "learning_rate": 5.4258669299142465e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8351808488368988, + "num_tokens": 84347566.0, + "step": 70130 + }, + { + "entropy": 1.9010922938585282, + "epoch": 0.21742786309856404, + "grad_norm": 9.063957214355469, + "learning_rate": 5.425480122265106e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8409019201993942, + "num_tokens": 84360542.0, + "step": 70140 + }, + { + "entropy": 1.9091226264834404, + "epoch": 0.21745886222361374, + "grad_norm": 9.379876136779785, + "learning_rate": 5.425093397330208e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8458316907286644, + "num_tokens": 84372835.0, + "step": 70150 + }, + { + "entropy": 1.7553482845425605, + "epoch": 0.21748986134866344, + "grad_norm": 3.750493288040161, + "learning_rate": 5.424706755080079e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8478049889206887, + "num_tokens": 84387102.0, + "step": 70160 + }, + { + "entropy": 1.9269262328743935, + "epoch": 0.21752086047371313, + "grad_norm": 11.567874908447266, + "learning_rate": 5.4243201954852605e-06, + "loss": 0.5347, + "mean_token_accuracy": 0.837294514477253, + "num_tokens": 84398445.0, + "step": 70170 + }, + { + "entropy": 1.757249329984188, + "epoch": 0.21755185959876283, + "grad_norm": 2.541597604751587, + "learning_rate": 5.423933718516307e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8519966840744019, + "num_tokens": 84412057.0, + "step": 70180 + }, + { + "entropy": 1.9713240012526512, + "epoch": 0.21758285872381253, + "grad_norm": 9.577204704284668, + "learning_rate": 5.423547324143784e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8448547378182412, + "num_tokens": 84423064.0, + "step": 70190 + }, + { + "entropy": 1.9091683775186539, + "epoch": 0.21761385784886222, + "grad_norm": 7.392279624938965, + "learning_rate": 5.423161012338279e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8476917743682861, + "num_tokens": 84435063.0, + "step": 70200 + }, + { + "entropy": 1.926077064871788, + "epoch": 0.21764485697391192, + "grad_norm": 8.991741180419922, + "learning_rate": 5.422774783070391e-06, + "loss": 0.542, + "mean_token_accuracy": 0.8332406044006347, + "num_tokens": 84446653.0, + "step": 70210 + }, + { + "entropy": 1.9285721063613892, + "epoch": 0.21767585609896162, + "grad_norm": 4.427052974700928, + "learning_rate": 5.42238863631073e-06, + "loss": 0.5299, + "mean_token_accuracy": 0.8307848066091538, + "num_tokens": 84458387.0, + "step": 70220 + }, + { + "entropy": 1.8635422542691231, + "epoch": 0.21770685522401131, + "grad_norm": 8.436079025268555, + "learning_rate": 5.422002572029925e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8484039083123207, + "num_tokens": 84470516.0, + "step": 70230 + }, + { + "entropy": 1.9386792957782746, + "epoch": 0.217737854349061, + "grad_norm": 4.04444694519043, + "learning_rate": 5.421616590198619e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8558227822184563, + "num_tokens": 84481818.0, + "step": 70240 + }, + { + "entropy": 1.9428984984755515, + "epoch": 0.21776885347411068, + "grad_norm": 7.593319892883301, + "learning_rate": 5.4212306907874705e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8404227659106255, + "num_tokens": 84493867.0, + "step": 70250 + }, + { + "entropy": 1.854594275355339, + "epoch": 0.21779985259916038, + "grad_norm": 10.006421089172363, + "learning_rate": 5.420844873767147e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8399219319224358, + "num_tokens": 84506467.0, + "step": 70260 + }, + { + "entropy": 1.9580897375941277, + "epoch": 0.21783085172421007, + "grad_norm": 9.24285888671875, + "learning_rate": 5.420459139108339e-06, + "loss": 0.5512, + "mean_token_accuracy": 0.8321612671017646, + "num_tokens": 84517655.0, + "step": 70270 + }, + { + "entropy": 1.9946820080280303, + "epoch": 0.21786185084925977, + "grad_norm": 9.095592498779297, + "learning_rate": 5.420073486781746e-06, + "loss": 0.5713, + "mean_token_accuracy": 0.8361142173409462, + "num_tokens": 84527947.0, + "step": 70280 + }, + { + "entropy": 1.819739530980587, + "epoch": 0.21789284997430947, + "grad_norm": 9.337332725524902, + "learning_rate": 5.419687916758083e-06, + "loss": 0.414, + "mean_token_accuracy": 0.857285912334919, + "num_tokens": 84540966.0, + "step": 70290 + }, + { + "entropy": 1.959464368224144, + "epoch": 0.21792384909935916, + "grad_norm": 9.26564884185791, + "learning_rate": 5.419302429008081e-06, + "loss": 0.5639, + "mean_token_accuracy": 0.8280451089143753, + "num_tokens": 84552392.0, + "step": 70300 + }, + { + "entropy": 1.824575850367546, + "epoch": 0.21795484822440886, + "grad_norm": 8.046067237854004, + "learning_rate": 5.418917023502482e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8442411884665489, + "num_tokens": 84565104.0, + "step": 70310 + }, + { + "entropy": 1.9438457012176513, + "epoch": 0.21798584734945856, + "grad_norm": 8.829744338989258, + "learning_rate": 5.41853170021205e-06, + "loss": 0.5719, + "mean_token_accuracy": 0.8261488035321236, + "num_tokens": 84576914.0, + "step": 70320 + }, + { + "entropy": 1.9732747316360473, + "epoch": 0.21801684647450825, + "grad_norm": 8.278334617614746, + "learning_rate": 5.418146459107554e-06, + "loss": 0.5788, + "mean_token_accuracy": 0.8282134965062141, + "num_tokens": 84588102.0, + "step": 70330 + }, + { + "entropy": 1.9039635524153709, + "epoch": 0.21804784559955795, + "grad_norm": 7.718066692352295, + "learning_rate": 5.417761300159784e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8389883413910866, + "num_tokens": 84599955.0, + "step": 70340 + }, + { + "entropy": 1.910678581893444, + "epoch": 0.21807884472460765, + "grad_norm": 3.6842432022094727, + "learning_rate": 5.417376223339544e-06, + "loss": 0.5259, + "mean_token_accuracy": 0.8448718905448913, + "num_tokens": 84611103.0, + "step": 70350 + }, + { + "entropy": 1.8811346575617791, + "epoch": 0.21810984384965734, + "grad_norm": 10.019588470458984, + "learning_rate": 5.41699122861765e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8373315662145615, + "num_tokens": 84622901.0, + "step": 70360 + }, + { + "entropy": 1.751522246003151, + "epoch": 0.21814084297470704, + "grad_norm": 9.472254753112793, + "learning_rate": 5.416606315964937e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8536264047026634, + "num_tokens": 84636784.0, + "step": 70370 + }, + { + "entropy": 1.9128979295492172, + "epoch": 0.21817184209975674, + "grad_norm": 7.596695899963379, + "learning_rate": 5.416221485352247e-06, + "loss": 0.5573, + "mean_token_accuracy": 0.8282625824213028, + "num_tokens": 84649019.0, + "step": 70380 + }, + { + "entropy": 1.90406776368618, + "epoch": 0.21820284122480643, + "grad_norm": 9.435015678405762, + "learning_rate": 5.415836736750441e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8421972319483757, + "num_tokens": 84660281.0, + "step": 70390 + }, + { + "entropy": 1.77169189453125, + "epoch": 0.21823384034985613, + "grad_norm": 3.901792287826538, + "learning_rate": 5.415452070130397e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8625829800963402, + "num_tokens": 84673251.0, + "step": 70400 + }, + { + "entropy": 1.9100979268550873, + "epoch": 0.21826483947490583, + "grad_norm": 8.25220012664795, + "learning_rate": 5.415067485463005e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8386782616376877, + "num_tokens": 84685476.0, + "step": 70410 + }, + { + "entropy": 1.8133963495492935, + "epoch": 0.21829583859995552, + "grad_norm": 3.5599453449249268, + "learning_rate": 5.414682982719167e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8472143262624741, + "num_tokens": 84698468.0, + "step": 70420 + }, + { + "entropy": 1.9142759516835213, + "epoch": 0.21832683772500522, + "grad_norm": 7.450267314910889, + "learning_rate": 5.414298561869803e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8527796879410744, + "num_tokens": 84709995.0, + "step": 70430 + }, + { + "entropy": 1.9114891082048415, + "epoch": 0.21835783685005491, + "grad_norm": 8.194131851196289, + "learning_rate": 5.413914222885847e-06, + "loss": 0.5451, + "mean_token_accuracy": 0.838103885948658, + "num_tokens": 84720745.0, + "step": 70440 + }, + { + "entropy": 1.8565336361527442, + "epoch": 0.2183888359751046, + "grad_norm": 7.725749969482422, + "learning_rate": 5.413529965738245e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8396166414022446, + "num_tokens": 84732619.0, + "step": 70450 + }, + { + "entropy": 1.933957888185978, + "epoch": 0.2184198351001543, + "grad_norm": 9.328766822814941, + "learning_rate": 5.41314579039796e-06, + "loss": 0.5431, + "mean_token_accuracy": 0.8291135087609292, + "num_tokens": 84744044.0, + "step": 70460 + }, + { + "entropy": 1.881012487411499, + "epoch": 0.218450834225204, + "grad_norm": 7.425388813018799, + "learning_rate": 5.412761696835969e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.8454090863466263, + "num_tokens": 84756091.0, + "step": 70470 + }, + { + "entropy": 1.8883946731686592, + "epoch": 0.2184818333502537, + "grad_norm": 7.838623523712158, + "learning_rate": 5.412377685023262e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8344195529818534, + "num_tokens": 84768181.0, + "step": 70480 + }, + { + "entropy": 1.8858201622962951, + "epoch": 0.2185128324753034, + "grad_norm": 8.257635116577148, + "learning_rate": 5.411993754930844e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8470271736383438, + "num_tokens": 84779613.0, + "step": 70490 + }, + { + "entropy": 1.87155821621418, + "epoch": 0.21854383160035307, + "grad_norm": 11.120357513427734, + "learning_rate": 5.411609906529737e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.8390228837728501, + "num_tokens": 84792236.0, + "step": 70500 + }, + { + "entropy": 1.836224192380905, + "epoch": 0.21857483072540276, + "grad_norm": 9.826521873474121, + "learning_rate": 5.411226139790973e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8449719101190567, + "num_tokens": 84805022.0, + "step": 70510 + }, + { + "entropy": 1.9778549373149872, + "epoch": 0.21860582985045246, + "grad_norm": 9.500571250915527, + "learning_rate": 5.410842454685601e-06, + "loss": 0.6753, + "mean_token_accuracy": 0.8305225998163224, + "num_tokens": 84816577.0, + "step": 70520 + }, + { + "entropy": 1.8446165218949317, + "epoch": 0.21863682897550216, + "grad_norm": 9.461227416992188, + "learning_rate": 5.4104588511846846e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8477078288793564, + "num_tokens": 84829132.0, + "step": 70530 + }, + { + "entropy": 1.859433715045452, + "epoch": 0.21866782810055185, + "grad_norm": 7.800492286682129, + "learning_rate": 5.410075329259299e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8472880437970162, + "num_tokens": 84841353.0, + "step": 70540 + }, + { + "entropy": 1.9165452167391777, + "epoch": 0.21869882722560155, + "grad_norm": 8.318398475646973, + "learning_rate": 5.4096918888805385e-06, + "loss": 0.5168, + "mean_token_accuracy": 0.8420958235859871, + "num_tokens": 84852397.0, + "step": 70550 + }, + { + "entropy": 1.8914957597851754, + "epoch": 0.21872982635065125, + "grad_norm": 9.693846702575684, + "learning_rate": 5.409308530019507e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8405283436179161, + "num_tokens": 84863491.0, + "step": 70560 + }, + { + "entropy": 1.945120519399643, + "epoch": 0.21876082547570094, + "grad_norm": 9.057845115661621, + "learning_rate": 5.408925252647326e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8449465945363045, + "num_tokens": 84874898.0, + "step": 70570 + }, + { + "entropy": 1.9313878312706947, + "epoch": 0.21879182460075064, + "grad_norm": 8.234183311462402, + "learning_rate": 5.408542056735129e-06, + "loss": 0.5658, + "mean_token_accuracy": 0.8252523899078369, + "num_tokens": 84886394.0, + "step": 70580 + }, + { + "entropy": 1.9243649192154408, + "epoch": 0.21882282372580034, + "grad_norm": 8.189180374145508, + "learning_rate": 5.408158942254065e-06, + "loss": 0.5435, + "mean_token_accuracy": 0.8330753937363624, + "num_tokens": 84898510.0, + "step": 70590 + }, + { + "entropy": 1.8462709829211235, + "epoch": 0.21885382285085003, + "grad_norm": 7.798243522644043, + "learning_rate": 5.407775909175298e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8495460674166679, + "num_tokens": 84910723.0, + "step": 70600 + }, + { + "entropy": 1.8084265619516373, + "epoch": 0.21888482197589973, + "grad_norm": 10.60255241394043, + "learning_rate": 5.407392957470005e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8456479325890541, + "num_tokens": 84923960.0, + "step": 70610 + }, + { + "entropy": 1.880794422328472, + "epoch": 0.21891582110094943, + "grad_norm": 4.523313045501709, + "learning_rate": 5.4070100871093764e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8479678109288216, + "num_tokens": 84935570.0, + "step": 70620 + }, + { + "entropy": 1.8770657986402512, + "epoch": 0.21894682022599912, + "grad_norm": 2.5397913455963135, + "learning_rate": 5.406627298064622e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8388467326760292, + "num_tokens": 84948444.0, + "step": 70630 + }, + { + "entropy": 1.946268130838871, + "epoch": 0.21897781935104882, + "grad_norm": 9.20574951171875, + "learning_rate": 5.406244590306958e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.8426952421665191, + "num_tokens": 84959437.0, + "step": 70640 + }, + { + "entropy": 1.9465517818927764, + "epoch": 0.21900881847609852, + "grad_norm": 9.563549995422363, + "learning_rate": 5.405861963807622e-06, + "loss": 0.5409, + "mean_token_accuracy": 0.834696726500988, + "num_tokens": 84970287.0, + "step": 70650 + }, + { + "entropy": 1.9270946726202964, + "epoch": 0.2190398176011482, + "grad_norm": 7.783606052398682, + "learning_rate": 5.4054794185378615e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.8351917177438736, + "num_tokens": 84982065.0, + "step": 70660 + }, + { + "entropy": 1.8967857383191586, + "epoch": 0.2190708167261979, + "grad_norm": 8.703535079956055, + "learning_rate": 5.405096954468938e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8460932150483131, + "num_tokens": 84993902.0, + "step": 70670 + }, + { + "entropy": 1.9372851148247718, + "epoch": 0.2191018158512476, + "grad_norm": 4.806000232696533, + "learning_rate": 5.4047145715721315e-06, + "loss": 0.566, + "mean_token_accuracy": 0.8275101691484451, + "num_tokens": 85005220.0, + "step": 70680 + }, + { + "entropy": 1.88345315605402, + "epoch": 0.2191328149762973, + "grad_norm": 7.77454948425293, + "learning_rate": 5.404332269818732e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8458261653780937, + "num_tokens": 85017018.0, + "step": 70690 + }, + { + "entropy": 1.7957565858960152, + "epoch": 0.219163814101347, + "grad_norm": 8.677156448364258, + "learning_rate": 5.403950049180046e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8442620366811753, + "num_tokens": 85030620.0, + "step": 70700 + }, + { + "entropy": 1.8943561017513275, + "epoch": 0.2191948132263967, + "grad_norm": 7.602232933044434, + "learning_rate": 5.403567909627393e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.8428284287452698, + "num_tokens": 85041835.0, + "step": 70710 + }, + { + "entropy": 1.9183341562747955, + "epoch": 0.2192258123514464, + "grad_norm": 8.18143367767334, + "learning_rate": 5.4031858511321065e-06, + "loss": 0.5451, + "mean_token_accuracy": 0.8351936027407646, + "num_tokens": 85053607.0, + "step": 70720 + }, + { + "entropy": 1.8503445282578468, + "epoch": 0.2192568114764961, + "grad_norm": 8.790541648864746, + "learning_rate": 5.402803873665535e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8500261098146439, + "num_tokens": 85065898.0, + "step": 70730 + }, + { + "entropy": 1.869480137526989, + "epoch": 0.21928781060154579, + "grad_norm": 10.168290138244629, + "learning_rate": 5.402421977199042e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.838524155318737, + "num_tokens": 85078033.0, + "step": 70740 + }, + { + "entropy": 1.9085437297821044, + "epoch": 0.21931880972659545, + "grad_norm": 8.224616050720215, + "learning_rate": 5.402040161704004e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8398436903953552, + "num_tokens": 85090009.0, + "step": 70750 + }, + { + "entropy": 1.8690147161483766, + "epoch": 0.21934980885164515, + "grad_norm": 7.7431416511535645, + "learning_rate": 5.4016584271518116e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8496010720729827, + "num_tokens": 85102267.0, + "step": 70760 + }, + { + "entropy": 1.908894456923008, + "epoch": 0.21938080797669485, + "grad_norm": 9.895110130310059, + "learning_rate": 5.401276773513869e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.839865879714489, + "num_tokens": 85113947.0, + "step": 70770 + }, + { + "entropy": 1.8951399207115174, + "epoch": 0.21941180710174454, + "grad_norm": 7.593353271484375, + "learning_rate": 5.400895200761596e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8396402359008789, + "num_tokens": 85125637.0, + "step": 70780 + }, + { + "entropy": 1.8372249327600003, + "epoch": 0.21944280622679424, + "grad_norm": 10.441277503967285, + "learning_rate": 5.400513708866425e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8503782153129578, + "num_tokens": 85138593.0, + "step": 70790 + }, + { + "entropy": 1.9151126846671105, + "epoch": 0.21947380535184394, + "grad_norm": 11.181161880493164, + "learning_rate": 5.400132297799804e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8408087193965912, + "num_tokens": 85150130.0, + "step": 70800 + }, + { + "entropy": 1.9188938543200493, + "epoch": 0.21950480447689363, + "grad_norm": 7.681055545806885, + "learning_rate": 5.399750967533195e-06, + "loss": 0.5204, + "mean_token_accuracy": 0.8343793347477912, + "num_tokens": 85161618.0, + "step": 70810 + }, + { + "entropy": 1.8500541925430298, + "epoch": 0.21953580360194333, + "grad_norm": 9.042118072509766, + "learning_rate": 5.399369718038073e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.8330987498164177, + "num_tokens": 85174263.0, + "step": 70820 + }, + { + "entropy": 1.8322535023093223, + "epoch": 0.21956680272699303, + "grad_norm": 4.178494930267334, + "learning_rate": 5.398988549285927e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8345492288470269, + "num_tokens": 85187028.0, + "step": 70830 + }, + { + "entropy": 1.8351117119193077, + "epoch": 0.21959780185204272, + "grad_norm": 4.005007743835449, + "learning_rate": 5.398607461248263e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8548778548836709, + "num_tokens": 85199485.0, + "step": 70840 + }, + { + "entropy": 1.8513669028878212, + "epoch": 0.21962880097709242, + "grad_norm": 8.433499336242676, + "learning_rate": 5.398226453896596e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8487704336643219, + "num_tokens": 85211879.0, + "step": 70850 + }, + { + "entropy": 1.8096089884638786, + "epoch": 0.21965980010214212, + "grad_norm": 10.63721752166748, + "learning_rate": 5.39784552720246e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8548229023814201, + "num_tokens": 85224815.0, + "step": 70860 + }, + { + "entropy": 1.8579236298799515, + "epoch": 0.2196907992271918, + "grad_norm": 8.522771835327148, + "learning_rate": 5.3974646811373986e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8453032687306404, + "num_tokens": 85236804.0, + "step": 70870 + }, + { + "entropy": 1.8571562334895133, + "epoch": 0.2197217983522415, + "grad_norm": 8.076016426086426, + "learning_rate": 5.397083915672975e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8438248604536056, + "num_tokens": 85248947.0, + "step": 70880 + }, + { + "entropy": 1.851157009601593, + "epoch": 0.2197527974772912, + "grad_norm": 9.514842987060547, + "learning_rate": 5.396703230780761e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8403349593281746, + "num_tokens": 85261568.0, + "step": 70890 + }, + { + "entropy": 1.8390909820795058, + "epoch": 0.2197837966023409, + "grad_norm": 9.045916557312012, + "learning_rate": 5.396322626432345e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8455731064081192, + "num_tokens": 85274322.0, + "step": 70900 + }, + { + "entropy": 1.9245983332395553, + "epoch": 0.2198147957273906, + "grad_norm": 7.920559406280518, + "learning_rate": 5.39594210259933e-06, + "loss": 0.5211, + "mean_token_accuracy": 0.8428880482912063, + "num_tokens": 85285617.0, + "step": 70910 + }, + { + "entropy": 1.8781443014740944, + "epoch": 0.2198457948524403, + "grad_norm": 4.677145481109619, + "learning_rate": 5.395561659253331e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8449257969856262, + "num_tokens": 85297208.0, + "step": 70920 + }, + { + "entropy": 1.9284519299864769, + "epoch": 0.21987679397749, + "grad_norm": 3.986211061477661, + "learning_rate": 5.395181296365979e-06, + "loss": 0.5201, + "mean_token_accuracy": 0.8414773017168045, + "num_tokens": 85308876.0, + "step": 70930 + }, + { + "entropy": 1.7243415489792824, + "epoch": 0.2199077931025397, + "grad_norm": 4.131589412689209, + "learning_rate": 5.394801013908917e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.865143957734108, + "num_tokens": 85323468.0, + "step": 70940 + }, + { + "entropy": 1.9777496784925461, + "epoch": 0.21993879222758939, + "grad_norm": 7.485686779022217, + "learning_rate": 5.394420811853805e-06, + "loss": 0.5543, + "mean_token_accuracy": 0.8345079109072685, + "num_tokens": 85334331.0, + "step": 70950 + }, + { + "entropy": 1.9227191910147667, + "epoch": 0.21996979135263908, + "grad_norm": 4.045227527618408, + "learning_rate": 5.394040690172313e-06, + "loss": 0.5687, + "mean_token_accuracy": 0.8265855267643929, + "num_tokens": 85346105.0, + "step": 70960 + }, + { + "entropy": 1.9031540900468826, + "epoch": 0.22000079047768878, + "grad_norm": 4.864028453826904, + "learning_rate": 5.393660648836128e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8400901973247528, + "num_tokens": 85358132.0, + "step": 70970 + }, + { + "entropy": 1.98341506421566, + "epoch": 0.22003178960273848, + "grad_norm": 8.267914772033691, + "learning_rate": 5.393280687816951e-06, + "loss": 0.5651, + "mean_token_accuracy": 0.8373934581875802, + "num_tokens": 85369306.0, + "step": 70980 + }, + { + "entropy": 1.9509702712297439, + "epoch": 0.22006278872778814, + "grad_norm": 8.570839881896973, + "learning_rate": 5.392900807086495e-06, + "loss": 0.5279, + "mean_token_accuracy": 0.8445903062820435, + "num_tokens": 85380557.0, + "step": 70990 + }, + { + "entropy": 1.9639266401529312, + "epoch": 0.22009378785283784, + "grad_norm": 9.221185684204102, + "learning_rate": 5.392521006616488e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8395860701799392, + "num_tokens": 85392420.0, + "step": 71000 + }, + { + "entropy": 1.9102492406964302, + "epoch": 0.22012478697788754, + "grad_norm": 3.7444820404052734, + "learning_rate": 5.392141286378672e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8374612465500831, + "num_tokens": 85404213.0, + "step": 71010 + }, + { + "entropy": 1.8868770197033882, + "epoch": 0.22015578610293723, + "grad_norm": 2.842402219772339, + "learning_rate": 5.391761646344802e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8464625418186188, + "num_tokens": 85416634.0, + "step": 71020 + }, + { + "entropy": 1.8559535294771194, + "epoch": 0.22018678522798693, + "grad_norm": 8.821431159973145, + "learning_rate": 5.391382086486649e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8445154055953026, + "num_tokens": 85430022.0, + "step": 71030 + }, + { + "entropy": 1.9842773735523225, + "epoch": 0.22021778435303663, + "grad_norm": 8.328567504882812, + "learning_rate": 5.391002606775996e-06, + "loss": 0.5205, + "mean_token_accuracy": 0.8406598642468452, + "num_tokens": 85441410.0, + "step": 71040 + }, + { + "entropy": 1.9358683452010155, + "epoch": 0.22024878347808632, + "grad_norm": 9.156634330749512, + "learning_rate": 5.3906232071846385e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8384227842092514, + "num_tokens": 85453655.0, + "step": 71050 + }, + { + "entropy": 1.8562579780817032, + "epoch": 0.22027978260313602, + "grad_norm": 7.700805187225342, + "learning_rate": 5.390243887684392e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.854043036699295, + "num_tokens": 85466539.0, + "step": 71060 + }, + { + "entropy": 1.9701458364725113, + "epoch": 0.22031078172818572, + "grad_norm": 7.824542045593262, + "learning_rate": 5.38986464824708e-06, + "loss": 0.5203, + "mean_token_accuracy": 0.8268503859639168, + "num_tokens": 85478191.0, + "step": 71070 + }, + { + "entropy": 1.9239497467875482, + "epoch": 0.2203417808532354, + "grad_norm": 10.349506378173828, + "learning_rate": 5.3894854888445415e-06, + "loss": 0.5199, + "mean_token_accuracy": 0.8427152633666992, + "num_tokens": 85490546.0, + "step": 71080 + }, + { + "entropy": 1.9393352545797824, + "epoch": 0.2203727799782851, + "grad_norm": 8.5408935546875, + "learning_rate": 5.389106409448628e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8323530539870262, + "num_tokens": 85502425.0, + "step": 71090 + }, + { + "entropy": 1.997844734787941, + "epoch": 0.2204037791033348, + "grad_norm": 7.891308307647705, + "learning_rate": 5.38872741003121e-06, + "loss": 0.5502, + "mean_token_accuracy": 0.8281294673681259, + "num_tokens": 85513168.0, + "step": 71100 + }, + { + "entropy": 1.9478082045912744, + "epoch": 0.2204347782283845, + "grad_norm": 8.2448091506958, + "learning_rate": 5.388348490564164e-06, + "loss": 0.5257, + "mean_token_accuracy": 0.8402855768799782, + "num_tokens": 85524556.0, + "step": 71110 + }, + { + "entropy": 1.931041233241558, + "epoch": 0.2204657773534342, + "grad_norm": 9.231786727905273, + "learning_rate": 5.387969651019387e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8433810248970985, + "num_tokens": 85536049.0, + "step": 71120 + }, + { + "entropy": 1.9640550106763839, + "epoch": 0.2204967764784839, + "grad_norm": 8.980378150939941, + "learning_rate": 5.387590891368787e-06, + "loss": 0.5885, + "mean_token_accuracy": 0.8172830164432525, + "num_tokens": 85547228.0, + "step": 71130 + }, + { + "entropy": 1.8748082131147386, + "epoch": 0.2205277756035336, + "grad_norm": 8.130640983581543, + "learning_rate": 5.387212211584286e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.838733246922493, + "num_tokens": 85559551.0, + "step": 71140 + }, + { + "entropy": 1.8640730693936347, + "epoch": 0.2205587747285833, + "grad_norm": 7.803213596343994, + "learning_rate": 5.386833611637822e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.844725139439106, + "num_tokens": 85571361.0, + "step": 71150 + }, + { + "entropy": 1.8213941514492036, + "epoch": 0.220589773853633, + "grad_norm": 9.419188499450684, + "learning_rate": 5.386455091501342e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8451396659016609, + "num_tokens": 85584550.0, + "step": 71160 + }, + { + "entropy": 1.8446722000837326, + "epoch": 0.22062077297868268, + "grad_norm": 9.770685195922852, + "learning_rate": 5.3860766511468095e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.849824532866478, + "num_tokens": 85597173.0, + "step": 71170 + }, + { + "entropy": 1.8369273975491525, + "epoch": 0.22065177210373238, + "grad_norm": 7.802650451660156, + "learning_rate": 5.385698290546205e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8467299669981003, + "num_tokens": 85611218.0, + "step": 71180 + }, + { + "entropy": 1.9682567581534385, + "epoch": 0.22068277122878208, + "grad_norm": 8.1614408493042, + "learning_rate": 5.3853200096715175e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.8387658104300499, + "num_tokens": 85622538.0, + "step": 71190 + }, + { + "entropy": 1.8726427100598813, + "epoch": 0.22071377035383177, + "grad_norm": 7.950981616973877, + "learning_rate": 5.384941808494753e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8444047793745995, + "num_tokens": 85635421.0, + "step": 71200 + }, + { + "entropy": 1.893759909272194, + "epoch": 0.22074476947888147, + "grad_norm": 8.878076553344727, + "learning_rate": 5.384563686987928e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8406562879681587, + "num_tokens": 85648124.0, + "step": 71210 + }, + { + "entropy": 1.9666777163743974, + "epoch": 0.22077576860393117, + "grad_norm": 7.653510093688965, + "learning_rate": 5.384185645123078e-06, + "loss": 0.5379, + "mean_token_accuracy": 0.840550334751606, + "num_tokens": 85659636.0, + "step": 71220 + }, + { + "entropy": 1.898690427839756, + "epoch": 0.22080676772898086, + "grad_norm": 4.228546619415283, + "learning_rate": 5.383807682872247e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8438500598073005, + "num_tokens": 85671836.0, + "step": 71230 + }, + { + "entropy": 1.9357111945748329, + "epoch": 0.22083776685403053, + "grad_norm": 5.342586517333984, + "learning_rate": 5.383429800207497e-06, + "loss": 0.5501, + "mean_token_accuracy": 0.8345894768834115, + "num_tokens": 85684340.0, + "step": 71240 + }, + { + "entropy": 1.9341301143169403, + "epoch": 0.22086876597908023, + "grad_norm": 8.754364013671875, + "learning_rate": 5.3830519971009e-06, + "loss": 0.5377, + "mean_token_accuracy": 0.8313232839107514, + "num_tokens": 85695420.0, + "step": 71250 + }, + { + "entropy": 1.9368458151817323, + "epoch": 0.22089976510412992, + "grad_norm": 8.349538803100586, + "learning_rate": 5.382674273524544e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.8416445463895798, + "num_tokens": 85707345.0, + "step": 71260 + }, + { + "entropy": 1.7997360065579415, + "epoch": 0.22093076422917962, + "grad_norm": 8.042038917541504, + "learning_rate": 5.382296629450529e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.845326641201973, + "num_tokens": 85720742.0, + "step": 71270 + }, + { + "entropy": 1.958139282464981, + "epoch": 0.22096176335422932, + "grad_norm": 10.129528045654297, + "learning_rate": 5.3819190648509714e-06, + "loss": 0.555, + "mean_token_accuracy": 0.8313161879777908, + "num_tokens": 85730990.0, + "step": 71280 + }, + { + "entropy": 1.959393960237503, + "epoch": 0.22099276247927901, + "grad_norm": 8.495170593261719, + "learning_rate": 5.381541579697999e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.8422238364815712, + "num_tokens": 85742428.0, + "step": 71290 + }, + { + "entropy": 1.851896020770073, + "epoch": 0.2210237616043287, + "grad_norm": 8.348380088806152, + "learning_rate": 5.381164173963755e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8479127526283264, + "num_tokens": 85755347.0, + "step": 71300 + }, + { + "entropy": 1.875650581717491, + "epoch": 0.2210547607293784, + "grad_norm": 3.1359446048736572, + "learning_rate": 5.380786847620394e-06, + "loss": 0.5207, + "mean_token_accuracy": 0.8393297314643859, + "num_tokens": 85767657.0, + "step": 71310 + }, + { + "entropy": 1.8281921789050102, + "epoch": 0.2210857598544281, + "grad_norm": 9.828446388244629, + "learning_rate": 5.3804096006400844e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8533445596694946, + "num_tokens": 85780812.0, + "step": 71320 + }, + { + "entropy": 1.8348768278956413, + "epoch": 0.2211167589794778, + "grad_norm": 8.210378646850586, + "learning_rate": 5.380032432995013e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8492519795894623, + "num_tokens": 85794004.0, + "step": 71330 + }, + { + "entropy": 1.8405587255954743, + "epoch": 0.2211477581045275, + "grad_norm": 8.21066951751709, + "learning_rate": 5.379655344657373e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8470315858721733, + "num_tokens": 85807268.0, + "step": 71340 + }, + { + "entropy": 1.908400295674801, + "epoch": 0.2211787572295772, + "grad_norm": 8.391548156738281, + "learning_rate": 5.379278335599377e-06, + "loss": 0.5263, + "mean_token_accuracy": 0.8420346319675446, + "num_tokens": 85819177.0, + "step": 71350 + }, + { + "entropy": 1.929537844657898, + "epoch": 0.2212097563546269, + "grad_norm": 9.014862060546875, + "learning_rate": 5.378901405793249e-06, + "loss": 0.5488, + "mean_token_accuracy": 0.8366276562213898, + "num_tokens": 85830729.0, + "step": 71360 + }, + { + "entropy": 1.9423951536417008, + "epoch": 0.2212407554796766, + "grad_norm": 7.291965484619141, + "learning_rate": 5.378524555211225e-06, + "loss": 0.5274, + "mean_token_accuracy": 0.8441259831190109, + "num_tokens": 85841037.0, + "step": 71370 + }, + { + "entropy": 1.88891644179821, + "epoch": 0.22127175460472628, + "grad_norm": 8.193086624145508, + "learning_rate": 5.378147783825558e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8404372721910477, + "num_tokens": 85852678.0, + "step": 71380 + }, + { + "entropy": 1.911217801272869, + "epoch": 0.22130275372977598, + "grad_norm": 6.810231685638428, + "learning_rate": 5.3777710916085125e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8454770565032959, + "num_tokens": 85864412.0, + "step": 71390 + }, + { + "entropy": 1.9492921188473702, + "epoch": 0.22133375285482568, + "grad_norm": 9.480756759643555, + "learning_rate": 5.377394478532367e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.843833090364933, + "num_tokens": 85875825.0, + "step": 71400 + }, + { + "entropy": 1.8765543788671493, + "epoch": 0.22136475197987537, + "grad_norm": 10.186013221740723, + "learning_rate": 5.377017944569414e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8491380795836448, + "num_tokens": 85887745.0, + "step": 71410 + }, + { + "entropy": 1.9640293627977372, + "epoch": 0.22139575110492507, + "grad_norm": 4.53644323348999, + "learning_rate": 5.376641489691959e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8418907791376113, + "num_tokens": 85898854.0, + "step": 71420 + }, + { + "entropy": 1.9712566941976548, + "epoch": 0.22142675022997477, + "grad_norm": 8.63244915008545, + "learning_rate": 5.37626511387232e-06, + "loss": 0.5354, + "mean_token_accuracy": 0.8289510354399681, + "num_tokens": 85909616.0, + "step": 71430 + }, + { + "entropy": 1.9296884000301362, + "epoch": 0.22145774935502446, + "grad_norm": 8.477029800415039, + "learning_rate": 5.375888817082833e-06, + "loss": 0.5219, + "mean_token_accuracy": 0.8430458202958107, + "num_tokens": 85920633.0, + "step": 71440 + }, + { + "entropy": 1.8276248887181281, + "epoch": 0.22148874848007416, + "grad_norm": 4.144367694854736, + "learning_rate": 5.37551259929584e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8453167825937271, + "num_tokens": 85933483.0, + "step": 71450 + }, + { + "entropy": 1.9024877399206161, + "epoch": 0.22151974760512386, + "grad_norm": 7.674788475036621, + "learning_rate": 5.375136460483704e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8434097394347191, + "num_tokens": 85945186.0, + "step": 71460 + }, + { + "entropy": 1.8075610235333444, + "epoch": 0.22155074673017355, + "grad_norm": 8.375659942626953, + "learning_rate": 5.374760400618798e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8462614819407464, + "num_tokens": 85958094.0, + "step": 71470 + }, + { + "entropy": 1.8221330136060714, + "epoch": 0.22158174585522325, + "grad_norm": 7.434113025665283, + "learning_rate": 5.37438441967351e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8467920750379563, + "num_tokens": 85971956.0, + "step": 71480 + }, + { + "entropy": 1.9221985384821891, + "epoch": 0.22161274498027292, + "grad_norm": 8.287166595458984, + "learning_rate": 5.374008517620237e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.8432189226150513, + "num_tokens": 85982856.0, + "step": 71490 + }, + { + "entropy": 1.9569891929626464, + "epoch": 0.22164374410532262, + "grad_norm": 9.78246021270752, + "learning_rate": 5.373632694431396e-06, + "loss": 0.563, + "mean_token_accuracy": 0.826215885579586, + "num_tokens": 85994575.0, + "step": 71500 + }, + { + "entropy": 1.9260801374912262, + "epoch": 0.2216747432303723, + "grad_norm": 7.269822120666504, + "learning_rate": 5.373256950079414e-06, + "loss": 0.5272, + "mean_token_accuracy": 0.8399269118905067, + "num_tokens": 86006104.0, + "step": 71510 + }, + { + "entropy": 1.8692869395017624, + "epoch": 0.221705742355422, + "grad_norm": 8.818297386169434, + "learning_rate": 5.372881284536732e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.8455665573477745, + "num_tokens": 86019217.0, + "step": 71520 + }, + { + "entropy": 1.897246205806732, + "epoch": 0.2217367414804717, + "grad_norm": 7.545614242553711, + "learning_rate": 5.372505697775805e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8487001821398735, + "num_tokens": 86030738.0, + "step": 71530 + }, + { + "entropy": 1.922917690873146, + "epoch": 0.2217677406055214, + "grad_norm": 9.24394416809082, + "learning_rate": 5.372130189769099e-06, + "loss": 0.5345, + "mean_token_accuracy": 0.8366933539509773, + "num_tokens": 86042441.0, + "step": 71540 + }, + { + "entropy": 1.8682702884078026, + "epoch": 0.2217987397305711, + "grad_norm": 8.580945014953613, + "learning_rate": 5.371754760489097e-06, + "loss": 0.5302, + "mean_token_accuracy": 0.8417121097445488, + "num_tokens": 86054427.0, + "step": 71550 + }, + { + "entropy": 1.8668757036328316, + "epoch": 0.2218297388556208, + "grad_norm": 10.871688842773438, + "learning_rate": 5.371379409908294e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8479284450411797, + "num_tokens": 86066725.0, + "step": 71560 + }, + { + "entropy": 1.915563191473484, + "epoch": 0.2218607379806705, + "grad_norm": 10.612263679504395, + "learning_rate": 5.371004137999198e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.8447510316967964, + "num_tokens": 86078349.0, + "step": 71570 + }, + { + "entropy": 1.897247090935707, + "epoch": 0.2218917371057202, + "grad_norm": 8.9105863571167, + "learning_rate": 5.370628944734331e-06, + "loss": 0.5511, + "mean_token_accuracy": 0.8368135377764702, + "num_tokens": 86091101.0, + "step": 71580 + }, + { + "entropy": 1.6814722761511802, + "epoch": 0.22192273623076988, + "grad_norm": 3.993140697479248, + "learning_rate": 5.370253830086228e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8649098068475723, + "num_tokens": 86105722.0, + "step": 71590 + }, + { + "entropy": 1.9403679892420769, + "epoch": 0.22195373535581958, + "grad_norm": 8.66745662689209, + "learning_rate": 5.369878794027438e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8391989499330521, + "num_tokens": 86116963.0, + "step": 71600 + }, + { + "entropy": 1.8736673273146152, + "epoch": 0.22198473448086928, + "grad_norm": 9.14969539642334, + "learning_rate": 5.369503836530523e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8440220966935158, + "num_tokens": 86129346.0, + "step": 71610 + }, + { + "entropy": 1.901363869011402, + "epoch": 0.22201573360591897, + "grad_norm": 8.975959777832031, + "learning_rate": 5.369128957568058e-06, + "loss": 0.487, + "mean_token_accuracy": 0.843775762617588, + "num_tokens": 86141322.0, + "step": 71620 + }, + { + "entropy": 1.836611707508564, + "epoch": 0.22204673273096867, + "grad_norm": 3.2881882190704346, + "learning_rate": 5.368754157112632e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8536363363265991, + "num_tokens": 86154410.0, + "step": 71630 + }, + { + "entropy": 1.817706936597824, + "epoch": 0.22207773185601837, + "grad_norm": 7.758713722229004, + "learning_rate": 5.368379435136848e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8535854294896126, + "num_tokens": 86167987.0, + "step": 71640 + }, + { + "entropy": 1.8554053410887719, + "epoch": 0.22210873098106806, + "grad_norm": 3.6705055236816406, + "learning_rate": 5.368004791613321e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8438238322734832, + "num_tokens": 86180395.0, + "step": 71650 + }, + { + "entropy": 1.8957013592123986, + "epoch": 0.22213973010611776, + "grad_norm": 4.396717548370361, + "learning_rate": 5.36763022651468e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8409201994538307, + "num_tokens": 86192690.0, + "step": 71660 + }, + { + "entropy": 1.8508854925632476, + "epoch": 0.22217072923116746, + "grad_norm": 4.068726539611816, + "learning_rate": 5.367255739813568e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8503674864768982, + "num_tokens": 86205641.0, + "step": 71670 + }, + { + "entropy": 1.8594465300440788, + "epoch": 0.22220172835621715, + "grad_norm": 8.009425163269043, + "learning_rate": 5.3668813314826414e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8513711959123611, + "num_tokens": 86218005.0, + "step": 71680 + }, + { + "entropy": 1.9636985063552856, + "epoch": 0.22223272748126685, + "grad_norm": 9.654029846191406, + "learning_rate": 5.366507001494568e-06, + "loss": 0.5595, + "mean_token_accuracy": 0.8238719955086709, + "num_tokens": 86229342.0, + "step": 71690 + }, + { + "entropy": 1.8041048809885978, + "epoch": 0.22226372660631655, + "grad_norm": 8.672904014587402, + "learning_rate": 5.3661327498220305e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8550098910927773, + "num_tokens": 86241797.0, + "step": 71700 + }, + { + "entropy": 1.78623516112566, + "epoch": 0.22229472573136624, + "grad_norm": 8.161778450012207, + "learning_rate": 5.365758576437724e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8487135574221611, + "num_tokens": 86255083.0, + "step": 71710 + }, + { + "entropy": 1.904472067952156, + "epoch": 0.22232572485641594, + "grad_norm": 9.36921501159668, + "learning_rate": 5.365384481314359e-06, + "loss": 0.5776, + "mean_token_accuracy": 0.8299209102988243, + "num_tokens": 86266459.0, + "step": 71720 + }, + { + "entropy": 1.8936071269214154, + "epoch": 0.2223567239814656, + "grad_norm": 8.524149894714355, + "learning_rate": 5.365010464424658e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8460559591650962, + "num_tokens": 86278539.0, + "step": 71730 + }, + { + "entropy": 1.8948934614658355, + "epoch": 0.2223877231065153, + "grad_norm": 8.934950828552246, + "learning_rate": 5.364636525741356e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.838769344985485, + "num_tokens": 86290914.0, + "step": 71740 + }, + { + "entropy": 1.913631896674633, + "epoch": 0.222418722231565, + "grad_norm": 3.6874070167541504, + "learning_rate": 5.364262665237202e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8442276403307915, + "num_tokens": 86302745.0, + "step": 71750 + }, + { + "entropy": 1.92730543166399, + "epoch": 0.2224497213566147, + "grad_norm": 8.071089744567871, + "learning_rate": 5.363888882884958e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8468718230724335, + "num_tokens": 86314083.0, + "step": 71760 + }, + { + "entropy": 1.9443558216094972, + "epoch": 0.2224807204816644, + "grad_norm": 8.098677635192871, + "learning_rate": 5.363515178657401e-06, + "loss": 0.5392, + "mean_token_accuracy": 0.8254296198487282, + "num_tokens": 86325749.0, + "step": 71770 + }, + { + "entropy": 1.8496044874191284, + "epoch": 0.2225117196067141, + "grad_norm": 9.303723335266113, + "learning_rate": 5.36314155252732e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8531778752803802, + "num_tokens": 86339269.0, + "step": 71780 + }, + { + "entropy": 1.8947973191738128, + "epoch": 0.2225427187317638, + "grad_norm": 3.389211654663086, + "learning_rate": 5.362768004467516e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8556076481938362, + "num_tokens": 86351044.0, + "step": 71790 + }, + { + "entropy": 1.9089409783482552, + "epoch": 0.22257371785681349, + "grad_norm": 9.990097999572754, + "learning_rate": 5.362394534450803e-06, + "loss": 0.5464, + "mean_token_accuracy": 0.8289933249354362, + "num_tokens": 86362914.0, + "step": 71800 + }, + { + "entropy": 1.8263745561242104, + "epoch": 0.22260471698186318, + "grad_norm": 8.100030899047852, + "learning_rate": 5.362021142450014e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8659976094961166, + "num_tokens": 86375568.0, + "step": 71810 + }, + { + "entropy": 1.9488488882780075, + "epoch": 0.22263571610691288, + "grad_norm": 8.042920112609863, + "learning_rate": 5.361647828437985e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8422571077942849, + "num_tokens": 86386717.0, + "step": 71820 + }, + { + "entropy": 1.9189837276935577, + "epoch": 0.22266671523196258, + "grad_norm": 7.569156646728516, + "learning_rate": 5.361274592387578e-06, + "loss": 0.533, + "mean_token_accuracy": 0.8239206343889236, + "num_tokens": 86399499.0, + "step": 71830 + }, + { + "entropy": 1.8419446378946305, + "epoch": 0.22269771435701227, + "grad_norm": 3.669304609298706, + "learning_rate": 5.360901434271656e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8487843006849289, + "num_tokens": 86412252.0, + "step": 71840 + }, + { + "entropy": 1.8220521062612534, + "epoch": 0.22272871348206197, + "grad_norm": 5.168817043304443, + "learning_rate": 5.360528354063102e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8524244442582131, + "num_tokens": 86424935.0, + "step": 71850 + }, + { + "entropy": 1.836111642420292, + "epoch": 0.22275971260711167, + "grad_norm": 4.996338367462158, + "learning_rate": 5.360155351734812e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8411974281072616, + "num_tokens": 86438125.0, + "step": 71860 + }, + { + "entropy": 1.985448981821537, + "epoch": 0.22279071173216136, + "grad_norm": 8.615861892700195, + "learning_rate": 5.359782427259694e-06, + "loss": 0.5502, + "mean_token_accuracy": 0.8280460953712463, + "num_tokens": 86448970.0, + "step": 71870 + }, + { + "entropy": 1.8285388305783272, + "epoch": 0.22282171085721106, + "grad_norm": 4.234899520874023, + "learning_rate": 5.359409580610668e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8518346786499024, + "num_tokens": 86461661.0, + "step": 71880 + }, + { + "entropy": 2.012462750822306, + "epoch": 0.22285270998226075, + "grad_norm": 3.0699379444122314, + "learning_rate": 5.359036811760669e-06, + "loss": 0.5702, + "mean_token_accuracy": 0.8206193700432778, + "num_tokens": 86473970.0, + "step": 71890 + }, + { + "entropy": 1.941874098777771, + "epoch": 0.22288370910731045, + "grad_norm": 6.572078227996826, + "learning_rate": 5.358664120682644e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.8426774546504021, + "num_tokens": 86485521.0, + "step": 71900 + }, + { + "entropy": 1.9939334601163865, + "epoch": 0.22291470823236015, + "grad_norm": 9.066808700561523, + "learning_rate": 5.358291507349554e-06, + "loss": 0.5545, + "mean_token_accuracy": 0.8358783975243569, + "num_tokens": 86496329.0, + "step": 71910 + }, + { + "entropy": 1.8996268406510353, + "epoch": 0.22294570735740984, + "grad_norm": 7.218948841094971, + "learning_rate": 5.357918971734374e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8539462700486183, + "num_tokens": 86508655.0, + "step": 71920 + }, + { + "entropy": 1.8882664322853089, + "epoch": 0.22297670648245954, + "grad_norm": 7.739889621734619, + "learning_rate": 5.35754651381009e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8399605572223663, + "num_tokens": 86521278.0, + "step": 71930 + }, + { + "entropy": 1.89051483720541, + "epoch": 0.22300770560750924, + "grad_norm": 3.720012903213501, + "learning_rate": 5.357174133549702e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8391882091760635, + "num_tokens": 86533583.0, + "step": 71940 + }, + { + "entropy": 1.8325127944350244, + "epoch": 0.22303870473255893, + "grad_norm": 4.477136611938477, + "learning_rate": 5.356801830926224e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.837264607846737, + "num_tokens": 86546642.0, + "step": 71950 + }, + { + "entropy": 1.916852205991745, + "epoch": 0.22306970385760863, + "grad_norm": 8.114299774169922, + "learning_rate": 5.356429605912681e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8481098964810372, + "num_tokens": 86558470.0, + "step": 71960 + }, + { + "entropy": 1.869056871533394, + "epoch": 0.22310070298265833, + "grad_norm": 4.9179182052612305, + "learning_rate": 5.356057458482115e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8442103147506714, + "num_tokens": 86571713.0, + "step": 71970 + }, + { + "entropy": 1.9630917876958847, + "epoch": 0.223131702107708, + "grad_norm": 9.305983543395996, + "learning_rate": 5.355685388607575e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8500207409262657, + "num_tokens": 86582632.0, + "step": 71980 + }, + { + "entropy": 1.9708934336900712, + "epoch": 0.2231627012327577, + "grad_norm": 9.161256790161133, + "learning_rate": 5.3553133962621305e-06, + "loss": 0.5558, + "mean_token_accuracy": 0.8375864192843437, + "num_tokens": 86593114.0, + "step": 71990 + }, + { + "entropy": 1.892447827756405, + "epoch": 0.2231937003578074, + "grad_norm": 8.874557495117188, + "learning_rate": 5.35494148141886e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8418182253837585, + "num_tokens": 86605405.0, + "step": 72000 + }, + { + "entropy": 1.9689023926854134, + "epoch": 0.2232246994828571, + "grad_norm": 8.011334419250488, + "learning_rate": 5.354569644050853e-06, + "loss": 0.5485, + "mean_token_accuracy": 0.8215542823076248, + "num_tokens": 86616609.0, + "step": 72010 + }, + { + "entropy": 1.7857394725084306, + "epoch": 0.22325569860790678, + "grad_norm": 8.13255786895752, + "learning_rate": 5.354197884131216e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8540096297860146, + "num_tokens": 86630878.0, + "step": 72020 + }, + { + "entropy": 1.8428432375192643, + "epoch": 0.22328669773295648, + "grad_norm": 10.879768371582031, + "learning_rate": 5.353826201633068e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8473896354436874, + "num_tokens": 86643315.0, + "step": 72030 + }, + { + "entropy": 1.9004595071077346, + "epoch": 0.22331769685800618, + "grad_norm": 10.056571960449219, + "learning_rate": 5.3534545965295384e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8414341598749161, + "num_tokens": 86654883.0, + "step": 72040 + }, + { + "entropy": 1.8974163249135017, + "epoch": 0.22334869598305587, + "grad_norm": 4.734619140625, + "learning_rate": 5.353083068793772e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8393331229686737, + "num_tokens": 86666404.0, + "step": 72050 + }, + { + "entropy": 1.8827458634972571, + "epoch": 0.22337969510810557, + "grad_norm": 7.965644359588623, + "learning_rate": 5.352711618398927e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8474840223789215, + "num_tokens": 86678813.0, + "step": 72060 + }, + { + "entropy": 1.9129985481500626, + "epoch": 0.22341069423315527, + "grad_norm": 3.682452917098999, + "learning_rate": 5.352340245318172e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8426467835903168, + "num_tokens": 86691188.0, + "step": 72070 + }, + { + "entropy": 1.8033430457115174, + "epoch": 0.22344169335820496, + "grad_norm": 8.872450828552246, + "learning_rate": 5.351968949524691e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8609629422426224, + "num_tokens": 86704614.0, + "step": 72080 + }, + { + "entropy": 1.8621386557817459, + "epoch": 0.22347269248325466, + "grad_norm": 3.9304046630859375, + "learning_rate": 5.351597730991682e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8484826713800431, + "num_tokens": 86717142.0, + "step": 72090 + }, + { + "entropy": 1.978152585029602, + "epoch": 0.22350369160830436, + "grad_norm": 8.142024040222168, + "learning_rate": 5.351226589692352e-06, + "loss": 0.5474, + "mean_token_accuracy": 0.8380045786499977, + "num_tokens": 86727790.0, + "step": 72100 + }, + { + "entropy": 1.861580203473568, + "epoch": 0.22353469073335405, + "grad_norm": 9.076419830322266, + "learning_rate": 5.350855525599924e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8400816440582275, + "num_tokens": 86739827.0, + "step": 72110 + }, + { + "entropy": 1.813391050696373, + "epoch": 0.22356568985840375, + "grad_norm": 8.701443672180176, + "learning_rate": 5.350484538687634e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8458169072866439, + "num_tokens": 86752977.0, + "step": 72120 + }, + { + "entropy": 1.9347064226865769, + "epoch": 0.22359668898345345, + "grad_norm": 8.15721607208252, + "learning_rate": 5.350113628928731e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8386592581868172, + "num_tokens": 86764531.0, + "step": 72130 + }, + { + "entropy": 1.82811646014452, + "epoch": 0.22362768810850314, + "grad_norm": 3.8640589714050293, + "learning_rate": 5.349742796296475e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8612099349498749, + "num_tokens": 86777148.0, + "step": 72140 + }, + { + "entropy": 1.9188909232616425, + "epoch": 0.22365868723355284, + "grad_norm": 8.598167419433594, + "learning_rate": 5.349372040764139e-06, + "loss": 0.5201, + "mean_token_accuracy": 0.8361388146877289, + "num_tokens": 86788330.0, + "step": 72150 + }, + { + "entropy": 1.8747940585017204, + "epoch": 0.22368968635860254, + "grad_norm": 4.909088134765625, + "learning_rate": 5.349001362305013e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.836989839375019, + "num_tokens": 86800353.0, + "step": 72160 + }, + { + "entropy": 1.9335097655653954, + "epoch": 0.22372068548365223, + "grad_norm": 8.049686431884766, + "learning_rate": 5.348630760892396e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8366736158728599, + "num_tokens": 86811632.0, + "step": 72170 + }, + { + "entropy": 1.8739148452877998, + "epoch": 0.22375168460870193, + "grad_norm": 8.556473731994629, + "learning_rate": 5.3482602364996015e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.8425387993454934, + "num_tokens": 86824293.0, + "step": 72180 + }, + { + "entropy": 1.9024907439947127, + "epoch": 0.22378268373375163, + "grad_norm": 8.516105651855469, + "learning_rate": 5.347889789099956e-06, + "loss": 0.5412, + "mean_token_accuracy": 0.825544822216034, + "num_tokens": 86836349.0, + "step": 72190 + }, + { + "entropy": 1.897923794388771, + "epoch": 0.22381368285880132, + "grad_norm": 7.900509357452393, + "learning_rate": 5.347519418666795e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8481002271175384, + "num_tokens": 86847118.0, + "step": 72200 + }, + { + "entropy": 1.8177719444036484, + "epoch": 0.22384468198385102, + "grad_norm": 7.703773498535156, + "learning_rate": 5.347149125173477e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8518023356795311, + "num_tokens": 86860044.0, + "step": 72210 + }, + { + "entropy": 1.8685875788331032, + "epoch": 0.22387568110890071, + "grad_norm": 6.817202091217041, + "learning_rate": 5.3467789085933605e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8450110226869583, + "num_tokens": 86872544.0, + "step": 72220 + }, + { + "entropy": 1.9101796388626098, + "epoch": 0.22390668023395038, + "grad_norm": 8.793787002563477, + "learning_rate": 5.346408768899827e-06, + "loss": 0.538, + "mean_token_accuracy": 0.8359600931406022, + "num_tokens": 86884268.0, + "step": 72230 + }, + { + "entropy": 1.8767978221178054, + "epoch": 0.22393767935900008, + "grad_norm": 8.353222846984863, + "learning_rate": 5.3460387060662665e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8356214210391044, + "num_tokens": 86895860.0, + "step": 72240 + }, + { + "entropy": 1.8598055988550186, + "epoch": 0.22396867848404978, + "grad_norm": 8.898566246032715, + "learning_rate": 5.345668720066082e-06, + "loss": 0.5261, + "mean_token_accuracy": 0.8349154770374299, + "num_tokens": 86907668.0, + "step": 72250 + }, + { + "entropy": 1.9466781616210938, + "epoch": 0.22399967760909947, + "grad_norm": 8.537947654724121, + "learning_rate": 5.34529881087269e-06, + "loss": 0.5206, + "mean_token_accuracy": 0.8424281865358353, + "num_tokens": 86919261.0, + "step": 72260 + }, + { + "entropy": 1.89094198346138, + "epoch": 0.22403067673414917, + "grad_norm": 8.919629096984863, + "learning_rate": 5.344928978459521e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8445121660828591, + "num_tokens": 86930676.0, + "step": 72270 + }, + { + "entropy": 1.896059663593769, + "epoch": 0.22406167585919887, + "grad_norm": 8.955682754516602, + "learning_rate": 5.344559222800014e-06, + "loss": 0.5143, + "mean_token_accuracy": 0.8374183923006058, + "num_tokens": 86941738.0, + "step": 72280 + }, + { + "entropy": 1.883140115439892, + "epoch": 0.22409267498424856, + "grad_norm": 4.690348148345947, + "learning_rate": 5.344189543867627e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.844383516907692, + "num_tokens": 86953896.0, + "step": 72290 + }, + { + "entropy": 1.95089001506567, + "epoch": 0.22412367410929826, + "grad_norm": 10.894330024719238, + "learning_rate": 5.3438199416358285e-06, + "loss": 0.5338, + "mean_token_accuracy": 0.8346414759755134, + "num_tokens": 86965358.0, + "step": 72300 + }, + { + "entropy": 1.991261574625969, + "epoch": 0.22415467323434796, + "grad_norm": 8.711053848266602, + "learning_rate": 5.343450416078097e-06, + "loss": 0.5846, + "mean_token_accuracy": 0.8269345715641976, + "num_tokens": 86976161.0, + "step": 72310 + }, + { + "entropy": 1.9148213535547256, + "epoch": 0.22418567235939765, + "grad_norm": 8.75413703918457, + "learning_rate": 5.343080967167927e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.8315785259008408, + "num_tokens": 86988247.0, + "step": 72320 + }, + { + "entropy": 1.967397329211235, + "epoch": 0.22421667148444735, + "grad_norm": 8.436080932617188, + "learning_rate": 5.342711594878823e-06, + "loss": 0.5688, + "mean_token_accuracy": 0.8347294554114342, + "num_tokens": 86999086.0, + "step": 72330 + }, + { + "entropy": 2.000737062096596, + "epoch": 0.22424767060949705, + "grad_norm": 9.230005264282227, + "learning_rate": 5.342342299184309e-06, + "loss": 0.6598, + "mean_token_accuracy": 0.8300457715988159, + "num_tokens": 87010382.0, + "step": 72340 + }, + { + "entropy": 1.9067621529102325, + "epoch": 0.22427866973454674, + "grad_norm": 8.503578186035156, + "learning_rate": 5.341973080057913e-06, + "loss": 0.5206, + "mean_token_accuracy": 0.831950668990612, + "num_tokens": 87022828.0, + "step": 72350 + }, + { + "entropy": 1.9217410042881966, + "epoch": 0.22430966885959644, + "grad_norm": 9.517120361328125, + "learning_rate": 5.34160393747318e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8437418282032013, + "num_tokens": 87034285.0, + "step": 72360 + }, + { + "entropy": 1.8522989198565483, + "epoch": 0.22434066798464614, + "grad_norm": 9.239526748657227, + "learning_rate": 5.34123487140367e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8460891872644425, + "num_tokens": 87046754.0, + "step": 72370 + }, + { + "entropy": 1.9420243352651596, + "epoch": 0.22437166710969583, + "grad_norm": 8.482245445251465, + "learning_rate": 5.340865881822951e-06, + "loss": 0.5542, + "mean_token_accuracy": 0.8409739390015603, + "num_tokens": 87058047.0, + "step": 72380 + }, + { + "entropy": 1.9989713937044145, + "epoch": 0.22440266623474553, + "grad_norm": 10.160581588745117, + "learning_rate": 5.340496968704607e-06, + "loss": 0.6208, + "mean_token_accuracy": 0.82189432233572, + "num_tokens": 87068796.0, + "step": 72390 + }, + { + "entropy": 1.8937209725379944, + "epoch": 0.22443366535979523, + "grad_norm": 8.602519035339355, + "learning_rate": 5.340128132022235e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8464359551668167, + "num_tokens": 87081062.0, + "step": 72400 + }, + { + "entropy": 1.923159296810627, + "epoch": 0.22446466448484492, + "grad_norm": 3.398913860321045, + "learning_rate": 5.339759371749443e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8402744174003601, + "num_tokens": 87093053.0, + "step": 72410 + }, + { + "entropy": 1.8120801776647568, + "epoch": 0.22449566360989462, + "grad_norm": 4.736013412475586, + "learning_rate": 5.339390687859851e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8469538927078247, + "num_tokens": 87106652.0, + "step": 72420 + }, + { + "entropy": 1.976495975255966, + "epoch": 0.22452666273494432, + "grad_norm": 9.108383178710938, + "learning_rate": 5.339022080327097e-06, + "loss": 0.5906, + "mean_token_accuracy": 0.8242412880063057, + "num_tokens": 87117466.0, + "step": 72430 + }, + { + "entropy": 1.8691235825419426, + "epoch": 0.224557661859994, + "grad_norm": 3.6879398822784424, + "learning_rate": 5.338653549124824e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8466471612453461, + "num_tokens": 87130079.0, + "step": 72440 + }, + { + "entropy": 1.872739316523075, + "epoch": 0.2245886609850437, + "grad_norm": 8.234794616699219, + "learning_rate": 5.338285094226693e-06, + "loss": 0.5574, + "mean_token_accuracy": 0.8330441653728485, + "num_tokens": 87143430.0, + "step": 72450 + }, + { + "entropy": 1.911289119720459, + "epoch": 0.2246196601100934, + "grad_norm": 11.46174430847168, + "learning_rate": 5.337916715606378e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.8353003263473511, + "num_tokens": 87155144.0, + "step": 72460 + }, + { + "entropy": 1.9616554155945778, + "epoch": 0.22465065923514307, + "grad_norm": 7.765041351318359, + "learning_rate": 5.337548413237561e-06, + "loss": 0.5378, + "mean_token_accuracy": 0.8335460603237153, + "num_tokens": 87166518.0, + "step": 72470 + }, + { + "entropy": 1.9011431649327277, + "epoch": 0.22468165836019277, + "grad_norm": 8.76104736328125, + "learning_rate": 5.337180187093943e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8393405005335808, + "num_tokens": 87178008.0, + "step": 72480 + }, + { + "entropy": 1.8503070279955864, + "epoch": 0.22471265748524247, + "grad_norm": 10.410445213317871, + "learning_rate": 5.336812037149233e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8424379363656044, + "num_tokens": 87191011.0, + "step": 72490 + }, + { + "entropy": 1.9341685771942139, + "epoch": 0.22474365661029216, + "grad_norm": 3.9696474075317383, + "learning_rate": 5.336443963377155e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8476704210042953, + "num_tokens": 87202897.0, + "step": 72500 + }, + { + "entropy": 1.9511468440294266, + "epoch": 0.22477465573534186, + "grad_norm": 9.15610122680664, + "learning_rate": 5.336075965751444e-06, + "loss": 0.5751, + "mean_token_accuracy": 0.8327615946531296, + "num_tokens": 87213746.0, + "step": 72510 + }, + { + "entropy": 1.9269428536295892, + "epoch": 0.22480565486039156, + "grad_norm": 9.162718772888184, + "learning_rate": 5.335708044245848e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.840380209684372, + "num_tokens": 87225670.0, + "step": 72520 + }, + { + "entropy": 1.940378698706627, + "epoch": 0.22483665398544125, + "grad_norm": 6.751579284667969, + "learning_rate": 5.335340198834132e-06, + "loss": 0.532, + "mean_token_accuracy": 0.8332238659262657, + "num_tokens": 87236688.0, + "step": 72530 + }, + { + "entropy": 1.958859845995903, + "epoch": 0.22486765311049095, + "grad_norm": 10.634821891784668, + "learning_rate": 5.334972429490065e-06, + "loss": 0.5331, + "mean_token_accuracy": 0.8343855604529381, + "num_tokens": 87247914.0, + "step": 72540 + }, + { + "entropy": 1.9445181697607041, + "epoch": 0.22489865223554065, + "grad_norm": 9.388453483581543, + "learning_rate": 5.334604736187437e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8429577887058258, + "num_tokens": 87258915.0, + "step": 72550 + }, + { + "entropy": 1.920157741010189, + "epoch": 0.22492965136059034, + "grad_norm": 8.916130065917969, + "learning_rate": 5.334237118900046e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8405413702130318, + "num_tokens": 87271134.0, + "step": 72560 + }, + { + "entropy": 1.9094747498631477, + "epoch": 0.22496065048564004, + "grad_norm": 8.803078651428223, + "learning_rate": 5.333869577601703e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8441318541765213, + "num_tokens": 87282774.0, + "step": 72570 + }, + { + "entropy": 1.8855116859078407, + "epoch": 0.22499164961068974, + "grad_norm": 9.416301727294922, + "learning_rate": 5.333502112266234e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8383870780467987, + "num_tokens": 87295016.0, + "step": 72580 + }, + { + "entropy": 1.8912652000784873, + "epoch": 0.22502264873573943, + "grad_norm": 7.569722652435303, + "learning_rate": 5.333134722867477e-06, + "loss": 0.5354, + "mean_token_accuracy": 0.8289127513766289, + "num_tokens": 87307357.0, + "step": 72590 + }, + { + "entropy": 1.8563939124345779, + "epoch": 0.22505364786078913, + "grad_norm": 3.901181221008301, + "learning_rate": 5.332767409379278e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8382441207766533, + "num_tokens": 87321430.0, + "step": 72600 + }, + { + "entropy": 1.7983256116509438, + "epoch": 0.22508464698583883, + "grad_norm": 8.528167724609375, + "learning_rate": 5.332400171775503e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8504438042640686, + "num_tokens": 87334567.0, + "step": 72610 + }, + { + "entropy": 1.8781714573502541, + "epoch": 0.22511564611088852, + "grad_norm": 8.67835521697998, + "learning_rate": 5.332033010030026e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8410458341240883, + "num_tokens": 87347091.0, + "step": 72620 + }, + { + "entropy": 1.869327275454998, + "epoch": 0.22514664523593822, + "grad_norm": 7.16377592086792, + "learning_rate": 5.331665924116734e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8444514736533165, + "num_tokens": 87359313.0, + "step": 72630 + }, + { + "entropy": 1.9245384186506271, + "epoch": 0.22517764436098792, + "grad_norm": 8.11259651184082, + "learning_rate": 5.331298914009525e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8420319616794586, + "num_tokens": 87370592.0, + "step": 72640 + }, + { + "entropy": 1.9539393037557602, + "epoch": 0.2252086434860376, + "grad_norm": 8.318745613098145, + "learning_rate": 5.3309319796823165e-06, + "loss": 0.5345, + "mean_token_accuracy": 0.8356306239962578, + "num_tokens": 87381867.0, + "step": 72650 + }, + { + "entropy": 1.860501480102539, + "epoch": 0.2252396426110873, + "grad_norm": 11.79776668548584, + "learning_rate": 5.33056512110903e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8360062450170517, + "num_tokens": 87394488.0, + "step": 72660 + }, + { + "entropy": 1.9126732975244523, + "epoch": 0.225270641736137, + "grad_norm": 7.536765098571777, + "learning_rate": 5.330198338263605e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8378516137599945, + "num_tokens": 87406482.0, + "step": 72670 + }, + { + "entropy": 1.950472255051136, + "epoch": 0.2253016408611867, + "grad_norm": 6.861098766326904, + "learning_rate": 5.329831631119992e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.8359086707234382, + "num_tokens": 87417682.0, + "step": 72680 + }, + { + "entropy": 1.8644071131944657, + "epoch": 0.2253326399862364, + "grad_norm": 3.9185712337493896, + "learning_rate": 5.329464999652151e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8424749970436096, + "num_tokens": 87430239.0, + "step": 72690 + }, + { + "entropy": 1.8736916035413742, + "epoch": 0.2253636391112861, + "grad_norm": 8.611660957336426, + "learning_rate": 5.329098443834062e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.857531276345253, + "num_tokens": 87442834.0, + "step": 72700 + }, + { + "entropy": 1.903275479376316, + "epoch": 0.2253946382363358, + "grad_norm": 2.624007225036621, + "learning_rate": 5.328731963639709e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8541991651058197, + "num_tokens": 87453877.0, + "step": 72710 + }, + { + "entropy": 1.8286138609051705, + "epoch": 0.22542563736138546, + "grad_norm": 2.311256170272827, + "learning_rate": 5.328365559043095e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8497246339917183, + "num_tokens": 87466560.0, + "step": 72720 + }, + { + "entropy": 1.7663344264030456, + "epoch": 0.22545663648643516, + "grad_norm": 9.070472717285156, + "learning_rate": 5.327999230018231e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8562262952327728, + "num_tokens": 87480091.0, + "step": 72730 + }, + { + "entropy": 1.8812779873609542, + "epoch": 0.22548763561148485, + "grad_norm": 10.331746101379395, + "learning_rate": 5.327632976539146e-06, + "loss": 0.5269, + "mean_token_accuracy": 0.8345619887113571, + "num_tokens": 87492404.0, + "step": 72740 + }, + { + "entropy": 1.8396708235144614, + "epoch": 0.22551863473653455, + "grad_norm": 4.0618462562561035, + "learning_rate": 5.327266798579874e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8638743400573731, + "num_tokens": 87505236.0, + "step": 72750 + }, + { + "entropy": 1.8074708625674247, + "epoch": 0.22554963386158425, + "grad_norm": 4.086684703826904, + "learning_rate": 5.326900696114468e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8573868215084076, + "num_tokens": 87518657.0, + "step": 72760 + }, + { + "entropy": 1.912179996073246, + "epoch": 0.22558063298663394, + "grad_norm": 6.899926662445068, + "learning_rate": 5.326534669116988e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8395515128970146, + "num_tokens": 87530348.0, + "step": 72770 + }, + { + "entropy": 1.95667557567358, + "epoch": 0.22561163211168364, + "grad_norm": 7.5052361488342285, + "learning_rate": 5.326168717561514e-06, + "loss": 0.5686, + "mean_token_accuracy": 0.8295163378119469, + "num_tokens": 87541743.0, + "step": 72780 + }, + { + "entropy": 1.7888675391674043, + "epoch": 0.22564263123673334, + "grad_norm": 6.81760311126709, + "learning_rate": 5.325802841422131e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8517789766192436, + "num_tokens": 87554783.0, + "step": 72790 + }, + { + "entropy": 1.9340469419956208, + "epoch": 0.22567363036178303, + "grad_norm": 8.59512996673584, + "learning_rate": 5.325437040672939e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8468787103891373, + "num_tokens": 87565222.0, + "step": 72800 + }, + { + "entropy": 1.8804991841316223, + "epoch": 0.22570462948683273, + "grad_norm": 4.417396545410156, + "learning_rate": 5.3250713152880525e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8418024495244026, + "num_tokens": 87577513.0, + "step": 72810 + }, + { + "entropy": 1.889168956875801, + "epoch": 0.22573562861188243, + "grad_norm": 6.085160732269287, + "learning_rate": 5.3247056652415955e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8587741032242775, + "num_tokens": 87589761.0, + "step": 72820 + }, + { + "entropy": 1.8290639758110045, + "epoch": 0.22576662773693212, + "grad_norm": 9.560019493103027, + "learning_rate": 5.324340090507707e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8536442771553994, + "num_tokens": 87602453.0, + "step": 72830 + }, + { + "entropy": 1.9466052517294883, + "epoch": 0.22579762686198182, + "grad_norm": 10.381073951721191, + "learning_rate": 5.323974591060536e-06, + "loss": 0.5507, + "mean_token_accuracy": 0.8318102985620499, + "num_tokens": 87614036.0, + "step": 72840 + }, + { + "entropy": 1.8601842939853668, + "epoch": 0.22582862598703152, + "grad_norm": 7.986728668212891, + "learning_rate": 5.323609166874244e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8537519767880439, + "num_tokens": 87625934.0, + "step": 72850 + }, + { + "entropy": 1.9298621758818626, + "epoch": 0.2258596251120812, + "grad_norm": 7.938553810119629, + "learning_rate": 5.3232438179230086e-06, + "loss": 0.5259, + "mean_token_accuracy": 0.8388584434986115, + "num_tokens": 87637522.0, + "step": 72860 + }, + { + "entropy": 1.8683239459991454, + "epoch": 0.2258906242371309, + "grad_norm": 7.229837894439697, + "learning_rate": 5.322878544181015e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8578355640172959, + "num_tokens": 87649965.0, + "step": 72870 + }, + { + "entropy": 1.9087668746709823, + "epoch": 0.2259216233621806, + "grad_norm": 8.52033519744873, + "learning_rate": 5.322513345622464e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8454820603132248, + "num_tokens": 87662212.0, + "step": 72880 + }, + { + "entropy": 1.8324894472956657, + "epoch": 0.2259526224872303, + "grad_norm": 9.297342300415039, + "learning_rate": 5.322148222221568e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8492673173546791, + "num_tokens": 87675147.0, + "step": 72890 + }, + { + "entropy": 1.8852784946560859, + "epoch": 0.22598362161228, + "grad_norm": 3.968653440475464, + "learning_rate": 5.3217831739525515e-06, + "loss": 0.53, + "mean_token_accuracy": 0.8285475313663483, + "num_tokens": 87686898.0, + "step": 72900 + }, + { + "entropy": 1.9112021766602993, + "epoch": 0.2260146207373297, + "grad_norm": 4.359899044036865, + "learning_rate": 5.321418200789649e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.8378346741199494, + "num_tokens": 87699547.0, + "step": 72910 + }, + { + "entropy": 1.9921528965234756, + "epoch": 0.2260456198623794, + "grad_norm": 8.43260383605957, + "learning_rate": 5.321053302707114e-06, + "loss": 0.5883, + "mean_token_accuracy": 0.8277798056602478, + "num_tokens": 87710308.0, + "step": 72920 + }, + { + "entropy": 1.847052039206028, + "epoch": 0.2260766189874291, + "grad_norm": 8.61585521697998, + "learning_rate": 5.320688479679204e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.855183681845665, + "num_tokens": 87723526.0, + "step": 72930 + }, + { + "entropy": 1.9104703694581986, + "epoch": 0.2261076181124788, + "grad_norm": 9.710681915283203, + "learning_rate": 5.320323731680197e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8491173967719078, + "num_tokens": 87735916.0, + "step": 72940 + }, + { + "entropy": 1.9582516461610795, + "epoch": 0.22613861723752848, + "grad_norm": 7.941116809844971, + "learning_rate": 5.319959058684375e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8455424636602402, + "num_tokens": 87747201.0, + "step": 72950 + }, + { + "entropy": 1.8938917353749276, + "epoch": 0.22616961636257818, + "grad_norm": 8.022253036499023, + "learning_rate": 5.319594460666041e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.8349814653396607, + "num_tokens": 87758501.0, + "step": 72960 + }, + { + "entropy": 1.9582582622766496, + "epoch": 0.22620061548762785, + "grad_norm": 8.384016036987305, + "learning_rate": 5.319229937599502e-06, + "loss": 0.5318, + "mean_token_accuracy": 0.8396169364452362, + "num_tokens": 87769542.0, + "step": 72970 + }, + { + "entropy": 1.9805827289819717, + "epoch": 0.22623161461267755, + "grad_norm": 7.869809150695801, + "learning_rate": 5.318865489459086e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8493814244866371, + "num_tokens": 87779962.0, + "step": 72980 + }, + { + "entropy": 1.9055181667208672, + "epoch": 0.22626261373772724, + "grad_norm": 8.316291809082031, + "learning_rate": 5.3185011162191226e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8454388037323952, + "num_tokens": 87791102.0, + "step": 72990 + }, + { + "entropy": 1.9191015899181365, + "epoch": 0.22629361286277694, + "grad_norm": 7.545352458953857, + "learning_rate": 5.318136817853964e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.8402454987168312, + "num_tokens": 87803046.0, + "step": 73000 + }, + { + "entropy": 1.9037675946950912, + "epoch": 0.22632461198782663, + "grad_norm": 9.174158096313477, + "learning_rate": 5.317772594337969e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.8370854437351227, + "num_tokens": 87814781.0, + "step": 73010 + }, + { + "entropy": 1.941840186715126, + "epoch": 0.22635561111287633, + "grad_norm": 11.267866134643555, + "learning_rate": 5.317408445645512e-06, + "loss": 0.5302, + "mean_token_accuracy": 0.834077525138855, + "num_tokens": 87826000.0, + "step": 73020 + }, + { + "entropy": 1.9527758836746216, + "epoch": 0.22638661023792603, + "grad_norm": 7.3283514976501465, + "learning_rate": 5.3170443717509745e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8412612110376358, + "num_tokens": 87836886.0, + "step": 73030 + }, + { + "entropy": 1.8808117777109146, + "epoch": 0.22641760936297572, + "grad_norm": 10.775453567504883, + "learning_rate": 5.316680372628757e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8417520016431809, + "num_tokens": 87848489.0, + "step": 73040 + }, + { + "entropy": 1.8674452692270278, + "epoch": 0.22644860848802542, + "grad_norm": 4.4659223556518555, + "learning_rate": 5.316316448253266e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8506308153271676, + "num_tokens": 87860795.0, + "step": 73050 + }, + { + "entropy": 1.8244705572724342, + "epoch": 0.22647960761307512, + "grad_norm": 8.522405624389648, + "learning_rate": 5.315952598598925e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8474853068590165, + "num_tokens": 87873677.0, + "step": 73060 + }, + { + "entropy": 1.923226311802864, + "epoch": 0.22651060673812481, + "grad_norm": 7.727634906768799, + "learning_rate": 5.315588823640166e-06, + "loss": 0.5555, + "mean_token_accuracy": 0.8321527540683746, + "num_tokens": 87885403.0, + "step": 73070 + }, + { + "entropy": 1.8770409598946571, + "epoch": 0.2265416058631745, + "grad_norm": 7.156225681304932, + "learning_rate": 5.315225123351437e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8440602973103524, + "num_tokens": 87897387.0, + "step": 73080 + }, + { + "entropy": 1.8762115344405175, + "epoch": 0.2265726049882242, + "grad_norm": 7.807554721832275, + "learning_rate": 5.3148614977071956e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8331007972359658, + "num_tokens": 87910085.0, + "step": 73090 + }, + { + "entropy": 1.9102078214287759, + "epoch": 0.2266036041132739, + "grad_norm": 7.591177940368652, + "learning_rate": 5.314497946681913e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.8310081690549851, + "num_tokens": 87922858.0, + "step": 73100 + }, + { + "entropy": 1.9448012560606003, + "epoch": 0.2266346032383236, + "grad_norm": 8.734160423278809, + "learning_rate": 5.31413447025007e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8439796343445778, + "num_tokens": 87933821.0, + "step": 73110 + }, + { + "entropy": 1.8482655853033065, + "epoch": 0.2266656023633733, + "grad_norm": 6.68172550201416, + "learning_rate": 5.3137710683861635e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8570908337831498, + "num_tokens": 87946480.0, + "step": 73120 + }, + { + "entropy": 1.9056684881448747, + "epoch": 0.226696601488423, + "grad_norm": 7.89918327331543, + "learning_rate": 5.3134077410647004e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8478031545877457, + "num_tokens": 87958529.0, + "step": 73130 + }, + { + "entropy": 1.8404195442795754, + "epoch": 0.2267276006134727, + "grad_norm": 8.014739990234375, + "learning_rate": 5.3130444882601994e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8541824370622635, + "num_tokens": 87971303.0, + "step": 73140 + }, + { + "entropy": 1.8912012234330178, + "epoch": 0.2267585997385224, + "grad_norm": 9.917512893676758, + "learning_rate": 5.312681309947193e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8446354269981384, + "num_tokens": 87983955.0, + "step": 73150 + }, + { + "entropy": 1.9031813889741898, + "epoch": 0.22678959886357208, + "grad_norm": 8.86811637878418, + "learning_rate": 5.3123182061002245e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.8356417074799538, + "num_tokens": 87995437.0, + "step": 73160 + }, + { + "entropy": 1.9914841502904892, + "epoch": 0.22682059798862178, + "grad_norm": 8.847204208374023, + "learning_rate": 5.311955176693849e-06, + "loss": 0.5371, + "mean_token_accuracy": 0.8395547032356262, + "num_tokens": 88006625.0, + "step": 73170 + }, + { + "entropy": 1.9578437075018882, + "epoch": 0.22685159711367148, + "grad_norm": 8.657424926757812, + "learning_rate": 5.311592221702637e-06, + "loss": 0.5205, + "mean_token_accuracy": 0.8370061025023461, + "num_tokens": 88017862.0, + "step": 73180 + }, + { + "entropy": 1.843288530409336, + "epoch": 0.22688259623872117, + "grad_norm": 7.968525409698486, + "learning_rate": 5.311229341101166e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8541227072477341, + "num_tokens": 88030463.0, + "step": 73190 + }, + { + "entropy": 1.949458932876587, + "epoch": 0.22691359536377087, + "grad_norm": 8.945792198181152, + "learning_rate": 5.3108665348640306e-06, + "loss": 0.5382, + "mean_token_accuracy": 0.8297375872731209, + "num_tokens": 88041754.0, + "step": 73200 + }, + { + "entropy": 1.858209890127182, + "epoch": 0.22694459448882057, + "grad_norm": 4.584981441497803, + "learning_rate": 5.3105038029658355e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.849499624967575, + "num_tokens": 88054627.0, + "step": 73210 + }, + { + "entropy": 1.9475027039647101, + "epoch": 0.22697559361387024, + "grad_norm": 3.7373814582824707, + "learning_rate": 5.310141145381194e-06, + "loss": 0.5527, + "mean_token_accuracy": 0.8340747594833374, + "num_tokens": 88066580.0, + "step": 73220 + }, + { + "entropy": 1.8866298824548722, + "epoch": 0.22700659273891993, + "grad_norm": 8.264451026916504, + "learning_rate": 5.30977856208474e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8434574559330941, + "num_tokens": 88078483.0, + "step": 73230 + }, + { + "entropy": 1.8932233542203902, + "epoch": 0.22703759186396963, + "grad_norm": 12.378700256347656, + "learning_rate": 5.309416053051112e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8487199395895004, + "num_tokens": 88090079.0, + "step": 73240 + }, + { + "entropy": 1.8719527080655098, + "epoch": 0.22706859098901933, + "grad_norm": 9.49503231048584, + "learning_rate": 5.309053618254963e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.8392317876219749, + "num_tokens": 88103225.0, + "step": 73250 + }, + { + "entropy": 1.8768180578947067, + "epoch": 0.22709959011406902, + "grad_norm": 4.083436965942383, + "learning_rate": 5.308691257670956e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8495214492082596, + "num_tokens": 88115707.0, + "step": 73260 + }, + { + "entropy": 1.9521862715482712, + "epoch": 0.22713058923911872, + "grad_norm": 4.048558712005615, + "learning_rate": 5.308328971273773e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.8412155851721763, + "num_tokens": 88126836.0, + "step": 73270 + }, + { + "entropy": 1.9520546540617942, + "epoch": 0.22716158836416842, + "grad_norm": 4.938279151916504, + "learning_rate": 5.3079667590381004e-06, + "loss": 0.5319, + "mean_token_accuracy": 0.8300433591008186, + "num_tokens": 88138872.0, + "step": 73280 + }, + { + "entropy": 1.8862208157777787, + "epoch": 0.2271925874892181, + "grad_norm": 7.787830829620361, + "learning_rate": 5.3076046209386405e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8360537528991699, + "num_tokens": 88151734.0, + "step": 73290 + }, + { + "entropy": 1.9074003919959068, + "epoch": 0.2272235866142678, + "grad_norm": 7.785778045654297, + "learning_rate": 5.307242556950106e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8495361775159835, + "num_tokens": 88163810.0, + "step": 73300 + }, + { + "entropy": 1.9791472971439361, + "epoch": 0.2272545857393175, + "grad_norm": 8.446614265441895, + "learning_rate": 5.306880567047223e-06, + "loss": 0.6625, + "mean_token_accuracy": 0.8312054976820946, + "num_tokens": 88176079.0, + "step": 73310 + }, + { + "entropy": 1.8590899035334587, + "epoch": 0.2272855848643672, + "grad_norm": 4.120956897735596, + "learning_rate": 5.306518651204732e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8539949178695678, + "num_tokens": 88188688.0, + "step": 73320 + }, + { + "entropy": 1.92610152810812, + "epoch": 0.2273165839894169, + "grad_norm": 8.005793571472168, + "learning_rate": 5.306156809397379e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8419401630759239, + "num_tokens": 88200847.0, + "step": 73330 + }, + { + "entropy": 1.9598787605762482, + "epoch": 0.2273475831144666, + "grad_norm": 8.136838912963867, + "learning_rate": 5.305795041599927e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8463526502251625, + "num_tokens": 88212189.0, + "step": 73340 + }, + { + "entropy": 1.8861487179994583, + "epoch": 0.2273785822395163, + "grad_norm": 8.918937683105469, + "learning_rate": 5.30543334778715e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8471100255846977, + "num_tokens": 88223697.0, + "step": 73350 + }, + { + "entropy": 2.0112386524677275, + "epoch": 0.227409581364566, + "grad_norm": 8.22529411315918, + "learning_rate": 5.305071727933835e-06, + "loss": 0.5405, + "mean_token_accuracy": 0.8350095435976982, + "num_tokens": 88233981.0, + "step": 73360 + }, + { + "entropy": 1.9829117342829705, + "epoch": 0.22744058048961568, + "grad_norm": 8.34417724609375, + "learning_rate": 5.304710182014778e-06, + "loss": 0.524, + "mean_token_accuracy": 0.8332961440086365, + "num_tokens": 88245931.0, + "step": 73370 + }, + { + "entropy": 1.8644984647631646, + "epoch": 0.22747157961466538, + "grad_norm": 9.018733024597168, + "learning_rate": 5.304348710004791e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8427291125059128, + "num_tokens": 88258399.0, + "step": 73380 + }, + { + "entropy": 1.8523142382502555, + "epoch": 0.22750257873971508, + "grad_norm": 7.3071370124816895, + "learning_rate": 5.303987311878693e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.856279893219471, + "num_tokens": 88270512.0, + "step": 73390 + }, + { + "entropy": 1.948920576274395, + "epoch": 0.22753357786476477, + "grad_norm": 8.815826416015625, + "learning_rate": 5.303625987611321e-06, + "loss": 0.5292, + "mean_token_accuracy": 0.8310361623764038, + "num_tokens": 88282293.0, + "step": 73400 + }, + { + "entropy": 1.8547866210341453, + "epoch": 0.22756457698981447, + "grad_norm": 4.00172233581543, + "learning_rate": 5.30326473717752e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8407047167420387, + "num_tokens": 88295869.0, + "step": 73410 + }, + { + "entropy": 1.9882050842046737, + "epoch": 0.22759557611486417, + "grad_norm": 4.15764045715332, + "learning_rate": 5.3029035605521485e-06, + "loss": 0.5662, + "mean_token_accuracy": 0.824670821428299, + "num_tokens": 88307629.0, + "step": 73420 + }, + { + "entropy": 1.896325621008873, + "epoch": 0.22762657523991386, + "grad_norm": 7.764687538146973, + "learning_rate": 5.302542457710075e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8539949029684066, + "num_tokens": 88319325.0, + "step": 73430 + }, + { + "entropy": 1.9740970045328141, + "epoch": 0.22765757436496356, + "grad_norm": 8.697678565979004, + "learning_rate": 5.302181428626182e-06, + "loss": 0.5632, + "mean_token_accuracy": 0.8374375849962234, + "num_tokens": 88330494.0, + "step": 73440 + }, + { + "entropy": 1.9539963483810425, + "epoch": 0.22768857349001326, + "grad_norm": 9.064693450927734, + "learning_rate": 5.301820473275364e-06, + "loss": 0.5405, + "mean_token_accuracy": 0.8300345674157142, + "num_tokens": 88341702.0, + "step": 73450 + }, + { + "entropy": 1.8306120559573174, + "epoch": 0.22771957261506293, + "grad_norm": 9.045393943786621, + "learning_rate": 5.301459591632527e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.847631786763668, + "num_tokens": 88354694.0, + "step": 73460 + }, + { + "entropy": 1.9496394276618958, + "epoch": 0.22775057174011262, + "grad_norm": 9.405258178710938, + "learning_rate": 5.301098783672588e-06, + "loss": 0.5447, + "mean_token_accuracy": 0.837113332748413, + "num_tokens": 88365566.0, + "step": 73470 + }, + { + "entropy": 1.832155992090702, + "epoch": 0.22778157086516232, + "grad_norm": 8.784136772155762, + "learning_rate": 5.300738049370477e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8398911356925964, + "num_tokens": 88378055.0, + "step": 73480 + }, + { + "entropy": 1.8397590324282647, + "epoch": 0.22781256999021202, + "grad_norm": 10.566763877868652, + "learning_rate": 5.3003773887011364e-06, + "loss": 0.5284, + "mean_token_accuracy": 0.8380313396453858, + "num_tokens": 88391505.0, + "step": 73490 + }, + { + "entropy": 1.9307547613978386, + "epoch": 0.2278435691152617, + "grad_norm": 7.682570934295654, + "learning_rate": 5.3000168016395195e-06, + "loss": 0.529, + "mean_token_accuracy": 0.8304173111915588, + "num_tokens": 88402732.0, + "step": 73500 + }, + { + "entropy": 1.9600440084934234, + "epoch": 0.2278745682403114, + "grad_norm": 8.14580249786377, + "learning_rate": 5.299656288160591e-06, + "loss": 0.5391, + "mean_token_accuracy": 0.8324763268232346, + "num_tokens": 88414140.0, + "step": 73510 + }, + { + "entropy": 1.9035349115729332, + "epoch": 0.2279055673653611, + "grad_norm": 8.5955171585083, + "learning_rate": 5.299295848239329e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8473742425441741, + "num_tokens": 88425821.0, + "step": 73520 + }, + { + "entropy": 1.9537647753953933, + "epoch": 0.2279365664904108, + "grad_norm": 8.174680709838867, + "learning_rate": 5.298935481850723e-06, + "loss": 0.5177, + "mean_token_accuracy": 0.8415812566876412, + "num_tokens": 88436784.0, + "step": 73530 + }, + { + "entropy": 1.9054030612111093, + "epoch": 0.2279675656154605, + "grad_norm": 9.483674049377441, + "learning_rate": 5.2985751889697746e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8276939764618874, + "num_tokens": 88448671.0, + "step": 73540 + }, + { + "entropy": 1.7816397354006768, + "epoch": 0.2279985647405102, + "grad_norm": 9.179033279418945, + "learning_rate": 5.2982149695714964e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8552224412560463, + "num_tokens": 88463062.0, + "step": 73550 + }, + { + "entropy": 1.926460900902748, + "epoch": 0.2280295638655599, + "grad_norm": 8.515421867370605, + "learning_rate": 5.297854823630913e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8481031745672226, + "num_tokens": 88474333.0, + "step": 73560 + }, + { + "entropy": 1.944996650516987, + "epoch": 0.2280605629906096, + "grad_norm": 7.795518398284912, + "learning_rate": 5.2974947511230635e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.847733362019062, + "num_tokens": 88486183.0, + "step": 73570 + }, + { + "entropy": 1.924484086036682, + "epoch": 0.22809156211565929, + "grad_norm": 9.433268547058105, + "learning_rate": 5.297134752022996e-06, + "loss": 0.5195, + "mean_token_accuracy": 0.83953125923872, + "num_tokens": 88498763.0, + "step": 73580 + }, + { + "entropy": 1.8909065306186676, + "epoch": 0.22812256124070898, + "grad_norm": 4.520410537719727, + "learning_rate": 5.2967748263057685e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8501924559473991, + "num_tokens": 88511080.0, + "step": 73590 + }, + { + "entropy": 1.855413518846035, + "epoch": 0.22815356036575868, + "grad_norm": 7.426053047180176, + "learning_rate": 5.296414973946457e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.848104490339756, + "num_tokens": 88523301.0, + "step": 73600 + }, + { + "entropy": 1.9100253120064736, + "epoch": 0.22818455949080838, + "grad_norm": 3.6768534183502197, + "learning_rate": 5.2960551949201445e-06, + "loss": 0.5232, + "mean_token_accuracy": 0.831768749654293, + "num_tokens": 88535393.0, + "step": 73610 + }, + { + "entropy": 1.9303764522075653, + "epoch": 0.22821555861585807, + "grad_norm": 9.903493881225586, + "learning_rate": 5.295695489201927e-06, + "loss": 0.522, + "mean_token_accuracy": 0.8348423153162002, + "num_tokens": 88547298.0, + "step": 73620 + }, + { + "entropy": 1.8810430273413659, + "epoch": 0.22824655774090777, + "grad_norm": 8.662088394165039, + "learning_rate": 5.295335856766913e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.836891371011734, + "num_tokens": 88559818.0, + "step": 73630 + }, + { + "entropy": 1.9283401042222976, + "epoch": 0.22827755686595746, + "grad_norm": 8.903078079223633, + "learning_rate": 5.294976297590223e-06, + "loss": 0.5273, + "mean_token_accuracy": 0.8383133113384247, + "num_tokens": 88570970.0, + "step": 73640 + }, + { + "entropy": 1.9302942425012588, + "epoch": 0.22830855599100716, + "grad_norm": 8.803302764892578, + "learning_rate": 5.294616811646988e-06, + "loss": 0.5433, + "mean_token_accuracy": 0.833921717107296, + "num_tokens": 88582398.0, + "step": 73650 + }, + { + "entropy": 1.8926509320735931, + "epoch": 0.22833955511605686, + "grad_norm": 7.454058647155762, + "learning_rate": 5.294257398912351e-06, + "loss": 0.5225, + "mean_token_accuracy": 0.8442865967750549, + "num_tokens": 88594112.0, + "step": 73660 + }, + { + "entropy": 1.811450758576393, + "epoch": 0.22837055424110655, + "grad_norm": 3.648226499557495, + "learning_rate": 5.29389805936147e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8563667595386505, + "num_tokens": 88606926.0, + "step": 73670 + }, + { + "entropy": 1.927273753285408, + "epoch": 0.22840155336615625, + "grad_norm": 7.629867076873779, + "learning_rate": 5.293538792969509e-06, + "loss": 0.55, + "mean_token_accuracy": 0.8345969125628472, + "num_tokens": 88617995.0, + "step": 73680 + }, + { + "entropy": 1.866581754386425, + "epoch": 0.22843255249120595, + "grad_norm": 10.446778297424316, + "learning_rate": 5.293179599711649e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8569563254714012, + "num_tokens": 88630088.0, + "step": 73690 + }, + { + "entropy": 1.9038088709115981, + "epoch": 0.22846355161625564, + "grad_norm": 8.493614196777344, + "learning_rate": 5.29282047956308e-06, + "loss": 0.5245, + "mean_token_accuracy": 0.8368158757686615, + "num_tokens": 88642108.0, + "step": 73700 + }, + { + "entropy": 1.8956671848893165, + "epoch": 0.2284945507413053, + "grad_norm": 8.689042091369629, + "learning_rate": 5.2924614324990045e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8425026118755341, + "num_tokens": 88653698.0, + "step": 73710 + }, + { + "entropy": 1.8952711433172227, + "epoch": 0.228525549866355, + "grad_norm": 8.7789306640625, + "learning_rate": 5.292102458494637e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8493198037147522, + "num_tokens": 88665017.0, + "step": 73720 + }, + { + "entropy": 1.925877857208252, + "epoch": 0.2285565489914047, + "grad_norm": 9.960886001586914, + "learning_rate": 5.2917435575252045e-06, + "loss": 0.5461, + "mean_token_accuracy": 0.833611187338829, + "num_tokens": 88676741.0, + "step": 73730 + }, + { + "entropy": 1.9205459102988243, + "epoch": 0.2285875481164544, + "grad_norm": 4.133283615112305, + "learning_rate": 5.291384729565944e-06, + "loss": 0.525, + "mean_token_accuracy": 0.8389394223690033, + "num_tokens": 88688321.0, + "step": 73740 + }, + { + "entropy": 1.974983049929142, + "epoch": 0.2286185472415041, + "grad_norm": 8.154329299926758, + "learning_rate": 5.291025974592104e-06, + "loss": 0.582, + "mean_token_accuracy": 0.8283642366528511, + "num_tokens": 88700262.0, + "step": 73750 + }, + { + "entropy": 1.8185023814439774, + "epoch": 0.2286495463665538, + "grad_norm": 8.861298561096191, + "learning_rate": 5.290667292578948e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8509408429265022, + "num_tokens": 88713127.0, + "step": 73760 + }, + { + "entropy": 1.8385506071150304, + "epoch": 0.2286805454916035, + "grad_norm": 3.4774937629699707, + "learning_rate": 5.290308683501748e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8416382968425751, + "num_tokens": 88726534.0, + "step": 73770 + }, + { + "entropy": 2.0139817029237745, + "epoch": 0.2287115446166532, + "grad_norm": 9.335980415344238, + "learning_rate": 5.289950147335788e-06, + "loss": 0.5409, + "mean_token_accuracy": 0.8362293437123298, + "num_tokens": 88737523.0, + "step": 73780 + }, + { + "entropy": 1.7716790959239006, + "epoch": 0.2287425437417029, + "grad_norm": 2.6843504905700684, + "learning_rate": 5.2895916840563675e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8626002490520477, + "num_tokens": 88751293.0, + "step": 73790 + }, + { + "entropy": 1.9235130712389945, + "epoch": 0.22877354286675258, + "grad_norm": 9.159330368041992, + "learning_rate": 5.289233293638791e-06, + "loss": 0.5496, + "mean_token_accuracy": 0.8392347306013107, + "num_tokens": 88763247.0, + "step": 73800 + }, + { + "entropy": 1.8758223891258239, + "epoch": 0.22880454199180228, + "grad_norm": 7.979211330413818, + "learning_rate": 5.288874976058381e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8446521013975143, + "num_tokens": 88775511.0, + "step": 73810 + }, + { + "entropy": 1.9276006370782852, + "epoch": 0.22883554111685198, + "grad_norm": 7.672868251800537, + "learning_rate": 5.288516731290468e-06, + "loss": 0.514, + "mean_token_accuracy": 0.8356775805354119, + "num_tokens": 88786826.0, + "step": 73820 + }, + { + "entropy": 1.9099656268954277, + "epoch": 0.22886654024190167, + "grad_norm": 9.19827651977539, + "learning_rate": 5.288158559310397e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8499478042125702, + "num_tokens": 88799014.0, + "step": 73830 + }, + { + "entropy": 1.8794591814279555, + "epoch": 0.22889753936695137, + "grad_norm": 9.211488723754883, + "learning_rate": 5.287800460093521e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8426800593733788, + "num_tokens": 88811087.0, + "step": 73840 + }, + { + "entropy": 1.8956602096557618, + "epoch": 0.22892853849200107, + "grad_norm": 8.783346176147461, + "learning_rate": 5.287442433615207e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.8364124745130539, + "num_tokens": 88823246.0, + "step": 73850 + }, + { + "entropy": 1.8410479247570037, + "epoch": 0.22895953761705076, + "grad_norm": 7.995346546173096, + "learning_rate": 5.287084479850834e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8437046140432358, + "num_tokens": 88836150.0, + "step": 73860 + }, + { + "entropy": 1.9268400833010673, + "epoch": 0.22899053674210046, + "grad_norm": 6.432771682739258, + "learning_rate": 5.286726598775794e-06, + "loss": 0.559, + "mean_token_accuracy": 0.8304910391569138, + "num_tokens": 88846935.0, + "step": 73870 + }, + { + "entropy": 1.967933678627014, + "epoch": 0.22902153586715016, + "grad_norm": 8.738357543945312, + "learning_rate": 5.286368790365485e-06, + "loss": 0.5395, + "mean_token_accuracy": 0.8432664573192596, + "num_tokens": 88857965.0, + "step": 73880 + }, + { + "entropy": 1.8915558218955995, + "epoch": 0.22905253499219985, + "grad_norm": 8.925724983215332, + "learning_rate": 5.286011054595324e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8440207138657569, + "num_tokens": 88869852.0, + "step": 73890 + }, + { + "entropy": 1.9460553407669068, + "epoch": 0.22908353411724955, + "grad_norm": 10.097593307495117, + "learning_rate": 5.285653391440732e-06, + "loss": 0.5563, + "mean_token_accuracy": 0.8398781731724739, + "num_tokens": 88880507.0, + "step": 73900 + }, + { + "entropy": 1.9188208684325219, + "epoch": 0.22911453324229925, + "grad_norm": 7.534275531768799, + "learning_rate": 5.285295800877149e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.845045380294323, + "num_tokens": 88891773.0, + "step": 73910 + }, + { + "entropy": 1.9317274510860443, + "epoch": 0.22914553236734894, + "grad_norm": 7.515310764312744, + "learning_rate": 5.284938282880022e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.830031855404377, + "num_tokens": 88903628.0, + "step": 73920 + }, + { + "entropy": 1.8418959081172943, + "epoch": 0.22917653149239864, + "grad_norm": 10.1459379196167, + "learning_rate": 5.284580837424812e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8359077215194702, + "num_tokens": 88915956.0, + "step": 73930 + }, + { + "entropy": 1.9436316937208176, + "epoch": 0.22920753061744834, + "grad_norm": 9.291312217712402, + "learning_rate": 5.2842234644869895e-06, + "loss": 0.56, + "mean_token_accuracy": 0.8290729984641075, + "num_tokens": 88926709.0, + "step": 73940 + }, + { + "entropy": 1.997511848807335, + "epoch": 0.22923852974249803, + "grad_norm": 8.855123519897461, + "learning_rate": 5.283866164042037e-06, + "loss": 0.5755, + "mean_token_accuracy": 0.8371048003435135, + "num_tokens": 88937889.0, + "step": 73950 + }, + { + "entropy": 1.9249091997742653, + "epoch": 0.2292695288675477, + "grad_norm": 10.297185897827148, + "learning_rate": 5.283508936065452e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8376726984977723, + "num_tokens": 88950020.0, + "step": 73960 + }, + { + "entropy": 1.974366408586502, + "epoch": 0.2293005279925974, + "grad_norm": 9.853002548217773, + "learning_rate": 5.283151780532737e-06, + "loss": 0.5265, + "mean_token_accuracy": 0.8404949456453323, + "num_tokens": 88960953.0, + "step": 73970 + }, + { + "entropy": 1.9446517005562782, + "epoch": 0.2293315271176471, + "grad_norm": 7.498370170593262, + "learning_rate": 5.282794697419412e-06, + "loss": 0.5435, + "mean_token_accuracy": 0.8361097246408462, + "num_tokens": 88972293.0, + "step": 73980 + }, + { + "entropy": 1.9057669252157212, + "epoch": 0.2293625262426968, + "grad_norm": 9.31442928314209, + "learning_rate": 5.282437686701009e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8370606362819671, + "num_tokens": 88983890.0, + "step": 73990 + }, + { + "entropy": 1.9226253479719162, + "epoch": 0.2293935253677465, + "grad_norm": 8.424675941467285, + "learning_rate": 5.2820807483530635e-06, + "loss": 0.5432, + "mean_token_accuracy": 0.8275117412209511, + "num_tokens": 88995595.0, + "step": 74000 + }, + { + "entropy": 1.9122430935502053, + "epoch": 0.22942452449279618, + "grad_norm": 7.728245258331299, + "learning_rate": 5.281723882351132e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8327098101377487, + "num_tokens": 89008305.0, + "step": 74010 + }, + { + "entropy": 1.8688533097505569, + "epoch": 0.22945552361784588, + "grad_norm": 4.136612415313721, + "learning_rate": 5.281367088670779e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.852810050547123, + "num_tokens": 89020921.0, + "step": 74020 + }, + { + "entropy": 1.8244116827845573, + "epoch": 0.22948652274289558, + "grad_norm": 9.678080558776855, + "learning_rate": 5.281010367287579e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8560571730136871, + "num_tokens": 89034197.0, + "step": 74030 + }, + { + "entropy": 1.9837775856256485, + "epoch": 0.22951752186794527, + "grad_norm": 8.24919319152832, + "learning_rate": 5.280653718177119e-06, + "loss": 0.519, + "mean_token_accuracy": 0.83905139118433, + "num_tokens": 89044845.0, + "step": 74040 + }, + { + "entropy": 1.9067578569054604, + "epoch": 0.22954852099299497, + "grad_norm": 4.405426502227783, + "learning_rate": 5.2802971413149995e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8524753168225289, + "num_tokens": 89056288.0, + "step": 74050 + }, + { + "entropy": 1.8309132128953933, + "epoch": 0.22957952011804467, + "grad_norm": 7.820222854614258, + "learning_rate": 5.279940636676828e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8572394847869873, + "num_tokens": 89069166.0, + "step": 74060 + }, + { + "entropy": 1.9012271910905838, + "epoch": 0.22961051924309436, + "grad_norm": 9.459297180175781, + "learning_rate": 5.27958420423823e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8430428206920624, + "num_tokens": 89081030.0, + "step": 74070 + }, + { + "entropy": 1.937754437327385, + "epoch": 0.22964151836814406, + "grad_norm": 7.659812927246094, + "learning_rate": 5.279227843974837e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.8409286737442017, + "num_tokens": 89093165.0, + "step": 74080 + }, + { + "entropy": 1.8637029841542243, + "epoch": 0.22967251749319376, + "grad_norm": 8.62972354888916, + "learning_rate": 5.278871555862294e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.8332642212510109, + "num_tokens": 89105778.0, + "step": 74090 + }, + { + "entropy": 1.9526731699705124, + "epoch": 0.22970351661824345, + "grad_norm": 8.800896644592285, + "learning_rate": 5.278515339876257e-06, + "loss": 0.5603, + "mean_token_accuracy": 0.8348133593797684, + "num_tokens": 89116443.0, + "step": 74100 + }, + { + "entropy": 1.906109546124935, + "epoch": 0.22973451574329315, + "grad_norm": 8.518561363220215, + "learning_rate": 5.278159195992395e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.8357308685779572, + "num_tokens": 89128471.0, + "step": 74110 + }, + { + "entropy": 1.9231334283947945, + "epoch": 0.22976551486834285, + "grad_norm": 5.9242401123046875, + "learning_rate": 5.277803124186387e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.8310579225420952, + "num_tokens": 89140663.0, + "step": 74120 + }, + { + "entropy": 1.8632835254073143, + "epoch": 0.22979651399339254, + "grad_norm": 7.401041507720947, + "learning_rate": 5.277447124433924e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8411552801728248, + "num_tokens": 89153122.0, + "step": 74130 + }, + { + "entropy": 1.8588541388511657, + "epoch": 0.22982751311844224, + "grad_norm": 8.74480152130127, + "learning_rate": 5.277091196710709e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8416764706373214, + "num_tokens": 89166007.0, + "step": 74140 + }, + { + "entropy": 1.953472825884819, + "epoch": 0.22985851224349194, + "grad_norm": 10.276031494140625, + "learning_rate": 5.276735340992454e-06, + "loss": 0.5669, + "mean_token_accuracy": 0.835921137034893, + "num_tokens": 89177892.0, + "step": 74150 + }, + { + "entropy": 1.9886727631092072, + "epoch": 0.22988951136854163, + "grad_norm": 8.81214714050293, + "learning_rate": 5.276379557254886e-06, + "loss": 0.5502, + "mean_token_accuracy": 0.8341899603605271, + "num_tokens": 89188418.0, + "step": 74160 + }, + { + "entropy": 1.8839535772800446, + "epoch": 0.22992051049359133, + "grad_norm": 7.365606307983398, + "learning_rate": 5.276023845473741e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8486846655607223, + "num_tokens": 89201029.0, + "step": 74170 + }, + { + "entropy": 1.8606535702943803, + "epoch": 0.22995150961864103, + "grad_norm": 6.745984077453613, + "learning_rate": 5.275668205624769e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8506320536136627, + "num_tokens": 89213185.0, + "step": 74180 + }, + { + "entropy": 1.9073518499732018, + "epoch": 0.22998250874369072, + "grad_norm": 8.412069320678711, + "learning_rate": 5.275312637683727e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.839865280687809, + "num_tokens": 89225044.0, + "step": 74190 + }, + { + "entropy": 1.9035982072353363, + "epoch": 0.2300135078687404, + "grad_norm": 9.212803840637207, + "learning_rate": 5.274957141626388e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8478123605251312, + "num_tokens": 89236364.0, + "step": 74200 + }, + { + "entropy": 1.847705954313278, + "epoch": 0.2300445069937901, + "grad_norm": 8.281103134155273, + "learning_rate": 5.274601717428534e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8482792422175407, + "num_tokens": 89249228.0, + "step": 74210 + }, + { + "entropy": 1.8957882165908813, + "epoch": 0.23007550611883978, + "grad_norm": 9.440603256225586, + "learning_rate": 5.274246365065958e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.8471298664808273, + "num_tokens": 89260514.0, + "step": 74220 + }, + { + "entropy": 1.8138180442154408, + "epoch": 0.23010650524388948, + "grad_norm": 3.8155839443206787, + "learning_rate": 5.273891084514467e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8571711733937264, + "num_tokens": 89274014.0, + "step": 74230 + }, + { + "entropy": 1.913990643620491, + "epoch": 0.23013750436893918, + "grad_norm": 9.489317893981934, + "learning_rate": 5.273535875749878e-06, + "loss": 0.5619, + "mean_token_accuracy": 0.832446351647377, + "num_tokens": 89286179.0, + "step": 74240 + }, + { + "entropy": 1.921476237475872, + "epoch": 0.23016850349398887, + "grad_norm": 4.419705867767334, + "learning_rate": 5.273180738748017e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8520811811089516, + "num_tokens": 89298896.0, + "step": 74250 + }, + { + "entropy": 1.9407318532466888, + "epoch": 0.23019950261903857, + "grad_norm": 9.360230445861816, + "learning_rate": 5.2728256734847265e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8388498574495316, + "num_tokens": 89310196.0, + "step": 74260 + }, + { + "entropy": 1.822094811499119, + "epoch": 0.23023050174408827, + "grad_norm": 9.136000633239746, + "learning_rate": 5.272470679935853e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8359728991985321, + "num_tokens": 89323522.0, + "step": 74270 + }, + { + "entropy": 1.8379074111580849, + "epoch": 0.23026150086913796, + "grad_norm": 8.908660888671875, + "learning_rate": 5.2721157580772635e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8487562105059624, + "num_tokens": 89336641.0, + "step": 74280 + }, + { + "entropy": 1.9069863885641098, + "epoch": 0.23029249999418766, + "grad_norm": 9.512563705444336, + "learning_rate": 5.27176090788483e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8323235869407654, + "num_tokens": 89348756.0, + "step": 74290 + }, + { + "entropy": 1.9753981336951256, + "epoch": 0.23032349911923736, + "grad_norm": 8.980364799499512, + "learning_rate": 5.271406129334436e-06, + "loss": 0.5229, + "mean_token_accuracy": 0.8389256626367569, + "num_tokens": 89360026.0, + "step": 74300 + }, + { + "entropy": 1.9259448662400245, + "epoch": 0.23035449824428705, + "grad_norm": 8.680480003356934, + "learning_rate": 5.271051422401982e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8409480020403862, + "num_tokens": 89372328.0, + "step": 74310 + }, + { + "entropy": 1.9059909775853157, + "epoch": 0.23038549736933675, + "grad_norm": 3.84014892578125, + "learning_rate": 5.2706967870633704e-06, + "loss": 0.507, + "mean_token_accuracy": 0.8374297887086868, + "num_tokens": 89385781.0, + "step": 74320 + }, + { + "entropy": 1.9672224968671799, + "epoch": 0.23041649649438645, + "grad_norm": 8.767590522766113, + "learning_rate": 5.270342223294524e-06, + "loss": 0.5579, + "mean_token_accuracy": 0.8284627929329872, + "num_tokens": 89396717.0, + "step": 74330 + }, + { + "entropy": 1.9303541094064713, + "epoch": 0.23044749561943614, + "grad_norm": 7.687719345092773, + "learning_rate": 5.2699877310713735e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.846529133617878, + "num_tokens": 89408365.0, + "step": 74340 + }, + { + "entropy": 1.8513358294963838, + "epoch": 0.23047849474448584, + "grad_norm": 4.211158275604248, + "learning_rate": 5.26963331036986e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8417376056313515, + "num_tokens": 89420728.0, + "step": 74350 + }, + { + "entropy": 1.9158269882202148, + "epoch": 0.23050949386953554, + "grad_norm": 8.824563026428223, + "learning_rate": 5.2692789611659345e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8564553380012512, + "num_tokens": 89432042.0, + "step": 74360 + }, + { + "entropy": 1.8483793511986732, + "epoch": 0.23054049299458523, + "grad_norm": 8.350893020629883, + "learning_rate": 5.268924683435563e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8427222266793251, + "num_tokens": 89445106.0, + "step": 74370 + }, + { + "entropy": 1.9533838108181953, + "epoch": 0.23057149211963493, + "grad_norm": 7.688497066497803, + "learning_rate": 5.268570477154723e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8480934947729111, + "num_tokens": 89456517.0, + "step": 74380 + }, + { + "entropy": 1.7940612569451333, + "epoch": 0.23060249124468463, + "grad_norm": 10.271604537963867, + "learning_rate": 5.268216342299399e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8567822203040123, + "num_tokens": 89470177.0, + "step": 74390 + }, + { + "entropy": 1.905691820383072, + "epoch": 0.23063349036973432, + "grad_norm": 9.242095947265625, + "learning_rate": 5.267862278845591e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8492756888270379, + "num_tokens": 89481980.0, + "step": 74400 + }, + { + "entropy": 1.9306585028767587, + "epoch": 0.23066448949478402, + "grad_norm": 8.073739051818848, + "learning_rate": 5.267508286769307e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8462639227509499, + "num_tokens": 89493310.0, + "step": 74410 + }, + { + "entropy": 1.9487823605537415, + "epoch": 0.23069548861983372, + "grad_norm": 7.810174942016602, + "learning_rate": 5.267154366046571e-06, + "loss": 0.5225, + "mean_token_accuracy": 0.8337769046425819, + "num_tokens": 89504297.0, + "step": 74420 + }, + { + "entropy": 2.020465725660324, + "epoch": 0.2307264877448834, + "grad_norm": 8.833086967468262, + "learning_rate": 5.266800516653412e-06, + "loss": 0.5802, + "mean_token_accuracy": 0.8244875445961952, + "num_tokens": 89515014.0, + "step": 74430 + }, + { + "entropy": 1.8529538474977016, + "epoch": 0.2307574868699331, + "grad_norm": 8.668907165527344, + "learning_rate": 5.266446738565875e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8453396156430244, + "num_tokens": 89527739.0, + "step": 74440 + }, + { + "entropy": 1.938523431122303, + "epoch": 0.23078848599498278, + "grad_norm": 3.760244846343994, + "learning_rate": 5.266093031760013e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.8392288982868195, + "num_tokens": 89539449.0, + "step": 74450 + }, + { + "entropy": 1.8795533359050751, + "epoch": 0.23081948512003247, + "grad_norm": 8.304437637329102, + "learning_rate": 5.265739396211895e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8489022374153137, + "num_tokens": 89551459.0, + "step": 74460 + }, + { + "entropy": 1.8862609788775444, + "epoch": 0.23085048424508217, + "grad_norm": 8.59811782836914, + "learning_rate": 5.265385831897596e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8412604182958603, + "num_tokens": 89563656.0, + "step": 74470 + }, + { + "entropy": 1.906290727853775, + "epoch": 0.23088148337013187, + "grad_norm": 8.959325790405273, + "learning_rate": 5.2650323387932055e-06, + "loss": 0.5424, + "mean_token_accuracy": 0.8369390457868576, + "num_tokens": 89575960.0, + "step": 74480 + }, + { + "entropy": 1.8360799536108972, + "epoch": 0.23091248249518156, + "grad_norm": 8.049399375915527, + "learning_rate": 5.264678916874822e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8528993338346481, + "num_tokens": 89588989.0, + "step": 74490 + }, + { + "entropy": 1.9210985012352466, + "epoch": 0.23094348162023126, + "grad_norm": 9.01938533782959, + "learning_rate": 5.264325566118559e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8337433129549027, + "num_tokens": 89601183.0, + "step": 74500 + }, + { + "entropy": 1.925881953537464, + "epoch": 0.23097448074528096, + "grad_norm": 9.546213150024414, + "learning_rate": 5.263972286500535e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.8354784622788429, + "num_tokens": 89612244.0, + "step": 74510 + }, + { + "entropy": 1.85896704941988, + "epoch": 0.23100547987033065, + "grad_norm": 9.076498031616211, + "learning_rate": 5.263619077996888e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8546704173088073, + "num_tokens": 89623851.0, + "step": 74520 + }, + { + "entropy": 1.9326560199260712, + "epoch": 0.23103647899538035, + "grad_norm": 3.5278940200805664, + "learning_rate": 5.263265940583757e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8429828837513924, + "num_tokens": 89635763.0, + "step": 74530 + }, + { + "entropy": 1.8581456407904624, + "epoch": 0.23106747812043005, + "grad_norm": 3.4591639041900635, + "learning_rate": 5.262912874237302e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8445979550480842, + "num_tokens": 89648396.0, + "step": 74540 + }, + { + "entropy": 1.983565354347229, + "epoch": 0.23109847724547974, + "grad_norm": 9.090567588806152, + "learning_rate": 5.262559878933689e-06, + "loss": 0.54, + "mean_token_accuracy": 0.8390719383955002, + "num_tokens": 89659197.0, + "step": 74550 + }, + { + "entropy": 1.8371982112526895, + "epoch": 0.23112947637052944, + "grad_norm": 8.092364311218262, + "learning_rate": 5.262206954649097e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8549615368247032, + "num_tokens": 89671458.0, + "step": 74560 + }, + { + "entropy": 1.8131123587489129, + "epoch": 0.23116047549557914, + "grad_norm": 4.235384941101074, + "learning_rate": 5.2618541013597135e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8519744485616684, + "num_tokens": 89684238.0, + "step": 74570 + }, + { + "entropy": 1.8373139381408692, + "epoch": 0.23119147462062883, + "grad_norm": 7.887211322784424, + "learning_rate": 5.26150131904174e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8530314326286316, + "num_tokens": 89697119.0, + "step": 74580 + }, + { + "entropy": 1.9169560462236404, + "epoch": 0.23122247374567853, + "grad_norm": 7.602973937988281, + "learning_rate": 5.261148607671387e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8439723640680313, + "num_tokens": 89708694.0, + "step": 74590 + }, + { + "entropy": 1.9264726474881173, + "epoch": 0.23125347287072823, + "grad_norm": 8.548664093017578, + "learning_rate": 5.26079596722488e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.844902828335762, + "num_tokens": 89720007.0, + "step": 74600 + }, + { + "entropy": 1.9451578676700592, + "epoch": 0.23128447199577792, + "grad_norm": 8.304434776306152, + "learning_rate": 5.260443397678451e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8561795771121978, + "num_tokens": 89731316.0, + "step": 74610 + }, + { + "entropy": 1.9096089273691177, + "epoch": 0.23131547112082762, + "grad_norm": 5.2652459144592285, + "learning_rate": 5.260090899008346e-06, + "loss": 0.5157, + "mean_token_accuracy": 0.8433238387107849, + "num_tokens": 89742801.0, + "step": 74620 + }, + { + "entropy": 1.9577663838863373, + "epoch": 0.23134647024587732, + "grad_norm": 9.235634803771973, + "learning_rate": 5.25973847119082e-06, + "loss": 0.5354, + "mean_token_accuracy": 0.8379147708415985, + "num_tokens": 89754662.0, + "step": 74630 + }, + { + "entropy": 1.8390830487012864, + "epoch": 0.231377469370927, + "grad_norm": 10.85385799407959, + "learning_rate": 5.259386114202142e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8555236831307411, + "num_tokens": 89767357.0, + "step": 74640 + }, + { + "entropy": 1.8444815009832383, + "epoch": 0.2314084684959767, + "grad_norm": 7.597925186157227, + "learning_rate": 5.25903382801859e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8467239618301392, + "num_tokens": 89779687.0, + "step": 74650 + }, + { + "entropy": 1.8531509324908257, + "epoch": 0.2314394676210264, + "grad_norm": 9.575505256652832, + "learning_rate": 5.2586816126164544e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.846321189403534, + "num_tokens": 89792615.0, + "step": 74660 + }, + { + "entropy": 1.8784852564334868, + "epoch": 0.2314704667460761, + "grad_norm": 6.865838527679443, + "learning_rate": 5.258329467972034e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8459220543503762, + "num_tokens": 89804495.0, + "step": 74670 + }, + { + "entropy": 1.8415795341134071, + "epoch": 0.2315014658711258, + "grad_norm": 4.12829065322876, + "learning_rate": 5.257977394061643e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8459180906414986, + "num_tokens": 89816717.0, + "step": 74680 + }, + { + "entropy": 1.8776524886488914, + "epoch": 0.2315324649961755, + "grad_norm": 8.387575149536133, + "learning_rate": 5.257625390861604e-06, + "loss": 0.5371, + "mean_token_accuracy": 0.8276556923985481, + "num_tokens": 89828650.0, + "step": 74690 + }, + { + "entropy": 1.8143211975693703, + "epoch": 0.23156346412122517, + "grad_norm": 9.098214149475098, + "learning_rate": 5.25727345834825e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8600962340831757, + "num_tokens": 89841929.0, + "step": 74700 + }, + { + "entropy": 1.9715292781591416, + "epoch": 0.23159446324627486, + "grad_norm": 8.993027687072754, + "learning_rate": 5.256921596497926e-06, + "loss": 0.5908, + "mean_token_accuracy": 0.8187263280153274, + "num_tokens": 89853031.0, + "step": 74710 + }, + { + "entropy": 1.8571769908070563, + "epoch": 0.23162546237132456, + "grad_norm": 9.328154563903809, + "learning_rate": 5.256569805286989e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8409965321421623, + "num_tokens": 89866198.0, + "step": 74720 + }, + { + "entropy": 1.930174747109413, + "epoch": 0.23165646149637426, + "grad_norm": 8.642340660095215, + "learning_rate": 5.256218084691808e-06, + "loss": 0.532, + "mean_token_accuracy": 0.8321129947900772, + "num_tokens": 89877322.0, + "step": 74730 + }, + { + "entropy": 1.8410791546106338, + "epoch": 0.23168746062142395, + "grad_norm": 7.149337291717529, + "learning_rate": 5.255866434688759e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8385511308908462, + "num_tokens": 89890110.0, + "step": 74740 + }, + { + "entropy": 1.8869677096605302, + "epoch": 0.23171845974647365, + "grad_norm": 4.138514518737793, + "learning_rate": 5.255514855254232e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8496659383177757, + "num_tokens": 89901498.0, + "step": 74750 + }, + { + "entropy": 1.9108739644289017, + "epoch": 0.23174945887152335, + "grad_norm": 3.6030426025390625, + "learning_rate": 5.255163346364628e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8481365412473678, + "num_tokens": 89913794.0, + "step": 74760 + }, + { + "entropy": 1.8742002308368684, + "epoch": 0.23178045799657304, + "grad_norm": 4.183920383453369, + "learning_rate": 5.25481190799636e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.854676042497158, + "num_tokens": 89926016.0, + "step": 74770 + }, + { + "entropy": 1.8252388328313827, + "epoch": 0.23181145712162274, + "grad_norm": 8.657533645629883, + "learning_rate": 5.254460540125848e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.855050428211689, + "num_tokens": 89939164.0, + "step": 74780 + }, + { + "entropy": 1.891065989434719, + "epoch": 0.23184245624667243, + "grad_norm": 7.831802845001221, + "learning_rate": 5.254109242729526e-06, + "loss": 0.5505, + "mean_token_accuracy": 0.837081053853035, + "num_tokens": 89950621.0, + "step": 74790 + }, + { + "entropy": 1.8497989937663077, + "epoch": 0.23187345537172213, + "grad_norm": 9.593158721923828, + "learning_rate": 5.2537580157838395e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8481601446866989, + "num_tokens": 89962644.0, + "step": 74800 + }, + { + "entropy": 1.9117231592535973, + "epoch": 0.23190445449677183, + "grad_norm": 8.208306312561035, + "learning_rate": 5.253406859265246e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8326343983411789, + "num_tokens": 89973841.0, + "step": 74810 + }, + { + "entropy": 1.8912553757429122, + "epoch": 0.23193545362182152, + "grad_norm": 6.744791507720947, + "learning_rate": 5.253055773150209e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8465361073613167, + "num_tokens": 89985153.0, + "step": 74820 + }, + { + "entropy": 1.9291711524128914, + "epoch": 0.23196645274687122, + "grad_norm": 8.88097858428955, + "learning_rate": 5.252704757415207e-06, + "loss": 0.5299, + "mean_token_accuracy": 0.8393183097243309, + "num_tokens": 89996749.0, + "step": 74830 + }, + { + "entropy": 1.9174196228384972, + "epoch": 0.23199745187192092, + "grad_norm": 8.680536270141602, + "learning_rate": 5.25235381203673e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8294728398323059, + "num_tokens": 90008694.0, + "step": 74840 + }, + { + "entropy": 1.8946995601058005, + "epoch": 0.23202845099697061, + "grad_norm": 7.604076385498047, + "learning_rate": 5.252002936991277e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.843429696559906, + "num_tokens": 90021433.0, + "step": 74850 + }, + { + "entropy": 1.8540204659104347, + "epoch": 0.2320594501220203, + "grad_norm": 4.1174635887146, + "learning_rate": 5.251652132255359e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.851498831808567, + "num_tokens": 90032932.0, + "step": 74860 + }, + { + "entropy": 1.823809015750885, + "epoch": 0.23209044924707, + "grad_norm": 8.341029167175293, + "learning_rate": 5.251301397805497e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8412299945950508, + "num_tokens": 90045600.0, + "step": 74870 + }, + { + "entropy": 1.804415312409401, + "epoch": 0.2321214483721197, + "grad_norm": 9.450553894042969, + "learning_rate": 5.250950733618225e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8472607269883156, + "num_tokens": 90059093.0, + "step": 74880 + }, + { + "entropy": 1.8313181832432748, + "epoch": 0.2321524474971694, + "grad_norm": 8.513847351074219, + "learning_rate": 5.250600139670086e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8442484304308892, + "num_tokens": 90071931.0, + "step": 74890 + }, + { + "entropy": 1.805875188112259, + "epoch": 0.2321834466222191, + "grad_norm": 8.688248634338379, + "learning_rate": 5.2502496159376335e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8494523644447327, + "num_tokens": 90085027.0, + "step": 74900 + }, + { + "entropy": 1.8620700597763062, + "epoch": 0.2322144457472688, + "grad_norm": 7.525083065032959, + "learning_rate": 5.249899162397435e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8424142166972161, + "num_tokens": 90097799.0, + "step": 74910 + }, + { + "entropy": 1.8805269077420235, + "epoch": 0.2322454448723185, + "grad_norm": 9.527615547180176, + "learning_rate": 5.249548779026064e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8365740343928337, + "num_tokens": 90110343.0, + "step": 74920 + }, + { + "entropy": 1.88951036632061, + "epoch": 0.2322764439973682, + "grad_norm": 8.741109848022461, + "learning_rate": 5.249198465800112e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8468944728374481, + "num_tokens": 90122279.0, + "step": 74930 + }, + { + "entropy": 1.921808835864067, + "epoch": 0.23230744312241786, + "grad_norm": 7.858125686645508, + "learning_rate": 5.248848222696175e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8521002262830735, + "num_tokens": 90133878.0, + "step": 74940 + }, + { + "entropy": 1.9586533635854722, + "epoch": 0.23233844224746755, + "grad_norm": 7.217672348022461, + "learning_rate": 5.248498049690861e-06, + "loss": 0.5898, + "mean_token_accuracy": 0.8337603956460953, + "num_tokens": 90144689.0, + "step": 74950 + }, + { + "entropy": 1.942648607492447, + "epoch": 0.23236944137251725, + "grad_norm": 8.679248809814453, + "learning_rate": 5.248147946760793e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.8394230246543884, + "num_tokens": 90155720.0, + "step": 74960 + }, + { + "entropy": 1.791871838271618, + "epoch": 0.23240044049756695, + "grad_norm": 4.113439559936523, + "learning_rate": 5.247797913882602e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8623890161514283, + "num_tokens": 90169661.0, + "step": 74970 + }, + { + "entropy": 1.8388600483536721, + "epoch": 0.23243143962261664, + "grad_norm": 3.5801477432250977, + "learning_rate": 5.247447951032928e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8412294924259186, + "num_tokens": 90182346.0, + "step": 74980 + }, + { + "entropy": 1.814172099530697, + "epoch": 0.23246243874766634, + "grad_norm": 3.8935399055480957, + "learning_rate": 5.247098058188425e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8537375554442406, + "num_tokens": 90195133.0, + "step": 74990 + }, + { + "entropy": 1.9371330052614213, + "epoch": 0.23249343787271604, + "grad_norm": 7.545095443725586, + "learning_rate": 5.246748235325756e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8489918291568757, + "num_tokens": 90205935.0, + "step": 75000 + }, + { + "entropy": 1.9675442904233933, + "epoch": 0.23252443699776573, + "grad_norm": 7.987549304962158, + "learning_rate": 5.246398482421598e-06, + "loss": 0.555, + "mean_token_accuracy": 0.822739377617836, + "num_tokens": 90216793.0, + "step": 75010 + }, + { + "entropy": 1.9193257197737694, + "epoch": 0.23255543612281543, + "grad_norm": 8.14727783203125, + "learning_rate": 5.246048799452634e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8483793914318085, + "num_tokens": 90227782.0, + "step": 75020 + }, + { + "entropy": 1.8773325115442276, + "epoch": 0.23258643524786513, + "grad_norm": 8.173250198364258, + "learning_rate": 5.245699186395562e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8404614195227623, + "num_tokens": 90239587.0, + "step": 75030 + }, + { + "entropy": 1.78584865629673, + "epoch": 0.23261743437291482, + "grad_norm": 8.468997955322266, + "learning_rate": 5.24534964322709e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8459458023309707, + "num_tokens": 90253213.0, + "step": 75040 + }, + { + "entropy": 1.860848817229271, + "epoch": 0.23264843349796452, + "grad_norm": 8.477463722229004, + "learning_rate": 5.245000169923935e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8364793375134468, + "num_tokens": 90265280.0, + "step": 75050 + }, + { + "entropy": 1.9089404866099358, + "epoch": 0.23267943262301422, + "grad_norm": 8.954151153564453, + "learning_rate": 5.244650766462827e-06, + "loss": 0.5373, + "mean_token_accuracy": 0.8399009183049202, + "num_tokens": 90276657.0, + "step": 75060 + }, + { + "entropy": 1.9542981505393981, + "epoch": 0.2327104317480639, + "grad_norm": 8.272551536560059, + "learning_rate": 5.244301432820503e-06, + "loss": 0.5212, + "mean_token_accuracy": 0.8442383721470833, + "num_tokens": 90287635.0, + "step": 75070 + }, + { + "entropy": 1.9224041730165482, + "epoch": 0.2327414308731136, + "grad_norm": 8.821447372436523, + "learning_rate": 5.243952168973718e-06, + "loss": 0.6275, + "mean_token_accuracy": 0.8298459380865097, + "num_tokens": 90300036.0, + "step": 75080 + }, + { + "entropy": 1.9240923762321471, + "epoch": 0.2327724299981633, + "grad_norm": 3.3205227851867676, + "learning_rate": 5.243602974899231e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.8405191898345947, + "num_tokens": 90310850.0, + "step": 75090 + }, + { + "entropy": 1.7889164254069327, + "epoch": 0.232803429123213, + "grad_norm": 8.739767074584961, + "learning_rate": 5.243253850573816e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8499681517481804, + "num_tokens": 90323572.0, + "step": 75100 + }, + { + "entropy": 1.928827765583992, + "epoch": 0.2328344282482627, + "grad_norm": 6.958417892456055, + "learning_rate": 5.2429047959742555e-06, + "loss": 0.5652, + "mean_token_accuracy": 0.8277982458472252, + "num_tokens": 90334888.0, + "step": 75110 + }, + { + "entropy": 1.8841903924942016, + "epoch": 0.2328654273733124, + "grad_norm": 4.459338665008545, + "learning_rate": 5.242555811077344e-06, + "loss": 0.5206, + "mean_token_accuracy": 0.8400102391839027, + "num_tokens": 90346732.0, + "step": 75120 + }, + { + "entropy": 1.8876192346215248, + "epoch": 0.2328964264983621, + "grad_norm": 8.372060775756836, + "learning_rate": 5.242206895859884e-06, + "loss": 0.536, + "mean_token_accuracy": 0.8402991250157357, + "num_tokens": 90358781.0, + "step": 75130 + }, + { + "entropy": 1.9489302426576613, + "epoch": 0.2329274256234118, + "grad_norm": 7.9244384765625, + "learning_rate": 5.241858050298695e-06, + "loss": 0.5783, + "mean_token_accuracy": 0.8289978623390197, + "num_tokens": 90369723.0, + "step": 75140 + }, + { + "entropy": 1.9526596829295157, + "epoch": 0.23295842474846148, + "grad_norm": 9.39733600616455, + "learning_rate": 5.241509274370601e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.8326995015144348, + "num_tokens": 90381573.0, + "step": 75150 + }, + { + "entropy": 1.855046857893467, + "epoch": 0.23298942387351118, + "grad_norm": 8.04930305480957, + "learning_rate": 5.24116056805244e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8553052991628647, + "num_tokens": 90394574.0, + "step": 75160 + }, + { + "entropy": 1.9039323568344115, + "epoch": 0.23302042299856088, + "grad_norm": 8.174781799316406, + "learning_rate": 5.24081193132106e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8432982727885246, + "num_tokens": 90406242.0, + "step": 75170 + }, + { + "entropy": 1.8142014488577842, + "epoch": 0.23305142212361057, + "grad_norm": 10.09156608581543, + "learning_rate": 5.240463364153321e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8498436227440834, + "num_tokens": 90419691.0, + "step": 75180 + }, + { + "entropy": 1.8953629210591316, + "epoch": 0.23308242124866024, + "grad_norm": 7.493961334228516, + "learning_rate": 5.24011486652609e-06, + "loss": 0.506, + "mean_token_accuracy": 0.8336662292480469, + "num_tokens": 90432358.0, + "step": 75190 + }, + { + "entropy": 1.8964366748929025, + "epoch": 0.23311342037370994, + "grad_norm": 8.871479034423828, + "learning_rate": 5.2397664384162504e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8406847149133683, + "num_tokens": 90443925.0, + "step": 75200 + }, + { + "entropy": 1.8410288974642754, + "epoch": 0.23314441949875964, + "grad_norm": 9.23766803741455, + "learning_rate": 5.239418079800691e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8419799253344535, + "num_tokens": 90456444.0, + "step": 75210 + }, + { + "entropy": 1.9168146222829818, + "epoch": 0.23317541862380933, + "grad_norm": 8.451478004455566, + "learning_rate": 5.239069790656316e-06, + "loss": 0.5372, + "mean_token_accuracy": 0.8360653147101402, + "num_tokens": 90467399.0, + "step": 75220 + }, + { + "entropy": 1.7803557753562926, + "epoch": 0.23320641774885903, + "grad_norm": 4.116580963134766, + "learning_rate": 5.238721570960036e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.853205819427967, + "num_tokens": 90479796.0, + "step": 75230 + }, + { + "entropy": 1.8654976204037665, + "epoch": 0.23323741687390873, + "grad_norm": 9.181920051574707, + "learning_rate": 5.238373420688775e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8490437477827072, + "num_tokens": 90491819.0, + "step": 75240 + }, + { + "entropy": 1.9017037138342858, + "epoch": 0.23326841599895842, + "grad_norm": 7.338005542755127, + "learning_rate": 5.238025339819467e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.838523656129837, + "num_tokens": 90503606.0, + "step": 75250 + }, + { + "entropy": 1.8169363185763359, + "epoch": 0.23329941512400812, + "grad_norm": 11.015266418457031, + "learning_rate": 5.237677328329057e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8545296162366867, + "num_tokens": 90516076.0, + "step": 75260 + }, + { + "entropy": 1.8559274673461914, + "epoch": 0.23333041424905782, + "grad_norm": 7.690032482147217, + "learning_rate": 5.237329386194502e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8428187996149064, + "num_tokens": 90528110.0, + "step": 75270 + }, + { + "entropy": 1.8206583276391028, + "epoch": 0.2333614133741075, + "grad_norm": 4.115688323974609, + "learning_rate": 5.236981513392766e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8493341699242591, + "num_tokens": 90540476.0, + "step": 75280 + }, + { + "entropy": 1.8399672955274582, + "epoch": 0.2333924124991572, + "grad_norm": 8.14948558807373, + "learning_rate": 5.2366337099008265e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.848198390007019, + "num_tokens": 90552496.0, + "step": 75290 + }, + { + "entropy": 1.7906280860304833, + "epoch": 0.2334234116242069, + "grad_norm": 6.901556968688965, + "learning_rate": 5.236285975695673e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8607045665383339, + "num_tokens": 90565388.0, + "step": 75300 + }, + { + "entropy": 1.8817636847496033, + "epoch": 0.2334544107492566, + "grad_norm": 10.196159362792969, + "learning_rate": 5.235938310754303e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8407386288046836, + "num_tokens": 90577351.0, + "step": 75310 + }, + { + "entropy": 1.9196124032139779, + "epoch": 0.2334854098743063, + "grad_norm": 7.629261016845703, + "learning_rate": 5.2355907150537245e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8403740376234055, + "num_tokens": 90588716.0, + "step": 75320 + }, + { + "entropy": 1.9231160968542098, + "epoch": 0.233516408999356, + "grad_norm": 8.21264362335205, + "learning_rate": 5.2352431885709585e-06, + "loss": 0.5624, + "mean_token_accuracy": 0.8309063494205475, + "num_tokens": 90599515.0, + "step": 75330 + }, + { + "entropy": 1.8250703021883965, + "epoch": 0.2335474081244057, + "grad_norm": 8.739277839660645, + "learning_rate": 5.234895731283034e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8584131434559822, + "num_tokens": 90612420.0, + "step": 75340 + }, + { + "entropy": 1.832010643184185, + "epoch": 0.2335784072494554, + "grad_norm": 8.311301231384277, + "learning_rate": 5.234548343166994e-06, + "loss": 0.494, + "mean_token_accuracy": 0.837817907333374, + "num_tokens": 90624549.0, + "step": 75350 + }, + { + "entropy": 1.9003486275672912, + "epoch": 0.23360940637450509, + "grad_norm": 8.429434776306152, + "learning_rate": 5.234201024199889e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8493073508143425, + "num_tokens": 90635571.0, + "step": 75360 + }, + { + "entropy": 1.8496991708874702, + "epoch": 0.23364040549955478, + "grad_norm": 8.533575057983398, + "learning_rate": 5.233853774358781e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8495924681425094, + "num_tokens": 90647578.0, + "step": 75370 + }, + { + "entropy": 1.9252197608351707, + "epoch": 0.23367140462460448, + "grad_norm": 7.957611083984375, + "learning_rate": 5.233506593620745e-06, + "loss": 0.5422, + "mean_token_accuracy": 0.834588311612606, + "num_tokens": 90658684.0, + "step": 75380 + }, + { + "entropy": 1.914403536915779, + "epoch": 0.23370240374965418, + "grad_norm": 8.381974220275879, + "learning_rate": 5.233159481962864e-06, + "loss": 0.544, + "mean_token_accuracy": 0.836005237698555, + "num_tokens": 90670080.0, + "step": 75390 + }, + { + "entropy": 1.8845387056469918, + "epoch": 0.23373340287470387, + "grad_norm": 7.998114109039307, + "learning_rate": 5.232812439362231e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8451636284589767, + "num_tokens": 90682811.0, + "step": 75400 + }, + { + "entropy": 1.8718926057219505, + "epoch": 0.23376440199975357, + "grad_norm": 3.7296555042266846, + "learning_rate": 5.232465465795954e-06, + "loss": 0.569, + "mean_token_accuracy": 0.8414446488022804, + "num_tokens": 90694457.0, + "step": 75410 + }, + { + "entropy": 1.9601527959108354, + "epoch": 0.23379540112480326, + "grad_norm": 8.428439140319824, + "learning_rate": 5.232118561241144e-06, + "loss": 0.5494, + "mean_token_accuracy": 0.8311134546995163, + "num_tokens": 90705460.0, + "step": 75420 + }, + { + "entropy": 1.803976234793663, + "epoch": 0.23382640024985296, + "grad_norm": 6.161757946014404, + "learning_rate": 5.231771725674932e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8395594924688339, + "num_tokens": 90718488.0, + "step": 75430 + }, + { + "entropy": 1.8432627946138382, + "epoch": 0.23385739937490263, + "grad_norm": 8.392098426818848, + "learning_rate": 5.231424959074452e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8638657048344612, + "num_tokens": 90730557.0, + "step": 75440 + }, + { + "entropy": 1.9326068967580796, + "epoch": 0.23388839849995233, + "grad_norm": 9.61757755279541, + "learning_rate": 5.2310782614168524e-06, + "loss": 0.5731, + "mean_token_accuracy": 0.828363224864006, + "num_tokens": 90741224.0, + "step": 75450 + }, + { + "entropy": 1.9034574344754218, + "epoch": 0.23391939762500202, + "grad_norm": 9.683328628540039, + "learning_rate": 5.230731632679291e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8382599085569382, + "num_tokens": 90752495.0, + "step": 75460 + }, + { + "entropy": 1.889817087352276, + "epoch": 0.23395039675005172, + "grad_norm": 7.614642143249512, + "learning_rate": 5.230385072838936e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8519553110003472, + "num_tokens": 90764504.0, + "step": 75470 + }, + { + "entropy": 1.846573531627655, + "epoch": 0.23398139587510142, + "grad_norm": 8.647238731384277, + "learning_rate": 5.230038581872968e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.845521879196167, + "num_tokens": 90777223.0, + "step": 75480 + }, + { + "entropy": 1.887799009680748, + "epoch": 0.2340123950001511, + "grad_norm": 9.934295654296875, + "learning_rate": 5.2296921597585774e-06, + "loss": 0.513, + "mean_token_accuracy": 0.8402909606695175, + "num_tokens": 90788632.0, + "step": 75490 + }, + { + "entropy": 1.8547486677765845, + "epoch": 0.2340433941252008, + "grad_norm": 7.495747089385986, + "learning_rate": 5.229345806472961e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8464630424976349, + "num_tokens": 90800492.0, + "step": 75500 + }, + { + "entropy": 1.8723068907856941, + "epoch": 0.2340743932502505, + "grad_norm": 8.363115310668945, + "learning_rate": 5.228999521993333e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8466327175498009, + "num_tokens": 90812390.0, + "step": 75510 + }, + { + "entropy": 1.8291691452264787, + "epoch": 0.2341053923753002, + "grad_norm": 4.456087112426758, + "learning_rate": 5.228653306296913e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8438918322324753, + "num_tokens": 90825014.0, + "step": 75520 + }, + { + "entropy": 1.9418904185295105, + "epoch": 0.2341363915003499, + "grad_norm": 8.06656265258789, + "learning_rate": 5.228307159360937e-06, + "loss": 0.5371, + "mean_token_accuracy": 0.8240366965532303, + "num_tokens": 90836051.0, + "step": 75530 + }, + { + "entropy": 1.8666341871023178, + "epoch": 0.2341673906253996, + "grad_norm": 9.48065185546875, + "learning_rate": 5.2279610811626425e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8425935983657837, + "num_tokens": 90847835.0, + "step": 75540 + }, + { + "entropy": 1.8927869558334351, + "epoch": 0.2341983897504493, + "grad_norm": 9.044870376586914, + "learning_rate": 5.227615071679285e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.8357618257403374, + "num_tokens": 90859514.0, + "step": 75550 + }, + { + "entropy": 1.7879324212670327, + "epoch": 0.234229388875499, + "grad_norm": 4.232240200042725, + "learning_rate": 5.227269130888129e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8541668400168418, + "num_tokens": 90873762.0, + "step": 75560 + }, + { + "entropy": 1.86094261854887, + "epoch": 0.23426038800054869, + "grad_norm": 2.110261917114258, + "learning_rate": 5.226923258766447e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8436441987752914, + "num_tokens": 90886426.0, + "step": 75570 + }, + { + "entropy": 1.7484543666243553, + "epoch": 0.23429138712559838, + "grad_norm": 4.710477352142334, + "learning_rate": 5.226577455291525e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8541481211781502, + "num_tokens": 90900094.0, + "step": 75580 + }, + { + "entropy": 1.945685862004757, + "epoch": 0.23432238625064808, + "grad_norm": 8.194548606872559, + "learning_rate": 5.226231720440659e-06, + "loss": 0.5229, + "mean_token_accuracy": 0.8371816173195838, + "num_tokens": 90911642.0, + "step": 75590 + }, + { + "entropy": 1.8348077073693276, + "epoch": 0.23435338537569778, + "grad_norm": 9.100287437438965, + "learning_rate": 5.2258860541911514e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8539791435003281, + "num_tokens": 90924678.0, + "step": 75600 + }, + { + "entropy": 1.8494689092040062, + "epoch": 0.23438438450074747, + "grad_norm": 7.481054782867432, + "learning_rate": 5.2255404565203234e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8396744161844254, + "num_tokens": 90937460.0, + "step": 75610 + }, + { + "entropy": 1.9320793822407722, + "epoch": 0.23441538362579717, + "grad_norm": 9.441415786743164, + "learning_rate": 5.225194927405498e-06, + "loss": 0.5608, + "mean_token_accuracy": 0.8343402355909347, + "num_tokens": 90949563.0, + "step": 75620 + }, + { + "entropy": 1.8647829070687294, + "epoch": 0.23444638275084687, + "grad_norm": 8.169805526733398, + "learning_rate": 5.224849466824015e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.839415080845356, + "num_tokens": 90961677.0, + "step": 75630 + }, + { + "entropy": 1.8755572125315667, + "epoch": 0.23447738187589656, + "grad_norm": 7.456679344177246, + "learning_rate": 5.2245040747532205e-06, + "loss": 0.5616, + "mean_token_accuracy": 0.8273000553250313, + "num_tokens": 90974918.0, + "step": 75640 + }, + { + "entropy": 1.9732760652899741, + "epoch": 0.23450838100094626, + "grad_norm": 9.377166748046875, + "learning_rate": 5.224158751170473e-06, + "loss": 0.5314, + "mean_token_accuracy": 0.8365205094218254, + "num_tokens": 90986087.0, + "step": 75650 + }, + { + "entropy": 1.9110179245471954, + "epoch": 0.23453938012599596, + "grad_norm": 8.639516830444336, + "learning_rate": 5.223813496053142e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8477229610085487, + "num_tokens": 90997997.0, + "step": 75660 + }, + { + "entropy": 1.933686862885952, + "epoch": 0.23457037925104565, + "grad_norm": 8.902299880981445, + "learning_rate": 5.223468309378605e-06, + "loss": 0.5609, + "mean_token_accuracy": 0.8341797336935997, + "num_tokens": 91009272.0, + "step": 75670 + }, + { + "entropy": 1.968608994781971, + "epoch": 0.23460137837609532, + "grad_norm": 4.5855712890625, + "learning_rate": 5.2231231911242555e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8370487987995148, + "num_tokens": 91020661.0, + "step": 75680 + }, + { + "entropy": 1.8787442803382874, + "epoch": 0.23463237750114502, + "grad_norm": 7.962042331695557, + "learning_rate": 5.222778141267488e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8470729902386666, + "num_tokens": 91033230.0, + "step": 75690 + }, + { + "entropy": 1.8811139300465585, + "epoch": 0.23466337662619471, + "grad_norm": 7.824191093444824, + "learning_rate": 5.222433159785718e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8375716552138328, + "num_tokens": 91045252.0, + "step": 75700 + }, + { + "entropy": 1.9257870644330979, + "epoch": 0.2346943757512444, + "grad_norm": 7.708620548248291, + "learning_rate": 5.222088246656365e-06, + "loss": 0.5305, + "mean_token_accuracy": 0.8385186135768891, + "num_tokens": 91056958.0, + "step": 75710 + }, + { + "entropy": 1.9052812889218331, + "epoch": 0.2347253748762941, + "grad_norm": 7.920125484466553, + "learning_rate": 5.22174340185686e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8514411136507988, + "num_tokens": 91068010.0, + "step": 75720 + }, + { + "entropy": 1.8879928544163704, + "epoch": 0.2347563740013438, + "grad_norm": 8.99319839477539, + "learning_rate": 5.221398625364644e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8493816792964936, + "num_tokens": 91079694.0, + "step": 75730 + }, + { + "entropy": 1.860645368695259, + "epoch": 0.2347873731263935, + "grad_norm": 8.87778377532959, + "learning_rate": 5.2210539171571715e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8425444066524506, + "num_tokens": 91091618.0, + "step": 75740 + }, + { + "entropy": 1.8990991547703744, + "epoch": 0.2348183722514432, + "grad_norm": 4.461852550506592, + "learning_rate": 5.220709277211903e-06, + "loss": 0.519, + "mean_token_accuracy": 0.83267183303833, + "num_tokens": 91104008.0, + "step": 75750 + }, + { + "entropy": 1.9108994349837303, + "epoch": 0.2348493713764929, + "grad_norm": 7.736074924468994, + "learning_rate": 5.220364705506313e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8414252296090126, + "num_tokens": 91115759.0, + "step": 75760 + }, + { + "entropy": 1.9764820635318756, + "epoch": 0.2348803705015426, + "grad_norm": 9.64189624786377, + "learning_rate": 5.220020202017887e-06, + "loss": 0.5797, + "mean_token_accuracy": 0.8334431812167168, + "num_tokens": 91125911.0, + "step": 75770 + }, + { + "entropy": 1.8024020060896873, + "epoch": 0.2349113696265923, + "grad_norm": 6.673051357269287, + "learning_rate": 5.219675766724114e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8567843988537789, + "num_tokens": 91139395.0, + "step": 75780 + }, + { + "entropy": 1.9167088627815247, + "epoch": 0.23494236875164198, + "grad_norm": 6.79614782333374, + "learning_rate": 5.219331399602503e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8450982421636581, + "num_tokens": 91151124.0, + "step": 75790 + }, + { + "entropy": 1.8575886994600297, + "epoch": 0.23497336787669168, + "grad_norm": 9.397269248962402, + "learning_rate": 5.218987100630566e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8560428529977798, + "num_tokens": 91163168.0, + "step": 75800 + }, + { + "entropy": 1.9414154648780824, + "epoch": 0.23500436700174138, + "grad_norm": 6.633089065551758, + "learning_rate": 5.21864286978583e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8463586211204529, + "num_tokens": 91174212.0, + "step": 75810 + }, + { + "entropy": 1.7952500343322755, + "epoch": 0.23503536612679107, + "grad_norm": 3.056459426879883, + "learning_rate": 5.218298707045828e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8485618963837623, + "num_tokens": 91188049.0, + "step": 75820 + }, + { + "entropy": 1.9243362337350844, + "epoch": 0.23506636525184077, + "grad_norm": 8.155814170837402, + "learning_rate": 5.217954612388109e-06, + "loss": 0.5203, + "mean_token_accuracy": 0.8437091827392578, + "num_tokens": 91199119.0, + "step": 75830 + }, + { + "entropy": 1.9802058964967728, + "epoch": 0.23509736437689047, + "grad_norm": 8.803181648254395, + "learning_rate": 5.217610585790226e-06, + "loss": 0.5569, + "mean_token_accuracy": 0.8368562594056129, + "num_tokens": 91209740.0, + "step": 75840 + }, + { + "entropy": 1.8397732719779014, + "epoch": 0.23512836350194016, + "grad_norm": 7.574740886688232, + "learning_rate": 5.217266627229748e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8465480893850327, + "num_tokens": 91223580.0, + "step": 75850 + }, + { + "entropy": 1.9436632886528968, + "epoch": 0.23515936262698986, + "grad_norm": 7.771301746368408, + "learning_rate": 5.216922736684251e-06, + "loss": 0.5232, + "mean_token_accuracy": 0.8337218299508095, + "num_tokens": 91235380.0, + "step": 75860 + }, + { + "entropy": 1.8707226827740668, + "epoch": 0.23519036175203956, + "grad_norm": 7.848099231719971, + "learning_rate": 5.216578914131323e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8528139501810074, + "num_tokens": 91247966.0, + "step": 75870 + }, + { + "entropy": 2.011591988801956, + "epoch": 0.23522136087708925, + "grad_norm": 9.857985496520996, + "learning_rate": 5.216235159548561e-06, + "loss": 0.6008, + "mean_token_accuracy": 0.8229044884443283, + "num_tokens": 91259077.0, + "step": 75880 + }, + { + "entropy": 1.9336156010627747, + "epoch": 0.23525236000213895, + "grad_norm": 3.4189701080322266, + "learning_rate": 5.215891472913572e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8479844376444816, + "num_tokens": 91271077.0, + "step": 75890 + }, + { + "entropy": 1.9687816753983498, + "epoch": 0.23528335912718865, + "grad_norm": 10.6126070022583, + "learning_rate": 5.215547854203976e-06, + "loss": 0.5246, + "mean_token_accuracy": 0.839890718460083, + "num_tokens": 91282469.0, + "step": 75900 + }, + { + "entropy": 1.8437663704156875, + "epoch": 0.23531435825223834, + "grad_norm": 4.00623083114624, + "learning_rate": 5.2152043033974e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8508341655135154, + "num_tokens": 91295928.0, + "step": 75910 + }, + { + "entropy": 1.9553633004426956, + "epoch": 0.23534535737728804, + "grad_norm": 8.508515357971191, + "learning_rate": 5.214860820471484e-06, + "loss": 0.5215, + "mean_token_accuracy": 0.8452081516385078, + "num_tokens": 91307197.0, + "step": 75920 + }, + { + "entropy": 1.880798263847828, + "epoch": 0.2353763565023377, + "grad_norm": 6.8838791847229, + "learning_rate": 5.214517405403878e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8529353275895118, + "num_tokens": 91319587.0, + "step": 75930 + }, + { + "entropy": 1.8892224609851838, + "epoch": 0.2354073556273874, + "grad_norm": 8.371787071228027, + "learning_rate": 5.214174058172241e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8420174464583396, + "num_tokens": 91332095.0, + "step": 75940 + }, + { + "entropy": 1.9798435151576996, + "epoch": 0.2354383547524371, + "grad_norm": 10.347806930541992, + "learning_rate": 5.213830778754241e-06, + "loss": 0.6038, + "mean_token_accuracy": 0.8171585813164711, + "num_tokens": 91342922.0, + "step": 75950 + }, + { + "entropy": 1.9197808623313903, + "epoch": 0.2354693538774868, + "grad_norm": 8.591217041015625, + "learning_rate": 5.213487567127559e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8502022624015808, + "num_tokens": 91354825.0, + "step": 75960 + }, + { + "entropy": 1.9522427856922149, + "epoch": 0.2355003530025365, + "grad_norm": 9.180562973022461, + "learning_rate": 5.213144423269887e-06, + "loss": 0.538, + "mean_token_accuracy": 0.8390685006976127, + "num_tokens": 91365900.0, + "step": 75970 + }, + { + "entropy": 1.9248853296041488, + "epoch": 0.2355313521275862, + "grad_norm": 7.8382248878479, + "learning_rate": 5.212801347158925e-06, + "loss": 0.5151, + "mean_token_accuracy": 0.8395644649863243, + "num_tokens": 91377884.0, + "step": 75980 + }, + { + "entropy": 1.869673204421997, + "epoch": 0.2355623512526359, + "grad_norm": 10.19648551940918, + "learning_rate": 5.212458338772383e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8442730501294136, + "num_tokens": 91389637.0, + "step": 75990 + }, + { + "entropy": 1.8467582926154136, + "epoch": 0.23559335037768558, + "grad_norm": 4.066962718963623, + "learning_rate": 5.212115398087981e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8452418327331543, + "num_tokens": 91402099.0, + "step": 76000 + }, + { + "entropy": 1.8998699069023133, + "epoch": 0.23562434950273528, + "grad_norm": 7.766880035400391, + "learning_rate": 5.211772525083454e-06, + "loss": 0.5449, + "mean_token_accuracy": 0.8390342697501183, + "num_tokens": 91415037.0, + "step": 76010 + }, + { + "entropy": 1.8747850939631463, + "epoch": 0.23565534862778498, + "grad_norm": 9.178610801696777, + "learning_rate": 5.2114297197365406e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8318077206611634, + "num_tokens": 91427375.0, + "step": 76020 + }, + { + "entropy": 1.8805608436465264, + "epoch": 0.23568634775283467, + "grad_norm": 8.223982810974121, + "learning_rate": 5.211086982024995e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.8412626773118973, + "num_tokens": 91439742.0, + "step": 76030 + }, + { + "entropy": 1.8259792655706406, + "epoch": 0.23571734687788437, + "grad_norm": 4.048516273498535, + "learning_rate": 5.210744311926578e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8551635116338729, + "num_tokens": 91451962.0, + "step": 76040 + }, + { + "entropy": 1.9941584020853043, + "epoch": 0.23574834600293407, + "grad_norm": 8.59123420715332, + "learning_rate": 5.210401709419061e-06, + "loss": 0.6048, + "mean_token_accuracy": 0.8241194665431977, + "num_tokens": 91462894.0, + "step": 76050 + }, + { + "entropy": 1.9185423642396926, + "epoch": 0.23577934512798376, + "grad_norm": 11.106711387634277, + "learning_rate": 5.210059174480229e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8400084570050239, + "num_tokens": 91474440.0, + "step": 76060 + }, + { + "entropy": 1.9143582820892333, + "epoch": 0.23581034425303346, + "grad_norm": 7.75733757019043, + "learning_rate": 5.2097167070878754e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.8454731091856956, + "num_tokens": 91485553.0, + "step": 76070 + }, + { + "entropy": 1.9490876853466035, + "epoch": 0.23584134337808316, + "grad_norm": 11.029200553894043, + "learning_rate": 5.209374307219801e-06, + "loss": 0.5343, + "mean_token_accuracy": 0.8368668779730797, + "num_tokens": 91495910.0, + "step": 76080 + }, + { + "entropy": 1.9713003784418106, + "epoch": 0.23587234250313285, + "grad_norm": 8.084342956542969, + "learning_rate": 5.20903197485382e-06, + "loss": 0.5299, + "mean_token_accuracy": 0.8417750880122185, + "num_tokens": 91506865.0, + "step": 76090 + }, + { + "entropy": 1.8941795021295547, + "epoch": 0.23590334162818255, + "grad_norm": 7.648937702178955, + "learning_rate": 5.208689709967756e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8375332474708557, + "num_tokens": 91518800.0, + "step": 76100 + }, + { + "entropy": 1.8584771752357483, + "epoch": 0.23593434075323225, + "grad_norm": 8.80497932434082, + "learning_rate": 5.208347512539442e-06, + "loss": 0.5252, + "mean_token_accuracy": 0.8386746376752854, + "num_tokens": 91530792.0, + "step": 76110 + }, + { + "entropy": 1.8739138320088387, + "epoch": 0.23596533987828194, + "grad_norm": 8.877378463745117, + "learning_rate": 5.2080053825467235e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8397540152072906, + "num_tokens": 91542992.0, + "step": 76120 + }, + { + "entropy": 1.840997065603733, + "epoch": 0.23599633900333164, + "grad_norm": 7.287625789642334, + "learning_rate": 5.207663319967453e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8495836406946182, + "num_tokens": 91555954.0, + "step": 76130 + }, + { + "entropy": 1.826866079866886, + "epoch": 0.23602733812838134, + "grad_norm": 8.53991985321045, + "learning_rate": 5.207321324779495e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8605758026242256, + "num_tokens": 91568711.0, + "step": 76140 + }, + { + "entropy": 1.8429628267884255, + "epoch": 0.23605833725343103, + "grad_norm": 9.361772537231445, + "learning_rate": 5.2069793969607265e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8468737691640854, + "num_tokens": 91581173.0, + "step": 76150 + }, + { + "entropy": 1.8500198744237424, + "epoch": 0.23608933637848073, + "grad_norm": 8.414274215698242, + "learning_rate": 5.206637536489028e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8451015651226044, + "num_tokens": 91594341.0, + "step": 76160 + }, + { + "entropy": 1.93178521245718, + "epoch": 0.23612033550353043, + "grad_norm": 7.025452136993408, + "learning_rate": 5.206295743342297e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8408794820308685, + "num_tokens": 91605890.0, + "step": 76170 + }, + { + "entropy": 1.8896970108151436, + "epoch": 0.2361513346285801, + "grad_norm": 10.62279224395752, + "learning_rate": 5.205954017498437e-06, + "loss": 0.5293, + "mean_token_accuracy": 0.8361875951290131, + "num_tokens": 91617830.0, + "step": 76180 + }, + { + "entropy": 1.8840731859207154, + "epoch": 0.2361823337536298, + "grad_norm": 8.596649169921875, + "learning_rate": 5.205612358935365e-06, + "loss": 0.55, + "mean_token_accuracy": 0.8344793394207954, + "num_tokens": 91629222.0, + "step": 76190 + }, + { + "entropy": 1.8855000153183936, + "epoch": 0.2362133328786795, + "grad_norm": 7.801729202270508, + "learning_rate": 5.205270767631004e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.849211573600769, + "num_tokens": 91641322.0, + "step": 76200 + }, + { + "entropy": 1.9867444038391113, + "epoch": 0.23624433200372918, + "grad_norm": 10.91826057434082, + "learning_rate": 5.2049292435632915e-06, + "loss": 0.5572, + "mean_token_accuracy": 0.8317925870418549, + "num_tokens": 91651690.0, + "step": 76210 + }, + { + "entropy": 1.9123092323541642, + "epoch": 0.23627533112877888, + "grad_norm": 8.301390647888184, + "learning_rate": 5.2045877867101715e-06, + "loss": 0.527, + "mean_token_accuracy": 0.8369743451476097, + "num_tokens": 91662933.0, + "step": 76220 + }, + { + "entropy": 1.9738686069846154, + "epoch": 0.23630633025382858, + "grad_norm": 9.968904495239258, + "learning_rate": 5.2042463970496e-06, + "loss": 0.5797, + "mean_token_accuracy": 0.8200091898441315, + "num_tokens": 91674479.0, + "step": 76230 + }, + { + "entropy": 1.8900879070162773, + "epoch": 0.23633732937887827, + "grad_norm": 10.34799575805664, + "learning_rate": 5.203905074559543e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8390724554657936, + "num_tokens": 91686217.0, + "step": 76240 + }, + { + "entropy": 1.8866849929094314, + "epoch": 0.23636832850392797, + "grad_norm": 8.001737594604492, + "learning_rate": 5.203563819217977e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8539379611611366, + "num_tokens": 91697971.0, + "step": 76250 + }, + { + "entropy": 1.9403339833021165, + "epoch": 0.23639932762897767, + "grad_norm": 8.635519027709961, + "learning_rate": 5.203222631002886e-06, + "loss": 0.5552, + "mean_token_accuracy": 0.8294428750872612, + "num_tokens": 91709307.0, + "step": 76260 + }, + { + "entropy": 1.9339663892984391, + "epoch": 0.23643032675402736, + "grad_norm": 8.026914596557617, + "learning_rate": 5.202881509892268e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8449966043233872, + "num_tokens": 91720715.0, + "step": 76270 + }, + { + "entropy": 1.769917319715023, + "epoch": 0.23646132587907706, + "grad_norm": 3.137450933456421, + "learning_rate": 5.202540455864128e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8480799734592438, + "num_tokens": 91734185.0, + "step": 76280 + }, + { + "entropy": 1.9272101998329163, + "epoch": 0.23649232500412676, + "grad_norm": 9.050545692443848, + "learning_rate": 5.202199468896483e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8370890215039253, + "num_tokens": 91746051.0, + "step": 76290 + }, + { + "entropy": 1.890957424044609, + "epoch": 0.23652332412917645, + "grad_norm": 7.916098117828369, + "learning_rate": 5.201858548967359e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8495743572711945, + "num_tokens": 91757115.0, + "step": 76300 + }, + { + "entropy": 1.904859295487404, + "epoch": 0.23655432325422615, + "grad_norm": 7.8038105964660645, + "learning_rate": 5.201517696054792e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8543992474675178, + "num_tokens": 91768690.0, + "step": 76310 + }, + { + "entropy": 1.8198333993554114, + "epoch": 0.23658532237927585, + "grad_norm": 8.247322082519531, + "learning_rate": 5.2011769101368294e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8562105298042297, + "num_tokens": 91781845.0, + "step": 76320 + }, + { + "entropy": 1.879896378517151, + "epoch": 0.23661632150432554, + "grad_norm": 3.9357030391693115, + "learning_rate": 5.200836191191528e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.8404986575245857, + "num_tokens": 91794225.0, + "step": 76330 + }, + { + "entropy": 1.8717378214001656, + "epoch": 0.23664732062937524, + "grad_norm": 3.682194948196411, + "learning_rate": 5.200495539196953e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8454118952155113, + "num_tokens": 91807385.0, + "step": 76340 + }, + { + "entropy": 1.9737626880407333, + "epoch": 0.23667831975442494, + "grad_norm": 7.0051069259643555, + "learning_rate": 5.200154954131182e-06, + "loss": 0.5498, + "mean_token_accuracy": 0.8395712092518807, + "num_tokens": 91818225.0, + "step": 76350 + }, + { + "entropy": 1.9173112154006957, + "epoch": 0.23670931887947463, + "grad_norm": 7.16079568862915, + "learning_rate": 5.199814435972302e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8442193806171417, + "num_tokens": 91830394.0, + "step": 76360 + }, + { + "entropy": 1.8628339901566506, + "epoch": 0.23674031800452433, + "grad_norm": 4.580597400665283, + "learning_rate": 5.19947398469841e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8353284701704979, + "num_tokens": 91843266.0, + "step": 76370 + }, + { + "entropy": 1.9519380658864975, + "epoch": 0.23677131712957403, + "grad_norm": 7.013167381286621, + "learning_rate": 5.1991336002876116e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8405268996953964, + "num_tokens": 91853694.0, + "step": 76380 + }, + { + "entropy": 1.8750559836626053, + "epoch": 0.23680231625462372, + "grad_norm": 3.834982395172119, + "learning_rate": 5.198793282718023e-06, + "loss": 0.5224, + "mean_token_accuracy": 0.8327495470643044, + "num_tokens": 91866658.0, + "step": 76390 + }, + { + "entropy": 1.778690068423748, + "epoch": 0.23683331537967342, + "grad_norm": 8.452054977416992, + "learning_rate": 5.198453031967774e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8489452451467514, + "num_tokens": 91880171.0, + "step": 76400 + }, + { + "entropy": 1.8419625982642174, + "epoch": 0.23686431450472312, + "grad_norm": 5.224298000335693, + "learning_rate": 5.198112848015e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8543521463871002, + "num_tokens": 91892362.0, + "step": 76410 + }, + { + "entropy": 1.8903858274221421, + "epoch": 0.2368953136297728, + "grad_norm": 8.055941581726074, + "learning_rate": 5.197772730837848e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8432309225201606, + "num_tokens": 91904734.0, + "step": 76420 + }, + { + "entropy": 1.8139331057667731, + "epoch": 0.23692631275482248, + "grad_norm": 4.4151506423950195, + "learning_rate": 5.197432680414474e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.855991254746914, + "num_tokens": 91917912.0, + "step": 76430 + }, + { + "entropy": 1.7869535863399506, + "epoch": 0.23695731187987218, + "grad_norm": 8.555081367492676, + "learning_rate": 5.1970926967230455e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8637645095586777, + "num_tokens": 91931141.0, + "step": 76440 + }, + { + "entropy": 1.8419126272201538, + "epoch": 0.23698831100492188, + "grad_norm": 4.1938910484313965, + "learning_rate": 5.196752779741738e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8464212834835052, + "num_tokens": 91943834.0, + "step": 76450 + }, + { + "entropy": 1.8724951505661012, + "epoch": 0.23701931012997157, + "grad_norm": 9.071846008300781, + "learning_rate": 5.196412929448742e-06, + "loss": 0.5126, + "mean_token_accuracy": 0.8366881161928177, + "num_tokens": 91956524.0, + "step": 76460 + }, + { + "entropy": 1.9315653994679451, + "epoch": 0.23705030925502127, + "grad_norm": 4.576693534851074, + "learning_rate": 5.1960731458222526e-06, + "loss": 0.5247, + "mean_token_accuracy": 0.832519143819809, + "num_tokens": 91968089.0, + "step": 76470 + }, + { + "entropy": 1.8646988362073897, + "epoch": 0.23708130838007097, + "grad_norm": 9.52219009399414, + "learning_rate": 5.195733428840475e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8468518033623695, + "num_tokens": 91979897.0, + "step": 76480 + }, + { + "entropy": 1.8760708898305893, + "epoch": 0.23711230750512066, + "grad_norm": 8.841971397399902, + "learning_rate": 5.1953937784816275e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8391115218400955, + "num_tokens": 91992057.0, + "step": 76490 + }, + { + "entropy": 1.9153295263648034, + "epoch": 0.23714330663017036, + "grad_norm": 9.128143310546875, + "learning_rate": 5.195054194723937e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.840194196999073, + "num_tokens": 92004385.0, + "step": 76500 + }, + { + "entropy": 1.871627089381218, + "epoch": 0.23717430575522006, + "grad_norm": 5.556238651275635, + "learning_rate": 5.19471467754564e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8399588122963906, + "num_tokens": 92016826.0, + "step": 76510 + }, + { + "entropy": 1.9132946953177452, + "epoch": 0.23720530488026975, + "grad_norm": 10.617101669311523, + "learning_rate": 5.194375226924984e-06, + "loss": 0.549, + "mean_token_accuracy": 0.8375571563839912, + "num_tokens": 92028616.0, + "step": 76520 + }, + { + "entropy": 1.8806725934147834, + "epoch": 0.23723630400531945, + "grad_norm": 7.826884746551514, + "learning_rate": 5.194035842840225e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8441179230809212, + "num_tokens": 92040482.0, + "step": 76530 + }, + { + "entropy": 1.9325586080551147, + "epoch": 0.23726730313036914, + "grad_norm": 10.202268600463867, + "learning_rate": 5.193696525269629e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8491972535848618, + "num_tokens": 92052267.0, + "step": 76540 + }, + { + "entropy": 1.9318912595510482, + "epoch": 0.23729830225541884, + "grad_norm": 8.407309532165527, + "learning_rate": 5.1933572741914726e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8412642747163772, + "num_tokens": 92063760.0, + "step": 76550 + }, + { + "entropy": 1.9290201410651207, + "epoch": 0.23732930138046854, + "grad_norm": 8.155739784240723, + "learning_rate": 5.193018089584044e-06, + "loss": 0.517, + "mean_token_accuracy": 0.8428064584732056, + "num_tokens": 92075347.0, + "step": 76560 + }, + { + "entropy": 1.8454096555709838, + "epoch": 0.23736030050551823, + "grad_norm": 10.54609203338623, + "learning_rate": 5.192678971425639e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8462561905384064, + "num_tokens": 92087679.0, + "step": 76570 + }, + { + "entropy": 1.8874484971165657, + "epoch": 0.23739129963056793, + "grad_norm": 9.038010597229004, + "learning_rate": 5.192339919694561e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8541138485074043, + "num_tokens": 92099382.0, + "step": 76580 + }, + { + "entropy": 1.9192816317081451, + "epoch": 0.23742229875561763, + "grad_norm": 9.369587898254395, + "learning_rate": 5.192000934369129e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8446416437625885, + "num_tokens": 92111204.0, + "step": 76590 + }, + { + "entropy": 1.9248986691236496, + "epoch": 0.23745329788066732, + "grad_norm": 8.209547996520996, + "learning_rate": 5.19166201542767e-06, + "loss": 0.5492, + "mean_token_accuracy": 0.8277073204517365, + "num_tokens": 92122562.0, + "step": 76600 + }, + { + "entropy": 1.8582263216376305, + "epoch": 0.23748429700571702, + "grad_norm": 8.712994575500488, + "learning_rate": 5.191323162848518e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8368935108184814, + "num_tokens": 92135073.0, + "step": 76610 + }, + { + "entropy": 1.9173593461513518, + "epoch": 0.23751529613076672, + "grad_norm": 7.943591594696045, + "learning_rate": 5.190984376610021e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.8503361091017723, + "num_tokens": 92146436.0, + "step": 76620 + }, + { + "entropy": 1.8543553605675698, + "epoch": 0.23754629525581641, + "grad_norm": 7.117832183837891, + "learning_rate": 5.190645656690533e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.851584343612194, + "num_tokens": 92158280.0, + "step": 76630 + }, + { + "entropy": 1.868790790438652, + "epoch": 0.2375772943808661, + "grad_norm": 7.763202667236328, + "learning_rate": 5.19030700306842e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8473842933773994, + "num_tokens": 92170413.0, + "step": 76640 + }, + { + "entropy": 1.9384655416011811, + "epoch": 0.2376082935059158, + "grad_norm": 8.59603214263916, + "learning_rate": 5.189968415722057e-06, + "loss": 0.5514, + "mean_token_accuracy": 0.8368967100977898, + "num_tokens": 92181528.0, + "step": 76650 + }, + { + "entropy": 1.7697623759508132, + "epoch": 0.2376392926309655, + "grad_norm": 9.119267463684082, + "learning_rate": 5.189629894629832e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8625062674283981, + "num_tokens": 92195483.0, + "step": 76660 + }, + { + "entropy": 1.8749955371022224, + "epoch": 0.23767029175601517, + "grad_norm": 7.122738361358643, + "learning_rate": 5.189291439770136e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8516977295279503, + "num_tokens": 92208548.0, + "step": 76670 + }, + { + "entropy": 1.9658852204680444, + "epoch": 0.23770129088106487, + "grad_norm": 9.981183052062988, + "learning_rate": 5.18895305112138e-06, + "loss": 0.5554, + "mean_token_accuracy": 0.8255149856209755, + "num_tokens": 92219807.0, + "step": 76680 + }, + { + "entropy": 1.9098512694239616, + "epoch": 0.23773229000611457, + "grad_norm": 8.188959121704102, + "learning_rate": 5.188614728661975e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8477085337042809, + "num_tokens": 92231598.0, + "step": 76690 + }, + { + "entropy": 1.9508659318089485, + "epoch": 0.23776328913116426, + "grad_norm": 9.962569236755371, + "learning_rate": 5.188276472370346e-06, + "loss": 0.5163, + "mean_token_accuracy": 0.8311663627624511, + "num_tokens": 92243313.0, + "step": 76700 + }, + { + "entropy": 1.880191344022751, + "epoch": 0.23779428825621396, + "grad_norm": 8.909346580505371, + "learning_rate": 5.187938282224929e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8435371667146683, + "num_tokens": 92255339.0, + "step": 76710 + }, + { + "entropy": 1.8568480342626572, + "epoch": 0.23782528738126366, + "grad_norm": 3.7800729274749756, + "learning_rate": 5.187600158204169e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8489156365394592, + "num_tokens": 92268366.0, + "step": 76720 + }, + { + "entropy": 1.9763164937496185, + "epoch": 0.23785628650631335, + "grad_norm": 7.895925998687744, + "learning_rate": 5.187262100286519e-06, + "loss": 0.6004, + "mean_token_accuracy": 0.8200380340218544, + "num_tokens": 92278966.0, + "step": 76730 + }, + { + "entropy": 1.9572820693254471, + "epoch": 0.23788728563136305, + "grad_norm": 8.173888206481934, + "learning_rate": 5.186924108450444e-06, + "loss": 0.5823, + "mean_token_accuracy": 0.831984531879425, + "num_tokens": 92290141.0, + "step": 76740 + }, + { + "entropy": 1.9257027983665467, + "epoch": 0.23791828475641275, + "grad_norm": 9.356558799743652, + "learning_rate": 5.186586182674418e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8511083468794822, + "num_tokens": 92301596.0, + "step": 76750 + }, + { + "entropy": 1.9547690361738206, + "epoch": 0.23794928388146244, + "grad_norm": 9.034090995788574, + "learning_rate": 5.186248322936925e-06, + "loss": 0.5407, + "mean_token_accuracy": 0.8326706364750862, + "num_tokens": 92313300.0, + "step": 76760 + }, + { + "entropy": 1.933908286690712, + "epoch": 0.23798028300651214, + "grad_norm": 7.58587646484375, + "learning_rate": 5.1859105292164594e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8463876083493233, + "num_tokens": 92324745.0, + "step": 76770 + }, + { + "entropy": 1.9384202346205712, + "epoch": 0.23801128213156184, + "grad_norm": 7.760578632354736, + "learning_rate": 5.185572801491523e-06, + "loss": 0.5237, + "mean_token_accuracy": 0.8321873590350151, + "num_tokens": 92335893.0, + "step": 76780 + }, + { + "entropy": 1.7943998739123344, + "epoch": 0.23804228125661153, + "grad_norm": 3.816833734512329, + "learning_rate": 5.18523513974063e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8502035543322564, + "num_tokens": 92348555.0, + "step": 76790 + }, + { + "entropy": 1.9405929937958717, + "epoch": 0.23807328038166123, + "grad_norm": 8.975422859191895, + "learning_rate": 5.184897543942303e-06, + "loss": 0.5237, + "mean_token_accuracy": 0.8349242195487022, + "num_tokens": 92359797.0, + "step": 76800 + }, + { + "entropy": 1.9422178596258164, + "epoch": 0.23810427950671093, + "grad_norm": 11.456317901611328, + "learning_rate": 5.184560014075075e-06, + "loss": 0.5249, + "mean_token_accuracy": 0.8406445398926735, + "num_tokens": 92370595.0, + "step": 76810 + }, + { + "entropy": 1.7581544771790505, + "epoch": 0.23813527863176062, + "grad_norm": 4.420853614807129, + "learning_rate": 5.184222550117491e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8657904848456383, + "num_tokens": 92383924.0, + "step": 76820 + }, + { + "entropy": 1.8917575940489768, + "epoch": 0.23816627775681032, + "grad_norm": 7.945245742797852, + "learning_rate": 5.1838851520481e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8527548462152481, + "num_tokens": 92395855.0, + "step": 76830 + }, + { + "entropy": 1.9285854771733284, + "epoch": 0.23819727688186002, + "grad_norm": 8.082216262817383, + "learning_rate": 5.1835478198454654e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8462440460920334, + "num_tokens": 92406773.0, + "step": 76840 + }, + { + "entropy": 1.887254835665226, + "epoch": 0.2382282760069097, + "grad_norm": 8.663360595703125, + "learning_rate": 5.1832105534881614e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8472326830029487, + "num_tokens": 92418785.0, + "step": 76850 + }, + { + "entropy": 1.8737476989626884, + "epoch": 0.2382592751319594, + "grad_norm": 8.888916969299316, + "learning_rate": 5.182873352954766e-06, + "loss": 0.5369, + "mean_token_accuracy": 0.8380342468619346, + "num_tokens": 92431715.0, + "step": 76860 + }, + { + "entropy": 1.8650729537010193, + "epoch": 0.2382902742570091, + "grad_norm": 4.145284175872803, + "learning_rate": 5.182536218223874e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8427535608410835, + "num_tokens": 92443505.0, + "step": 76870 + }, + { + "entropy": 1.9061666548252105, + "epoch": 0.2383212733820588, + "grad_norm": 9.312609672546387, + "learning_rate": 5.182199149274083e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8470250204205513, + "num_tokens": 92454897.0, + "step": 76880 + }, + { + "entropy": 1.9185165598988534, + "epoch": 0.2383522725071085, + "grad_norm": 7.761290550231934, + "learning_rate": 5.181862146084008e-06, + "loss": 0.5389, + "mean_token_accuracy": 0.8365787997841835, + "num_tokens": 92467145.0, + "step": 76890 + }, + { + "entropy": 1.893602092564106, + "epoch": 0.2383832716321582, + "grad_norm": 9.334866523742676, + "learning_rate": 5.181525208632266e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.834423853456974, + "num_tokens": 92478937.0, + "step": 76900 + }, + { + "entropy": 1.9183743864297866, + "epoch": 0.2384142707572079, + "grad_norm": 8.78905200958252, + "learning_rate": 5.18118833689749e-06, + "loss": 0.5265, + "mean_token_accuracy": 0.8335022673010826, + "num_tokens": 92490133.0, + "step": 76910 + }, + { + "entropy": 1.9075113162398338, + "epoch": 0.23844526988225756, + "grad_norm": 8.945623397827148, + "learning_rate": 5.18085153085832e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8388106390833855, + "num_tokens": 92502046.0, + "step": 76920 + }, + { + "entropy": 1.904564779996872, + "epoch": 0.23847626900730726, + "grad_norm": 7.865208625793457, + "learning_rate": 5.180514790493405e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8397575378417969, + "num_tokens": 92513471.0, + "step": 76930 + }, + { + "entropy": 1.9215335533022881, + "epoch": 0.23850726813235695, + "grad_norm": 4.170154571533203, + "learning_rate": 5.180178115781404e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8430800303816796, + "num_tokens": 92525752.0, + "step": 76940 + }, + { + "entropy": 1.8328320786356926, + "epoch": 0.23853826725740665, + "grad_norm": 10.626273155212402, + "learning_rate": 5.179841506700989e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8432081758975982, + "num_tokens": 92539459.0, + "step": 76950 + }, + { + "entropy": 1.8328202441334724, + "epoch": 0.23856926638245635, + "grad_norm": 9.927977561950684, + "learning_rate": 5.179504963230835e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8494842126965523, + "num_tokens": 92551165.0, + "step": 76960 + }, + { + "entropy": 1.9268303319811821, + "epoch": 0.23860026550750604, + "grad_norm": 8.204046249389648, + "learning_rate": 5.179168485349633e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.8421676337718964, + "num_tokens": 92562511.0, + "step": 76970 + }, + { + "entropy": 1.799267826974392, + "epoch": 0.23863126463255574, + "grad_norm": 3.754833221435547, + "learning_rate": 5.178832073036083e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.860039333999157, + "num_tokens": 92575004.0, + "step": 76980 + }, + { + "entropy": 1.9153159961104393, + "epoch": 0.23866226375760544, + "grad_norm": 8.974563598632812, + "learning_rate": 5.178495726268889e-06, + "loss": 0.5255, + "mean_token_accuracy": 0.8295607075095177, + "num_tokens": 92586146.0, + "step": 76990 + }, + { + "entropy": 1.8078723505139351, + "epoch": 0.23869326288265513, + "grad_norm": 7.464177131652832, + "learning_rate": 5.178159445026772e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.863148321211338, + "num_tokens": 92599499.0, + "step": 77000 + }, + { + "entropy": 1.9081050038337708, + "epoch": 0.23872426200770483, + "grad_norm": 10.655149459838867, + "learning_rate": 5.17782322928846e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8429564580321312, + "num_tokens": 92610592.0, + "step": 77010 + }, + { + "entropy": 1.9061660870909691, + "epoch": 0.23875526113275453, + "grad_norm": 8.927163124084473, + "learning_rate": 5.177487079032687e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8398579880595207, + "num_tokens": 92622264.0, + "step": 77020 + }, + { + "entropy": 1.8448384143412113, + "epoch": 0.23878626025780422, + "grad_norm": 9.433097839355469, + "learning_rate": 5.177150994238202e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8562486276030541, + "num_tokens": 92634783.0, + "step": 77030 + }, + { + "entropy": 1.8873264119029045, + "epoch": 0.23881725938285392, + "grad_norm": 3.7170372009277344, + "learning_rate": 5.176814974883761e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8441817805171012, + "num_tokens": 92646458.0, + "step": 77040 + }, + { + "entropy": 1.9377914816141129, + "epoch": 0.23884825850790362, + "grad_norm": 7.460235118865967, + "learning_rate": 5.176479020948127e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8441906869411469, + "num_tokens": 92657504.0, + "step": 77050 + }, + { + "entropy": 1.9209259897470474, + "epoch": 0.2388792576329533, + "grad_norm": 8.706901550292969, + "learning_rate": 5.1761431324100805e-06, + "loss": 0.5253, + "mean_token_accuracy": 0.8378788083791733, + "num_tokens": 92668549.0, + "step": 77060 + }, + { + "entropy": 1.939936462044716, + "epoch": 0.238910256758003, + "grad_norm": 7.477108001708984, + "learning_rate": 5.175807309248405e-06, + "loss": 0.5586, + "mean_token_accuracy": 0.8331178665161133, + "num_tokens": 92679326.0, + "step": 77070 + }, + { + "entropy": 1.9476049482822417, + "epoch": 0.2389412558830527, + "grad_norm": 10.547106742858887, + "learning_rate": 5.175471551441896e-06, + "loss": 0.5478, + "mean_token_accuracy": 0.8249987348914146, + "num_tokens": 92690605.0, + "step": 77080 + }, + { + "entropy": 1.8310050159692763, + "epoch": 0.2389722550081024, + "grad_norm": 8.774078369140625, + "learning_rate": 5.175135858969356e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8467973753809929, + "num_tokens": 92702798.0, + "step": 77090 + }, + { + "entropy": 1.8465019643306733, + "epoch": 0.2390032541331521, + "grad_norm": 4.085997104644775, + "learning_rate": 5.174800231809601e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8533832848072052, + "num_tokens": 92714580.0, + "step": 77100 + }, + { + "entropy": 1.85148034542799, + "epoch": 0.2390342532582018, + "grad_norm": 9.144734382629395, + "learning_rate": 5.174464669941455e-06, + "loss": 0.5163, + "mean_token_accuracy": 0.8366668865084648, + "num_tokens": 92726728.0, + "step": 77110 + }, + { + "entropy": 1.8749561220407487, + "epoch": 0.2390652523832515, + "grad_norm": 4.692014217376709, + "learning_rate": 5.17412917334375e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8414742290973664, + "num_tokens": 92739555.0, + "step": 77120 + }, + { + "entropy": 1.850008523464203, + "epoch": 0.2390962515083012, + "grad_norm": 4.577549934387207, + "learning_rate": 5.17379374199533e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8429133415222168, + "num_tokens": 92751167.0, + "step": 77130 + }, + { + "entropy": 1.7252720057964326, + "epoch": 0.23912725063335089, + "grad_norm": 2.2258358001708984, + "learning_rate": 5.173458375875047e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8607361957430839, + "num_tokens": 92764800.0, + "step": 77140 + }, + { + "entropy": 1.7218406319618225, + "epoch": 0.23915824975840058, + "grad_norm": 4.97647762298584, + "learning_rate": 5.1731230749617645e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8532850712537765, + "num_tokens": 92779776.0, + "step": 77150 + }, + { + "entropy": 1.80644121915102, + "epoch": 0.23918924888345028, + "grad_norm": 9.306987762451172, + "learning_rate": 5.172787839234355e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8367184489965439, + "num_tokens": 92793137.0, + "step": 77160 + }, + { + "entropy": 1.7925639390945434, + "epoch": 0.23922024800849995, + "grad_norm": 10.766707420349121, + "learning_rate": 5.172452668671697e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8472514748573303, + "num_tokens": 92806007.0, + "step": 77170 + }, + { + "entropy": 1.9033477440476418, + "epoch": 0.23925124713354964, + "grad_norm": 7.035861492156982, + "learning_rate": 5.172117563252683e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8353909865021706, + "num_tokens": 92817579.0, + "step": 77180 + }, + { + "entropy": 1.7921052902936936, + "epoch": 0.23928224625859934, + "grad_norm": 7.975375175476074, + "learning_rate": 5.171782522956215e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8562724024057389, + "num_tokens": 92830855.0, + "step": 77190 + }, + { + "entropy": 1.8471863463521003, + "epoch": 0.23931324538364904, + "grad_norm": 7.514707088470459, + "learning_rate": 5.1714475477612005e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8399458363652229, + "num_tokens": 92843224.0, + "step": 77200 + }, + { + "entropy": 1.9269508570432663, + "epoch": 0.23934424450869873, + "grad_norm": 7.373228073120117, + "learning_rate": 5.17111263764656e-06, + "loss": 0.535, + "mean_token_accuracy": 0.8384528383612633, + "num_tokens": 92854284.0, + "step": 77210 + }, + { + "entropy": 1.9275295376777648, + "epoch": 0.23937524363374843, + "grad_norm": 7.924658298492432, + "learning_rate": 5.170777792591225e-06, + "loss": 0.521, + "mean_token_accuracy": 0.8433918222784996, + "num_tokens": 92865684.0, + "step": 77220 + }, + { + "entropy": 1.902247828245163, + "epoch": 0.23940624275879813, + "grad_norm": 12.593827247619629, + "learning_rate": 5.170443012574131e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8516241997480393, + "num_tokens": 92876734.0, + "step": 77230 + }, + { + "entropy": 1.8969348236918449, + "epoch": 0.23943724188384782, + "grad_norm": 8.934786796569824, + "learning_rate": 5.170108297574229e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8509429812431335, + "num_tokens": 92888454.0, + "step": 77240 + }, + { + "entropy": 1.9256169840693473, + "epoch": 0.23946824100889752, + "grad_norm": 8.596548080444336, + "learning_rate": 5.169773647570475e-06, + "loss": 0.5858, + "mean_token_accuracy": 0.8136735886335373, + "num_tokens": 92900087.0, + "step": 77250 + }, + { + "entropy": 1.9090922564268111, + "epoch": 0.23949924013394722, + "grad_norm": 8.0187406539917, + "learning_rate": 5.169439062541838e-06, + "loss": 0.5194, + "mean_token_accuracy": 0.8492131948471069, + "num_tokens": 92910900.0, + "step": 77260 + }, + { + "entropy": 1.9346697509288788, + "epoch": 0.2395302392589969, + "grad_norm": 8.380687713623047, + "learning_rate": 5.1691045424672945e-06, + "loss": 0.5272, + "mean_token_accuracy": 0.8416820123791695, + "num_tokens": 92922693.0, + "step": 77270 + }, + { + "entropy": 1.9072400107979774, + "epoch": 0.2395612383840466, + "grad_norm": 6.178520679473877, + "learning_rate": 5.16877008732583e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8457990005612374, + "num_tokens": 92934239.0, + "step": 77280 + }, + { + "entropy": 1.8505976639688015, + "epoch": 0.2395922375090963, + "grad_norm": 8.494392395019531, + "learning_rate": 5.16843569709644e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8447763130068779, + "num_tokens": 92948046.0, + "step": 77290 + }, + { + "entropy": 1.8789535805583, + "epoch": 0.239623236634146, + "grad_norm": 8.061935424804688, + "learning_rate": 5.168101371758133e-06, + "loss": 0.5276, + "mean_token_accuracy": 0.8349943101406098, + "num_tokens": 92959929.0, + "step": 77300 + }, + { + "entropy": 1.827258250117302, + "epoch": 0.2396542357591957, + "grad_norm": 3.748342275619507, + "learning_rate": 5.1677671112899204e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8416868060827255, + "num_tokens": 92972031.0, + "step": 77310 + }, + { + "entropy": 1.892835232615471, + "epoch": 0.2396852348842454, + "grad_norm": 8.331387519836426, + "learning_rate": 5.1674329156708305e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8418567150831222, + "num_tokens": 92983623.0, + "step": 77320 + }, + { + "entropy": 1.8563826560974122, + "epoch": 0.2397162340092951, + "grad_norm": 7.926171779632568, + "learning_rate": 5.1670987848798935e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8450048446655274, + "num_tokens": 92996094.0, + "step": 77330 + }, + { + "entropy": 1.93873221129179, + "epoch": 0.2397472331343448, + "grad_norm": 3.8182835578918457, + "learning_rate": 5.1667647188961544e-06, + "loss": 0.5358, + "mean_token_accuracy": 0.8399959117174148, + "num_tokens": 93007758.0, + "step": 77340 + }, + { + "entropy": 1.8997079834342003, + "epoch": 0.23977823225939449, + "grad_norm": 3.8245768547058105, + "learning_rate": 5.166430717698667e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8303521856665611, + "num_tokens": 93019975.0, + "step": 77350 + }, + { + "entropy": 1.9382802799344063, + "epoch": 0.23980923138444418, + "grad_norm": 9.995282173156738, + "learning_rate": 5.166096781266493e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.846211564540863, + "num_tokens": 93031399.0, + "step": 77360 + }, + { + "entropy": 1.872287529706955, + "epoch": 0.23984023050949388, + "grad_norm": 10.423619270324707, + "learning_rate": 5.1657629095787045e-06, + "loss": 0.5554, + "mean_token_accuracy": 0.8343263044953346, + "num_tokens": 93044307.0, + "step": 77370 + }, + { + "entropy": 1.9333751276135445, + "epoch": 0.23987122963454358, + "grad_norm": 9.734416007995605, + "learning_rate": 5.165429102614382e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.8426417291164399, + "num_tokens": 93055636.0, + "step": 77380 + }, + { + "entropy": 1.858459161221981, + "epoch": 0.23990222875959327, + "grad_norm": 8.448528289794922, + "learning_rate": 5.165095360352618e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8568536981940269, + "num_tokens": 93068414.0, + "step": 77390 + }, + { + "entropy": 1.884053786098957, + "epoch": 0.23993322788464297, + "grad_norm": 9.751893043518066, + "learning_rate": 5.164761682772511e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8394820794463158, + "num_tokens": 93080506.0, + "step": 77400 + }, + { + "entropy": 1.9235117584466934, + "epoch": 0.23996422700969264, + "grad_norm": 8.76278305053711, + "learning_rate": 5.164428069853172e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8517300844192505, + "num_tokens": 93091210.0, + "step": 77410 + }, + { + "entropy": 1.8631570398807527, + "epoch": 0.23999522613474233, + "grad_norm": 8.752660751342773, + "learning_rate": 5.16409452157372e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.845121631026268, + "num_tokens": 93103714.0, + "step": 77420 + }, + { + "entropy": 1.831365491449833, + "epoch": 0.24002622525979203, + "grad_norm": 3.933938503265381, + "learning_rate": 5.163761037913284e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8512499988079071, + "num_tokens": 93116096.0, + "step": 77430 + }, + { + "entropy": 1.8161715433001517, + "epoch": 0.24005722438484173, + "grad_norm": 7.886687755584717, + "learning_rate": 5.163427618851002e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8520355001091957, + "num_tokens": 93128310.0, + "step": 77440 + }, + { + "entropy": 1.9185721650719643, + "epoch": 0.24008822350989142, + "grad_norm": 8.354714393615723, + "learning_rate": 5.163094264366018e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8505765274167061, + "num_tokens": 93139885.0, + "step": 77450 + }, + { + "entropy": 1.881207676231861, + "epoch": 0.24011922263494112, + "grad_norm": 3.9551658630371094, + "learning_rate": 5.162760974437495e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8480890035629273, + "num_tokens": 93152362.0, + "step": 77460 + }, + { + "entropy": 1.8166447281837463, + "epoch": 0.24015022175999082, + "grad_norm": 9.925832748413086, + "learning_rate": 5.162427749044595e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8519127413630485, + "num_tokens": 93165382.0, + "step": 77470 + }, + { + "entropy": 1.9012980431318283, + "epoch": 0.24018122088504051, + "grad_norm": 6.780560493469238, + "learning_rate": 5.162094588166495e-06, + "loss": 0.5202, + "mean_token_accuracy": 0.8458901271224022, + "num_tokens": 93177192.0, + "step": 77480 + }, + { + "entropy": 1.8438314184546472, + "epoch": 0.2402122200100902, + "grad_norm": 9.430864334106445, + "learning_rate": 5.161761491782381e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8461054712533951, + "num_tokens": 93189929.0, + "step": 77490 + }, + { + "entropy": 1.8030365750193595, + "epoch": 0.2402432191351399, + "grad_norm": 8.649742126464844, + "learning_rate": 5.1614284598714455e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8661585509777069, + "num_tokens": 93203184.0, + "step": 77500 + }, + { + "entropy": 1.8316243276000024, + "epoch": 0.2402742182601896, + "grad_norm": 8.923284530639648, + "learning_rate": 5.1610954924128944e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8588346287608146, + "num_tokens": 93215255.0, + "step": 77510 + }, + { + "entropy": 1.9364321306347847, + "epoch": 0.2403052173852393, + "grad_norm": 8.266458511352539, + "learning_rate": 5.160762589385941e-06, + "loss": 0.5334, + "mean_token_accuracy": 0.8307982221245765, + "num_tokens": 93226205.0, + "step": 77520 + }, + { + "entropy": 1.868191882967949, + "epoch": 0.240336216510289, + "grad_norm": 9.114846229553223, + "learning_rate": 5.160429750769805e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8514704316854477, + "num_tokens": 93238768.0, + "step": 77530 + }, + { + "entropy": 1.9055574059486389, + "epoch": 0.2403672156353387, + "grad_norm": 4.970535755157471, + "learning_rate": 5.160096976543722e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8339187651872635, + "num_tokens": 93251174.0, + "step": 77540 + }, + { + "entropy": 1.8639071702957153, + "epoch": 0.2403982147603884, + "grad_norm": 9.217086791992188, + "learning_rate": 5.159764266686933e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8411014005541801, + "num_tokens": 93263487.0, + "step": 77550 + }, + { + "entropy": 1.9354406654834748, + "epoch": 0.2404292138854381, + "grad_norm": 7.856409072875977, + "learning_rate": 5.159431621178688e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.8418198600411415, + "num_tokens": 93275107.0, + "step": 77560 + }, + { + "entropy": 1.9459631145000458, + "epoch": 0.24046021301048778, + "grad_norm": 10.06895923614502, + "learning_rate": 5.159099039998247e-06, + "loss": 0.5186, + "mean_token_accuracy": 0.8353525027632713, + "num_tokens": 93285373.0, + "step": 77570 + }, + { + "entropy": 1.8992941722273826, + "epoch": 0.24049121213553748, + "grad_norm": 9.065903663635254, + "learning_rate": 5.158766523124879e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.8420209974050522, + "num_tokens": 93297887.0, + "step": 77580 + }, + { + "entropy": 1.8735466703772545, + "epoch": 0.24052221126058718, + "grad_norm": 8.686766624450684, + "learning_rate": 5.158434070537864e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8445511654019355, + "num_tokens": 93309645.0, + "step": 77590 + }, + { + "entropy": 1.9320282906293869, + "epoch": 0.24055321038563687, + "grad_norm": 4.7969770431518555, + "learning_rate": 5.158101682216491e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.8482805505394936, + "num_tokens": 93320996.0, + "step": 77600 + }, + { + "entropy": 1.8526810958981514, + "epoch": 0.24058420951068657, + "grad_norm": 3.4721367359161377, + "learning_rate": 5.157769358140056e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8560703948140145, + "num_tokens": 93332605.0, + "step": 77610 + }, + { + "entropy": 1.8123348727822304, + "epoch": 0.24061520863573627, + "grad_norm": 4.217021465301514, + "learning_rate": 5.157437098287867e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8512080743908882, + "num_tokens": 93344938.0, + "step": 77620 + }, + { + "entropy": 1.955248984694481, + "epoch": 0.24064620776078596, + "grad_norm": 9.06552505493164, + "learning_rate": 5.157104902639239e-06, + "loss": 0.5756, + "mean_token_accuracy": 0.8257305637001991, + "num_tokens": 93356260.0, + "step": 77630 + }, + { + "entropy": 1.8736144185066224, + "epoch": 0.24067720688583566, + "grad_norm": 8.679845809936523, + "learning_rate": 5.156772771173499e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.835148498415947, + "num_tokens": 93368497.0, + "step": 77640 + }, + { + "entropy": 1.785961812734604, + "epoch": 0.24070820601088536, + "grad_norm": 8.887287139892578, + "learning_rate": 5.15644070386998e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8569586247205734, + "num_tokens": 93381040.0, + "step": 77650 + }, + { + "entropy": 1.8760237216949462, + "epoch": 0.24073920513593502, + "grad_norm": 7.9106011390686035, + "learning_rate": 5.15610870070803e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8385165154933929, + "num_tokens": 93392906.0, + "step": 77660 + }, + { + "entropy": 1.9115961521863938, + "epoch": 0.24077020426098472, + "grad_norm": 7.96343469619751, + "learning_rate": 5.155776761666998e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8567873150110245, + "num_tokens": 93404492.0, + "step": 77670 + }, + { + "entropy": 1.8684505842626096, + "epoch": 0.24080120338603442, + "grad_norm": 8.68083667755127, + "learning_rate": 5.15544488672625e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8393196210265159, + "num_tokens": 93417211.0, + "step": 77680 + }, + { + "entropy": 1.7916448466479777, + "epoch": 0.24083220251108411, + "grad_norm": 7.983458518981934, + "learning_rate": 5.155113075865157e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8451001644134521, + "num_tokens": 93430639.0, + "step": 77690 + }, + { + "entropy": 1.8966735377907753, + "epoch": 0.2408632016361338, + "grad_norm": 8.755640029907227, + "learning_rate": 5.1547813290631e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8501760110259056, + "num_tokens": 93441446.0, + "step": 77700 + }, + { + "entropy": 1.8320280969142915, + "epoch": 0.2408942007611835, + "grad_norm": 7.649133205413818, + "learning_rate": 5.154449646299469e-06, + "loss": 0.443, + "mean_token_accuracy": 0.849604444205761, + "num_tokens": 93453520.0, + "step": 77710 + }, + { + "entropy": 1.826886311173439, + "epoch": 0.2409251998862332, + "grad_norm": 3.657710313796997, + "learning_rate": 5.154118027553669e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.849964652955532, + "num_tokens": 93465485.0, + "step": 77720 + }, + { + "entropy": 1.84391158670187, + "epoch": 0.2409561990112829, + "grad_norm": 8.545806884765625, + "learning_rate": 5.153786472805101e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8504501432180405, + "num_tokens": 93478499.0, + "step": 77730 + }, + { + "entropy": 1.8420162439346313, + "epoch": 0.2409871981363326, + "grad_norm": 4.115400791168213, + "learning_rate": 5.15345498203319e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8493165418505668, + "num_tokens": 93490375.0, + "step": 77740 + }, + { + "entropy": 1.897342699766159, + "epoch": 0.2410181972613823, + "grad_norm": 9.050869941711426, + "learning_rate": 5.153123555217362e-06, + "loss": 0.5218, + "mean_token_accuracy": 0.8383392289280891, + "num_tokens": 93501729.0, + "step": 77750 + }, + { + "entropy": 1.8093791991472243, + "epoch": 0.241049196386432, + "grad_norm": 8.670755386352539, + "learning_rate": 5.1527921923370536e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8506680727005005, + "num_tokens": 93514603.0, + "step": 77760 + }, + { + "entropy": 1.7984491378068923, + "epoch": 0.2410801955114817, + "grad_norm": 2.556086540222168, + "learning_rate": 5.15246089337171e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8423952326178551, + "num_tokens": 93527732.0, + "step": 77770 + }, + { + "entropy": 1.857538816332817, + "epoch": 0.24111119463653138, + "grad_norm": 5.878166675567627, + "learning_rate": 5.15212965830079e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8417029067873955, + "num_tokens": 93539767.0, + "step": 77780 + }, + { + "entropy": 1.9089462801814079, + "epoch": 0.24114219376158108, + "grad_norm": 8.4704008102417, + "learning_rate": 5.151798487103755e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.8439002588391304, + "num_tokens": 93550984.0, + "step": 77790 + }, + { + "entropy": 1.8456274837255477, + "epoch": 0.24117319288663078, + "grad_norm": 4.18363618850708, + "learning_rate": 5.151467379760081e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8480399310588836, + "num_tokens": 93563757.0, + "step": 77800 + }, + { + "entropy": 1.899016997218132, + "epoch": 0.24120419201168047, + "grad_norm": 7.410801887512207, + "learning_rate": 5.1511363362492515e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8440502628684043, + "num_tokens": 93575421.0, + "step": 77810 + }, + { + "entropy": 1.8831632077693938, + "epoch": 0.24123519113673017, + "grad_norm": 2.9362380504608154, + "learning_rate": 5.150805356550758e-06, + "loss": 0.475, + "mean_token_accuracy": 0.842276705801487, + "num_tokens": 93587824.0, + "step": 77820 + }, + { + "entropy": 1.9149335369467735, + "epoch": 0.24126619026177987, + "grad_norm": 9.272772789001465, + "learning_rate": 5.150474440644102e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.8346376657485962, + "num_tokens": 93599403.0, + "step": 77830 + }, + { + "entropy": 1.8213121712207794, + "epoch": 0.24129718938682956, + "grad_norm": 11.08968448638916, + "learning_rate": 5.150143588508796e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8508032530546188, + "num_tokens": 93612087.0, + "step": 77840 + }, + { + "entropy": 1.832365171611309, + "epoch": 0.24132818851187926, + "grad_norm": 8.193666458129883, + "learning_rate": 5.149812800124359e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8404935225844383, + "num_tokens": 93623785.0, + "step": 77850 + }, + { + "entropy": 1.917369608581066, + "epoch": 0.24135918763692896, + "grad_norm": 8.672357559204102, + "learning_rate": 5.149482075470319e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8380819782614708, + "num_tokens": 93635723.0, + "step": 77860 + }, + { + "entropy": 1.8775453761219978, + "epoch": 0.24139018676197865, + "grad_norm": 9.257630348205566, + "learning_rate": 5.1491514145262174e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8449780121445656, + "num_tokens": 93647450.0, + "step": 77870 + }, + { + "entropy": 1.8986103981733322, + "epoch": 0.24142118588702835, + "grad_norm": 8.680018424987793, + "learning_rate": 5.148820817271601e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8502163380384445, + "num_tokens": 93659062.0, + "step": 77880 + }, + { + "entropy": 1.7778849676251411, + "epoch": 0.24145218501207805, + "grad_norm": 4.6502604484558105, + "learning_rate": 5.148490283686026e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8509754404425621, + "num_tokens": 93673049.0, + "step": 77890 + }, + { + "entropy": 1.83993206769228, + "epoch": 0.24148318413712774, + "grad_norm": 3.465973138809204, + "learning_rate": 5.14815981374906e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8415236309170723, + "num_tokens": 93685767.0, + "step": 77900 + }, + { + "entropy": 1.863934588432312, + "epoch": 0.2415141832621774, + "grad_norm": 8.9150972366333, + "learning_rate": 5.1478294074402756e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.844880360364914, + "num_tokens": 93698133.0, + "step": 77910 + }, + { + "entropy": 1.8324093401432038, + "epoch": 0.2415451823872271, + "grad_norm": 4.7063188552856445, + "learning_rate": 5.14749906473926e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.851141095161438, + "num_tokens": 93710437.0, + "step": 77920 + }, + { + "entropy": 1.9022783473134042, + "epoch": 0.2415761815122768, + "grad_norm": 8.489745140075684, + "learning_rate": 5.147168785625606e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8330109208822251, + "num_tokens": 93722370.0, + "step": 77930 + }, + { + "entropy": 1.837750643491745, + "epoch": 0.2416071806373265, + "grad_norm": 8.317510604858398, + "learning_rate": 5.146838570078916e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8525749742984772, + "num_tokens": 93735012.0, + "step": 77940 + }, + { + "entropy": 1.8423487588763237, + "epoch": 0.2416381797623762, + "grad_norm": 3.3121819496154785, + "learning_rate": 5.146508418078802e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8401482835412025, + "num_tokens": 93747947.0, + "step": 77950 + }, + { + "entropy": 1.858578224480152, + "epoch": 0.2416691788874259, + "grad_norm": 4.246115207672119, + "learning_rate": 5.146178329604885e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.8309834390878678, + "num_tokens": 93760294.0, + "step": 77960 + }, + { + "entropy": 1.8050771802663803, + "epoch": 0.2417001780124756, + "grad_norm": 8.161648750305176, + "learning_rate": 5.145848304636797e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8522293284535408, + "num_tokens": 93773292.0, + "step": 77970 + }, + { + "entropy": 1.856981235742569, + "epoch": 0.2417311771375253, + "grad_norm": 9.97853946685791, + "learning_rate": 5.1455183431541755e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8453084260225296, + "num_tokens": 93785337.0, + "step": 77980 + }, + { + "entropy": 1.8165911972522735, + "epoch": 0.24176217626257498, + "grad_norm": 8.15949535369873, + "learning_rate": 5.145188445136669e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8522542849183082, + "num_tokens": 93798839.0, + "step": 77990 + }, + { + "entropy": 1.8631312981247903, + "epoch": 0.24179317538762468, + "grad_norm": 9.838682174682617, + "learning_rate": 5.144858610563938e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8407356634736061, + "num_tokens": 93811151.0, + "step": 78000 + }, + { + "entropy": 1.9338207572698594, + "epoch": 0.24182417451267438, + "grad_norm": 9.109156608581543, + "learning_rate": 5.144528839415645e-06, + "loss": 0.5236, + "mean_token_accuracy": 0.8407235726714134, + "num_tokens": 93822175.0, + "step": 78010 + }, + { + "entropy": 1.8040778622031213, + "epoch": 0.24185517363772407, + "grad_norm": 4.23243522644043, + "learning_rate": 5.1441991316714694e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8540575191378593, + "num_tokens": 93834880.0, + "step": 78020 + }, + { + "entropy": 1.893469262123108, + "epoch": 0.24188617276277377, + "grad_norm": 8.863125801086426, + "learning_rate": 5.143869487311095e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8541948691010475, + "num_tokens": 93846671.0, + "step": 78030 + }, + { + "entropy": 1.8411024257540702, + "epoch": 0.24191717188782347, + "grad_norm": 4.275692462921143, + "learning_rate": 5.143539906314216e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8460016861557961, + "num_tokens": 93859086.0, + "step": 78040 + }, + { + "entropy": 1.887072142958641, + "epoch": 0.24194817101287316, + "grad_norm": 3.7341136932373047, + "learning_rate": 5.143210388660536e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8523519471287727, + "num_tokens": 93870472.0, + "step": 78050 + }, + { + "entropy": 1.9175701081752776, + "epoch": 0.24197917013792286, + "grad_norm": 10.164228439331055, + "learning_rate": 5.142880934329766e-06, + "loss": 0.5293, + "mean_token_accuracy": 0.8337636604905129, + "num_tokens": 93881653.0, + "step": 78060 + }, + { + "entropy": 1.9110116347670556, + "epoch": 0.24201016926297256, + "grad_norm": 8.151057243347168, + "learning_rate": 5.142551543301631e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.843655027449131, + "num_tokens": 93893107.0, + "step": 78070 + }, + { + "entropy": 1.8371216788887978, + "epoch": 0.24204116838802225, + "grad_norm": 4.863457679748535, + "learning_rate": 5.142222215555856e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.845946654677391, + "num_tokens": 93905843.0, + "step": 78080 + }, + { + "entropy": 1.9443501621484756, + "epoch": 0.24207216751307195, + "grad_norm": 9.060688972473145, + "learning_rate": 5.141892951072186e-06, + "loss": 0.5785, + "mean_token_accuracy": 0.8199516415596009, + "num_tokens": 93916955.0, + "step": 78090 + }, + { + "entropy": 1.8930419281125068, + "epoch": 0.24210316663812165, + "grad_norm": 3.962271213531494, + "learning_rate": 5.141563749830367e-06, + "loss": 0.5161, + "mean_token_accuracy": 0.8397519171237946, + "num_tokens": 93928520.0, + "step": 78100 + }, + { + "entropy": 1.88976591527462, + "epoch": 0.24213416576317134, + "grad_norm": 8.086103439331055, + "learning_rate": 5.141234611810158e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8491416946053505, + "num_tokens": 93939987.0, + "step": 78110 + }, + { + "entropy": 1.9858234167098998, + "epoch": 0.24216516488822104, + "grad_norm": 7.99221658706665, + "learning_rate": 5.140905536991324e-06, + "loss": 0.5678, + "mean_token_accuracy": 0.8298144713044167, + "num_tokens": 93950877.0, + "step": 78120 + }, + { + "entropy": 1.8197284460067749, + "epoch": 0.24219616401327074, + "grad_norm": 8.68156909942627, + "learning_rate": 5.140576525353643e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8402933716773987, + "num_tokens": 93963270.0, + "step": 78130 + }, + { + "entropy": 1.8739581018686295, + "epoch": 0.24222716313832043, + "grad_norm": 8.117925643920898, + "learning_rate": 5.1402475768769e-06, + "loss": 0.5305, + "mean_token_accuracy": 0.8324276804924011, + "num_tokens": 93975569.0, + "step": 78140 + }, + { + "entropy": 1.9055853977799415, + "epoch": 0.2422581622633701, + "grad_norm": 8.360283851623535, + "learning_rate": 5.139918691540887e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.8316447660326958, + "num_tokens": 93987298.0, + "step": 78150 + }, + { + "entropy": 2.0304242044687273, + "epoch": 0.2422891613884198, + "grad_norm": 8.648979187011719, + "learning_rate": 5.13958986932541e-06, + "loss": 0.6085, + "mean_token_accuracy": 0.820050160586834, + "num_tokens": 93997995.0, + "step": 78160 + }, + { + "entropy": 1.9047956734895706, + "epoch": 0.2423201605134695, + "grad_norm": 3.7517290115356445, + "learning_rate": 5.139261110210278e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8566797718405723, + "num_tokens": 94009266.0, + "step": 78170 + }, + { + "entropy": 1.8741468638181686, + "epoch": 0.2423511596385192, + "grad_norm": 8.909819602966309, + "learning_rate": 5.138932414175315e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.8410193353891373, + "num_tokens": 94021292.0, + "step": 78180 + }, + { + "entropy": 1.9001810044050216, + "epoch": 0.2423821587635689, + "grad_norm": 8.745471000671387, + "learning_rate": 5.138603781200349e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8419952884316444, + "num_tokens": 94033054.0, + "step": 78190 + }, + { + "entropy": 1.9287573903799058, + "epoch": 0.24241315788861859, + "grad_norm": 3.352858543395996, + "learning_rate": 5.138275211265221e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8414346948266029, + "num_tokens": 94045999.0, + "step": 78200 + }, + { + "entropy": 1.878933884203434, + "epoch": 0.24244415701366828, + "grad_norm": 8.681049346923828, + "learning_rate": 5.137946704349778e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8540082618594169, + "num_tokens": 94057480.0, + "step": 78210 + }, + { + "entropy": 1.913780263066292, + "epoch": 0.24247515613871798, + "grad_norm": 9.51512336730957, + "learning_rate": 5.137618260433878e-06, + "loss": 0.5343, + "mean_token_accuracy": 0.8367677152156829, + "num_tokens": 94068651.0, + "step": 78220 + }, + { + "entropy": 1.8172692015767098, + "epoch": 0.24250615526376768, + "grad_norm": 10.392766952514648, + "learning_rate": 5.137289879497387e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8513308480381966, + "num_tokens": 94081805.0, + "step": 78230 + }, + { + "entropy": 1.9101085662841797, + "epoch": 0.24253715438881737, + "grad_norm": 4.027338027954102, + "learning_rate": 5.136961561520181e-06, + "loss": 0.5266, + "mean_token_accuracy": 0.8364538699388504, + "num_tokens": 94093434.0, + "step": 78240 + }, + { + "entropy": 1.9400910884141922, + "epoch": 0.24256815351386707, + "grad_norm": 8.5806303024292, + "learning_rate": 5.1366333064821426e-06, + "loss": 0.5406, + "mean_token_accuracy": 0.8367814645171165, + "num_tokens": 94104660.0, + "step": 78250 + }, + { + "entropy": 1.9483735159039497, + "epoch": 0.24259915263891677, + "grad_norm": 4.43411922454834, + "learning_rate": 5.136305114363167e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8446143805980683, + "num_tokens": 94116243.0, + "step": 78260 + }, + { + "entropy": 1.9460048735141755, + "epoch": 0.24263015176396646, + "grad_norm": 8.685762405395508, + "learning_rate": 5.1359769851431565e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.8413349702954293, + "num_tokens": 94127332.0, + "step": 78270 + }, + { + "entropy": 1.9539130926132202, + "epoch": 0.24266115088901616, + "grad_norm": 9.817856788635254, + "learning_rate": 5.13564891880202e-06, + "loss": 0.5339, + "mean_token_accuracy": 0.8239889681339264, + "num_tokens": 94138346.0, + "step": 78280 + }, + { + "entropy": 1.8358333230018615, + "epoch": 0.24269215001406585, + "grad_norm": 10.21696662902832, + "learning_rate": 5.135320915319681e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8529067754745483, + "num_tokens": 94151143.0, + "step": 78290 + }, + { + "entropy": 1.9505225792527199, + "epoch": 0.24272314913911555, + "grad_norm": 9.021811485290527, + "learning_rate": 5.134992974676065e-06, + "loss": 0.5422, + "mean_token_accuracy": 0.8432751014828682, + "num_tokens": 94162767.0, + "step": 78300 + }, + { + "entropy": 1.927219384908676, + "epoch": 0.24275414826416525, + "grad_norm": 4.087057590484619, + "learning_rate": 5.134665096851114e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8410610109567642, + "num_tokens": 94174630.0, + "step": 78310 + }, + { + "entropy": 1.8358907088637353, + "epoch": 0.24278514738921494, + "grad_norm": 9.222640991210938, + "learning_rate": 5.134337281824774e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8533089682459831, + "num_tokens": 94186789.0, + "step": 78320 + }, + { + "entropy": 1.8353676095604896, + "epoch": 0.24281614651426464, + "grad_norm": 9.232024192810059, + "learning_rate": 5.1340095295769985e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.859758959710598, + "num_tokens": 94198760.0, + "step": 78330 + }, + { + "entropy": 1.9933633387088776, + "epoch": 0.24284714563931434, + "grad_norm": 7.663358688354492, + "learning_rate": 5.1336818400877575e-06, + "loss": 0.6016, + "mean_token_accuracy": 0.8219748124480247, + "num_tokens": 94209973.0, + "step": 78340 + }, + { + "entropy": 1.9356787115335465, + "epoch": 0.24287814476436403, + "grad_norm": 7.0271100997924805, + "learning_rate": 5.1333542133370205e-06, + "loss": 0.5275, + "mean_token_accuracy": 0.8462739869952202, + "num_tokens": 94221268.0, + "step": 78350 + }, + { + "entropy": 1.9141018971800805, + "epoch": 0.24290914388941373, + "grad_norm": 8.173383712768555, + "learning_rate": 5.133026649304772e-06, + "loss": 0.5224, + "mean_token_accuracy": 0.8365029126405716, + "num_tokens": 94233710.0, + "step": 78360 + }, + { + "entropy": 1.8921899944543839, + "epoch": 0.24294014301446343, + "grad_norm": 6.497464656829834, + "learning_rate": 5.132699147971007e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8493005961179734, + "num_tokens": 94245927.0, + "step": 78370 + }, + { + "entropy": 1.8819943577051164, + "epoch": 0.24297114213951312, + "grad_norm": 8.804264068603516, + "learning_rate": 5.132371709315721e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8414627909660339, + "num_tokens": 94258498.0, + "step": 78380 + }, + { + "entropy": 1.9550851792097093, + "epoch": 0.24300214126456282, + "grad_norm": 8.61777400970459, + "learning_rate": 5.1320443333189265e-06, + "loss": 0.5731, + "mean_token_accuracy": 0.8282885104417801, + "num_tokens": 94270225.0, + "step": 78390 + }, + { + "entropy": 1.8870521187782288, + "epoch": 0.2430331403896125, + "grad_norm": 10.647053718566895, + "learning_rate": 5.131717019960643e-06, + "loss": 0.46, + "mean_token_accuracy": 0.847943240404129, + "num_tokens": 94282222.0, + "step": 78400 + }, + { + "entropy": 1.942089530825615, + "epoch": 0.2430641395146622, + "grad_norm": 9.514321327209473, + "learning_rate": 5.131389769220897e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.8450910165905953, + "num_tokens": 94293068.0, + "step": 78410 + }, + { + "entropy": 1.9375642448663712, + "epoch": 0.24309513863971188, + "grad_norm": 10.78248405456543, + "learning_rate": 5.131062581079726e-06, + "loss": 0.5297, + "mean_token_accuracy": 0.8404152438044548, + "num_tokens": 94304441.0, + "step": 78420 + }, + { + "entropy": 1.901486437022686, + "epoch": 0.24312613776476158, + "grad_norm": 10.459092140197754, + "learning_rate": 5.130735455517173e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8432300522923469, + "num_tokens": 94316598.0, + "step": 78430 + }, + { + "entropy": 1.9251017332077027, + "epoch": 0.24315713688981128, + "grad_norm": 10.798014640808105, + "learning_rate": 5.130408392513295e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8358462870121002, + "num_tokens": 94328615.0, + "step": 78440 + }, + { + "entropy": 1.7331562623381616, + "epoch": 0.24318813601486097, + "grad_norm": 6.879677772521973, + "learning_rate": 5.130081392048156e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8590050891041756, + "num_tokens": 94342439.0, + "step": 78450 + }, + { + "entropy": 1.876169492304325, + "epoch": 0.24321913513991067, + "grad_norm": 6.808117389678955, + "learning_rate": 5.129754454101825e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8520722165703773, + "num_tokens": 94354438.0, + "step": 78460 + }, + { + "entropy": 1.9563359498977662, + "epoch": 0.24325013426496037, + "grad_norm": 8.930147171020508, + "learning_rate": 5.129427578654386e-06, + "loss": 0.5295, + "mean_token_accuracy": 0.84179867208004, + "num_tokens": 94365036.0, + "step": 78470 + }, + { + "entropy": 1.9348897516727448, + "epoch": 0.24328113339001006, + "grad_norm": 8.5611572265625, + "learning_rate": 5.129100765685926e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.851142629981041, + "num_tokens": 94375811.0, + "step": 78480 + }, + { + "entropy": 1.9472255200147628, + "epoch": 0.24331213251505976, + "grad_norm": 10.067473411560059, + "learning_rate": 5.1287740151765464e-06, + "loss": 0.5399, + "mean_token_accuracy": 0.8335594087839127, + "num_tokens": 94387094.0, + "step": 78490 + }, + { + "entropy": 1.8310412108898162, + "epoch": 0.24334313164010946, + "grad_norm": 8.096028327941895, + "learning_rate": 5.128447327106353e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8536302044987678, + "num_tokens": 94400414.0, + "step": 78500 + }, + { + "entropy": 1.8954817593097686, + "epoch": 0.24337413076515915, + "grad_norm": 8.50610637664795, + "learning_rate": 5.128120701455464e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8491655126214027, + "num_tokens": 94412241.0, + "step": 78510 + }, + { + "entropy": 1.8588109910488129, + "epoch": 0.24340512989020885, + "grad_norm": 8.121857643127441, + "learning_rate": 5.127794138204003e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8583784848451614, + "num_tokens": 94424289.0, + "step": 78520 + }, + { + "entropy": 1.9707530647516251, + "epoch": 0.24343612901525855, + "grad_norm": 11.943368911743164, + "learning_rate": 5.127467637332106e-06, + "loss": 0.5347, + "mean_token_accuracy": 0.843015332520008, + "num_tokens": 94435163.0, + "step": 78530 + }, + { + "entropy": 1.8756818160414697, + "epoch": 0.24346712814030824, + "grad_norm": 4.680079936981201, + "learning_rate": 5.127141198819916e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8448122501373291, + "num_tokens": 94447563.0, + "step": 78540 + }, + { + "entropy": 1.9033118396997453, + "epoch": 0.24349812726535794, + "grad_norm": 7.533633708953857, + "learning_rate": 5.126814822647584e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8476984471082687, + "num_tokens": 94459089.0, + "step": 78550 + }, + { + "entropy": 1.9617197692394257, + "epoch": 0.24352912639040764, + "grad_norm": 9.499611854553223, + "learning_rate": 5.126488508795272e-06, + "loss": 0.5812, + "mean_token_accuracy": 0.8220301568508148, + "num_tokens": 94469681.0, + "step": 78560 + }, + { + "entropy": 1.8323184587061405, + "epoch": 0.24356012551545733, + "grad_norm": 9.460269927978516, + "learning_rate": 5.126162257243148e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8555850937962532, + "num_tokens": 94481974.0, + "step": 78570 + }, + { + "entropy": 1.8734732553362847, + "epoch": 0.24359112464050703, + "grad_norm": 8.973634719848633, + "learning_rate": 5.1258360679713916e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8513824939727783, + "num_tokens": 94493390.0, + "step": 78580 + }, + { + "entropy": 1.935685896873474, + "epoch": 0.24362212376555673, + "grad_norm": 7.930929183959961, + "learning_rate": 5.125509940960189e-06, + "loss": 0.5584, + "mean_token_accuracy": 0.8314585655927658, + "num_tokens": 94504378.0, + "step": 78590 + }, + { + "entropy": 1.881124185025692, + "epoch": 0.24365312289060642, + "grad_norm": 4.80453634262085, + "learning_rate": 5.12518387618974e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8595498457551003, + "num_tokens": 94515741.0, + "step": 78600 + }, + { + "entropy": 1.867940901219845, + "epoch": 0.24368412201565612, + "grad_norm": 8.655344009399414, + "learning_rate": 5.124857873640244e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8418323248624802, + "num_tokens": 94528325.0, + "step": 78610 + }, + { + "entropy": 1.9250017136335373, + "epoch": 0.24371512114070581, + "grad_norm": 9.36817741394043, + "learning_rate": 5.124531933291918e-06, + "loss": 0.5308, + "mean_token_accuracy": 0.8363332346081733, + "num_tokens": 94539198.0, + "step": 78620 + }, + { + "entropy": 1.782043893635273, + "epoch": 0.2437461202657555, + "grad_norm": 6.611782073974609, + "learning_rate": 5.124206055124986e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8502190038561821, + "num_tokens": 94553598.0, + "step": 78630 + }, + { + "entropy": 1.9345451429486276, + "epoch": 0.2437771193908052, + "grad_norm": 8.913650512695312, + "learning_rate": 5.123880239119677e-06, + "loss": 0.5216, + "mean_token_accuracy": 0.8417821496725082, + "num_tokens": 94564564.0, + "step": 78640 + }, + { + "entropy": 1.900984486937523, + "epoch": 0.24380811851585488, + "grad_norm": 7.233798503875732, + "learning_rate": 5.123554485256231e-06, + "loss": 0.5571, + "mean_token_accuracy": 0.835866068303585, + "num_tokens": 94577049.0, + "step": 78650 + }, + { + "entropy": 1.898327873647213, + "epoch": 0.24383911764090457, + "grad_norm": 9.916297912597656, + "learning_rate": 5.123228793514897e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.8348048180341721, + "num_tokens": 94588672.0, + "step": 78660 + }, + { + "entropy": 1.9103514924645424, + "epoch": 0.24387011676595427, + "grad_norm": 7.947087287902832, + "learning_rate": 5.122903163875935e-06, + "loss": 0.5346, + "mean_token_accuracy": 0.8312714904546737, + "num_tokens": 94600977.0, + "step": 78670 + }, + { + "entropy": 1.9822331309318542, + "epoch": 0.24390111589100397, + "grad_norm": 7.2357001304626465, + "learning_rate": 5.1225775963196104e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8357154637575149, + "num_tokens": 94612105.0, + "step": 78680 + }, + { + "entropy": 1.8677599892020225, + "epoch": 0.24393211501605366, + "grad_norm": 8.366761207580566, + "learning_rate": 5.1222520908261965e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8575931638479233, + "num_tokens": 94624016.0, + "step": 78690 + }, + { + "entropy": 1.9394556313753128, + "epoch": 0.24396311414110336, + "grad_norm": 10.68550968170166, + "learning_rate": 5.12192664737598e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.8385277986526489, + "num_tokens": 94635343.0, + "step": 78700 + }, + { + "entropy": 1.8500966012477875, + "epoch": 0.24399411326615306, + "grad_norm": 8.567591667175293, + "learning_rate": 5.121601265949253e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8590314507484436, + "num_tokens": 94647452.0, + "step": 78710 + }, + { + "entropy": 1.900219763815403, + "epoch": 0.24402511239120275, + "grad_norm": 9.341552734375, + "learning_rate": 5.121275946526316e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8450950443744659, + "num_tokens": 94659220.0, + "step": 78720 + }, + { + "entropy": 1.8428411841392518, + "epoch": 0.24405611151625245, + "grad_norm": 5.176535129547119, + "learning_rate": 5.120950689087481e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8477652072906494, + "num_tokens": 94672010.0, + "step": 78730 + }, + { + "entropy": 1.960240864753723, + "epoch": 0.24408711064130215, + "grad_norm": 7.885514736175537, + "learning_rate": 5.120625493613066e-06, + "loss": 0.539, + "mean_token_accuracy": 0.8431348979473114, + "num_tokens": 94682844.0, + "step": 78740 + }, + { + "entropy": 1.9407020330429077, + "epoch": 0.24411810976635184, + "grad_norm": 9.455016136169434, + "learning_rate": 5.1203003600834e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8410814523696899, + "num_tokens": 94694739.0, + "step": 78750 + }, + { + "entropy": 1.922006744146347, + "epoch": 0.24414910889140154, + "grad_norm": 4.940672874450684, + "learning_rate": 5.119975288478818e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8352184310555458, + "num_tokens": 94707168.0, + "step": 78760 + }, + { + "entropy": 1.8744517169892787, + "epoch": 0.24418010801645124, + "grad_norm": 8.631587982177734, + "learning_rate": 5.119650278779667e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.841625614464283, + "num_tokens": 94719921.0, + "step": 78770 + }, + { + "entropy": 1.8324996635317803, + "epoch": 0.24421110714150093, + "grad_norm": 7.340449333190918, + "learning_rate": 5.119325330966301e-06, + "loss": 0.5179, + "mean_token_accuracy": 0.8373197898268699, + "num_tokens": 94732979.0, + "step": 78780 + }, + { + "entropy": 1.820642825961113, + "epoch": 0.24424210626655063, + "grad_norm": 3.907349109649658, + "learning_rate": 5.119000445019081e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.853115190565586, + "num_tokens": 94745715.0, + "step": 78790 + }, + { + "entropy": 1.992446595430374, + "epoch": 0.24427310539160033, + "grad_norm": 8.855254173278809, + "learning_rate": 5.118675620918381e-06, + "loss": 0.5751, + "mean_token_accuracy": 0.8356500491499901, + "num_tokens": 94757429.0, + "step": 78800 + }, + { + "entropy": 1.915099148452282, + "epoch": 0.24430410451665002, + "grad_norm": 8.259538650512695, + "learning_rate": 5.11835085864458e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8401688992977142, + "num_tokens": 94769751.0, + "step": 78810 + }, + { + "entropy": 1.999873200058937, + "epoch": 0.24433510364169972, + "grad_norm": 8.896110534667969, + "learning_rate": 5.118026158178065e-06, + "loss": 0.5863, + "mean_token_accuracy": 0.8252399504184723, + "num_tokens": 94781117.0, + "step": 78820 + }, + { + "entropy": 1.9294245585799217, + "epoch": 0.24436610276674942, + "grad_norm": 8.745575904846191, + "learning_rate": 5.117701519499237e-06, + "loss": 0.5684, + "mean_token_accuracy": 0.8319064795970916, + "num_tokens": 94793063.0, + "step": 78830 + }, + { + "entropy": 1.9275419771671296, + "epoch": 0.2443971018917991, + "grad_norm": 9.089315414428711, + "learning_rate": 5.1173769425885015e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8480811953544617, + "num_tokens": 94804637.0, + "step": 78840 + }, + { + "entropy": 1.8522877663373947, + "epoch": 0.2444281010168488, + "grad_norm": 8.082670211791992, + "learning_rate": 5.117052427426272e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8565731942653656, + "num_tokens": 94817168.0, + "step": 78850 + }, + { + "entropy": 1.9246091678738595, + "epoch": 0.2444591001418985, + "grad_norm": 7.712028980255127, + "learning_rate": 5.116727973992975e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.8396865114569664, + "num_tokens": 94828976.0, + "step": 78860 + }, + { + "entropy": 1.9487199917435647, + "epoch": 0.2444900992669482, + "grad_norm": 8.486215591430664, + "learning_rate": 5.116403582269041e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8476333856582642, + "num_tokens": 94840153.0, + "step": 78870 + }, + { + "entropy": 1.8542052239179612, + "epoch": 0.2445210983919979, + "grad_norm": 9.091276168823242, + "learning_rate": 5.11607925223491e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8544638007879257, + "num_tokens": 94852408.0, + "step": 78880 + }, + { + "entropy": 1.938251782208681, + "epoch": 0.24455209751704757, + "grad_norm": 8.672202110290527, + "learning_rate": 5.115754983871035e-06, + "loss": 0.5264, + "mean_token_accuracy": 0.8358112215995789, + "num_tokens": 94864131.0, + "step": 78890 + }, + { + "entropy": 1.9440853491425514, + "epoch": 0.24458309664209726, + "grad_norm": 8.338716506958008, + "learning_rate": 5.115430777157873e-06, + "loss": 0.5453, + "mean_token_accuracy": 0.8359151259064674, + "num_tokens": 94875953.0, + "step": 78900 + }, + { + "entropy": 1.9307004496455193, + "epoch": 0.24461409576714696, + "grad_norm": 8.26486587524414, + "learning_rate": 5.11510663207589e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8382903128862381, + "num_tokens": 94887663.0, + "step": 78910 + }, + { + "entropy": 1.88083965331316, + "epoch": 0.24464509489219666, + "grad_norm": 9.269944190979004, + "learning_rate": 5.114782548605563e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8377712652087211, + "num_tokens": 94899958.0, + "step": 78920 + }, + { + "entropy": 1.872906593978405, + "epoch": 0.24467609401724635, + "grad_norm": 9.342957496643066, + "learning_rate": 5.1144585267273764e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8415094420313836, + "num_tokens": 94912389.0, + "step": 78930 + }, + { + "entropy": 2.0011187493801117, + "epoch": 0.24470709314229605, + "grad_norm": 9.931851387023926, + "learning_rate": 5.114134566421823e-06, + "loss": 0.5745, + "mean_token_accuracy": 0.8279946282505989, + "num_tokens": 94923039.0, + "step": 78940 + }, + { + "entropy": 1.8712818920612335, + "epoch": 0.24473809226734575, + "grad_norm": 8.743650436401367, + "learning_rate": 5.113810667669406e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8516668871045112, + "num_tokens": 94935553.0, + "step": 78950 + }, + { + "entropy": 1.8285594224929809, + "epoch": 0.24476909139239544, + "grad_norm": 6.643009662628174, + "learning_rate": 5.1134868304506335e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8515212833881378, + "num_tokens": 94949056.0, + "step": 78960 + }, + { + "entropy": 1.8055023223161697, + "epoch": 0.24480009051744514, + "grad_norm": 8.94931411743164, + "learning_rate": 5.1131630547460264e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8564708262681962, + "num_tokens": 94962250.0, + "step": 78970 + }, + { + "entropy": 1.8580831050872804, + "epoch": 0.24483108964249484, + "grad_norm": 8.115015983581543, + "learning_rate": 5.112839340536111e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8429727420210839, + "num_tokens": 94975092.0, + "step": 78980 + }, + { + "entropy": 1.8813864275813104, + "epoch": 0.24486208876754453, + "grad_norm": 3.8340635299682617, + "learning_rate": 5.112515687801425e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8568834289908409, + "num_tokens": 94986957.0, + "step": 78990 + }, + { + "entropy": 1.9341845840215683, + "epoch": 0.24489308789259423, + "grad_norm": 9.034078598022461, + "learning_rate": 5.112192096522513e-06, + "loss": 0.5372, + "mean_token_accuracy": 0.8297013834118843, + "num_tokens": 94998820.0, + "step": 79000 + }, + { + "entropy": 1.8774139389395714, + "epoch": 0.24492408701764393, + "grad_norm": 8.676530838012695, + "learning_rate": 5.1118685666799276e-06, + "loss": 0.5233, + "mean_token_accuracy": 0.8356031611561775, + "num_tokens": 95011150.0, + "step": 79010 + }, + { + "entropy": 1.7854979574680327, + "epoch": 0.24495508614269362, + "grad_norm": 4.426049709320068, + "learning_rate": 5.111545098254231e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8604443162679672, + "num_tokens": 95024853.0, + "step": 79020 + }, + { + "entropy": 1.9145899727940558, + "epoch": 0.24498608526774332, + "grad_norm": 3.6612796783447266, + "learning_rate": 5.111221691225996e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8452530711889267, + "num_tokens": 95036962.0, + "step": 79030 + }, + { + "entropy": 1.9246152505278586, + "epoch": 0.24501708439279302, + "grad_norm": 4.121425628662109, + "learning_rate": 5.1108983455758e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8439809292554855, + "num_tokens": 95048716.0, + "step": 79040 + }, + { + "entropy": 1.8734241530299187, + "epoch": 0.2450480835178427, + "grad_norm": 7.86496114730835, + "learning_rate": 5.110575061284232e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8426591798663139, + "num_tokens": 95061683.0, + "step": 79050 + }, + { + "entropy": 1.9878466486930848, + "epoch": 0.2450790826428924, + "grad_norm": 7.495734691619873, + "learning_rate": 5.110251838331888e-06, + "loss": 0.5437, + "mean_token_accuracy": 0.8370781674981117, + "num_tokens": 95072706.0, + "step": 79060 + }, + { + "entropy": 1.8844273030757903, + "epoch": 0.2451100817679421, + "grad_norm": 8.038873672485352, + "learning_rate": 5.109928676699374e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8393307909369468, + "num_tokens": 95084183.0, + "step": 79070 + }, + { + "entropy": 1.9518539249897002, + "epoch": 0.2451410808929918, + "grad_norm": 8.668111801147461, + "learning_rate": 5.109605576367302e-06, + "loss": 0.5666, + "mean_token_accuracy": 0.8321204602718353, + "num_tokens": 95095191.0, + "step": 79080 + }, + { + "entropy": 1.8990130126476288, + "epoch": 0.2451720800180415, + "grad_norm": 9.605693817138672, + "learning_rate": 5.1092825373162965e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8330251052975655, + "num_tokens": 95107290.0, + "step": 79090 + }, + { + "entropy": 1.853539851307869, + "epoch": 0.2452030791430912, + "grad_norm": 8.671045303344727, + "learning_rate": 5.108959559526987e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8565944582223892, + "num_tokens": 95119603.0, + "step": 79100 + }, + { + "entropy": 1.9775980800390243, + "epoch": 0.2452340782681409, + "grad_norm": 10.049057960510254, + "learning_rate": 5.108636642980014e-06, + "loss": 0.5379, + "mean_token_accuracy": 0.8332414567470551, + "num_tokens": 95130785.0, + "step": 79110 + }, + { + "entropy": 1.8694923490285873, + "epoch": 0.2452650773931906, + "grad_norm": 5.189975738525391, + "learning_rate": 5.108313787656024e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8414556428790092, + "num_tokens": 95142468.0, + "step": 79120 + }, + { + "entropy": 1.8693754494190216, + "epoch": 0.24529607651824029, + "grad_norm": 9.248258590698242, + "learning_rate": 5.107990993535676e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.848180778324604, + "num_tokens": 95154856.0, + "step": 79130 + }, + { + "entropy": 1.8843225702643394, + "epoch": 0.24532707564328995, + "grad_norm": 8.56281852722168, + "learning_rate": 5.107668260599633e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8529567748308182, + "num_tokens": 95165960.0, + "step": 79140 + }, + { + "entropy": 1.8180608585476876, + "epoch": 0.24535807476833965, + "grad_norm": 3.8715312480926514, + "learning_rate": 5.107345588828569e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.85515176653862, + "num_tokens": 95178500.0, + "step": 79150 + }, + { + "entropy": 1.9070587247610091, + "epoch": 0.24538907389338935, + "grad_norm": 7.640878677368164, + "learning_rate": 5.107022978203167e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8319504678249359, + "num_tokens": 95190327.0, + "step": 79160 + }, + { + "entropy": 1.8788634032011031, + "epoch": 0.24542007301843904, + "grad_norm": 7.821987628936768, + "learning_rate": 5.106700428704119e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.8348909318447113, + "num_tokens": 95202577.0, + "step": 79170 + }, + { + "entropy": 1.8948452711105346, + "epoch": 0.24545107214348874, + "grad_norm": 9.048386573791504, + "learning_rate": 5.1063779403121214e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.837831036746502, + "num_tokens": 95214436.0, + "step": 79180 + }, + { + "entropy": 1.889863994717598, + "epoch": 0.24548207126853844, + "grad_norm": 11.092171669006348, + "learning_rate": 5.106055513007883e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8403749257326126, + "num_tokens": 95225844.0, + "step": 79190 + }, + { + "entropy": 1.8009657189249992, + "epoch": 0.24551307039358813, + "grad_norm": 10.045639038085938, + "learning_rate": 5.105733146772122e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8504743695259094, + "num_tokens": 95239379.0, + "step": 79200 + }, + { + "entropy": 1.9157941043376923, + "epoch": 0.24554406951863783, + "grad_norm": 8.408306121826172, + "learning_rate": 5.105410841585562e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.837490102648735, + "num_tokens": 95251567.0, + "step": 79210 + }, + { + "entropy": 1.8614981457591058, + "epoch": 0.24557506864368753, + "grad_norm": 8.256009101867676, + "learning_rate": 5.1050885974289354e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8346313327550888, + "num_tokens": 95264155.0, + "step": 79220 + }, + { + "entropy": 1.9501688122749328, + "epoch": 0.24560606776873722, + "grad_norm": 8.228469848632812, + "learning_rate": 5.104766414282987e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.8341888338327408, + "num_tokens": 95275602.0, + "step": 79230 + }, + { + "entropy": 1.8669834539294243, + "epoch": 0.24563706689378692, + "grad_norm": 7.618318557739258, + "learning_rate": 5.1044442921284635e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8466882199048996, + "num_tokens": 95288087.0, + "step": 79240 + }, + { + "entropy": 2.0039695501327515, + "epoch": 0.24566806601883662, + "grad_norm": 10.323431968688965, + "learning_rate": 5.104122230946127e-06, + "loss": 0.5297, + "mean_token_accuracy": 0.8389030545949936, + "num_tokens": 95298644.0, + "step": 79250 + }, + { + "entropy": 1.9950080424547196, + "epoch": 0.2456990651438863, + "grad_norm": 8.642731666564941, + "learning_rate": 5.103800230716744e-06, + "loss": 0.5272, + "mean_token_accuracy": 0.8389644399285316, + "num_tokens": 95309147.0, + "step": 79260 + }, + { + "entropy": 1.9348495423793792, + "epoch": 0.245730064268936, + "grad_norm": 9.167795181274414, + "learning_rate": 5.10347829142109e-06, + "loss": 0.5279, + "mean_token_accuracy": 0.8349600344896316, + "num_tokens": 95320103.0, + "step": 79270 + }, + { + "entropy": 1.925625742971897, + "epoch": 0.2457610633939857, + "grad_norm": 9.226808547973633, + "learning_rate": 5.103156413039949e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8468294337391853, + "num_tokens": 95331516.0, + "step": 79280 + }, + { + "entropy": 1.890987327694893, + "epoch": 0.2457920625190354, + "grad_norm": 9.890280723571777, + "learning_rate": 5.102834595554116e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8447692885994911, + "num_tokens": 95343047.0, + "step": 79290 + }, + { + "entropy": 1.9169722646474838, + "epoch": 0.2458230616440851, + "grad_norm": 7.960250377655029, + "learning_rate": 5.102512838944389e-06, + "loss": 0.5274, + "mean_token_accuracy": 0.8344372197985649, + "num_tokens": 95354554.0, + "step": 79300 + }, + { + "entropy": 1.9122997641563415, + "epoch": 0.2458540607691348, + "grad_norm": 7.443688869476318, + "learning_rate": 5.102191143191582e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.844282315671444, + "num_tokens": 95367170.0, + "step": 79310 + }, + { + "entropy": 1.989633098244667, + "epoch": 0.2458850598941845, + "grad_norm": 8.14206600189209, + "learning_rate": 5.101869508276509e-06, + "loss": 0.5527, + "mean_token_accuracy": 0.8317234337329864, + "num_tokens": 95377788.0, + "step": 79320 + }, + { + "entropy": 1.8243707031011582, + "epoch": 0.2459160590192342, + "grad_norm": 8.314239501953125, + "learning_rate": 5.10154793418e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.863274447619915, + "num_tokens": 95391018.0, + "step": 79330 + }, + { + "entropy": 2.020896875858307, + "epoch": 0.2459470581442839, + "grad_norm": 9.07596492767334, + "learning_rate": 5.10122642088289e-06, + "loss": 0.5552, + "mean_token_accuracy": 0.8318601101636887, + "num_tokens": 95401518.0, + "step": 79340 + }, + { + "entropy": 1.9158252328634262, + "epoch": 0.24597805726933358, + "grad_norm": 7.7173614501953125, + "learning_rate": 5.100904968366021e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8370094135403633, + "num_tokens": 95413144.0, + "step": 79350 + }, + { + "entropy": 1.8825179994106294, + "epoch": 0.24600905639438328, + "grad_norm": 9.029237747192383, + "learning_rate": 5.100583576610246e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8415081441402436, + "num_tokens": 95426258.0, + "step": 79360 + }, + { + "entropy": 1.8851216346025468, + "epoch": 0.24604005551943298, + "grad_norm": 9.818262100219727, + "learning_rate": 5.100262245596426e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.8332912772893906, + "num_tokens": 95438222.0, + "step": 79370 + }, + { + "entropy": 1.8941182017326355, + "epoch": 0.24607105464448267, + "grad_norm": 8.11664867401123, + "learning_rate": 5.099940975305429e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8558889240026474, + "num_tokens": 95449373.0, + "step": 79380 + }, + { + "entropy": 1.870937429368496, + "epoch": 0.24610205376953234, + "grad_norm": 9.294564247131348, + "learning_rate": 5.099619765718133e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8433859512209892, + "num_tokens": 95461757.0, + "step": 79390 + }, + { + "entropy": 1.9412083461880685, + "epoch": 0.24613305289458204, + "grad_norm": 9.425738334655762, + "learning_rate": 5.099298616815426e-06, + "loss": 0.5611, + "mean_token_accuracy": 0.8349045276641845, + "num_tokens": 95473347.0, + "step": 79400 + }, + { + "entropy": 1.8201213255524635, + "epoch": 0.24616405201963173, + "grad_norm": 4.238409042358398, + "learning_rate": 5.098977528578199e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8556158289313316, + "num_tokens": 95486624.0, + "step": 79410 + }, + { + "entropy": 1.8622691087424754, + "epoch": 0.24619505114468143, + "grad_norm": 7.4858012199401855, + "learning_rate": 5.098656500987356e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8564441815018654, + "num_tokens": 95499802.0, + "step": 79420 + }, + { + "entropy": 1.823809403181076, + "epoch": 0.24622605026973113, + "grad_norm": 2.4532666206359863, + "learning_rate": 5.0983355340238096e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8558837413787842, + "num_tokens": 95512410.0, + "step": 79430 + }, + { + "entropy": 1.8783535480499267, + "epoch": 0.24625704939478082, + "grad_norm": 7.133194446563721, + "learning_rate": 5.0980146276684775e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8473603904247284, + "num_tokens": 95524775.0, + "step": 79440 + }, + { + "entropy": 1.8303729638457298, + "epoch": 0.24628804851983052, + "grad_norm": 8.516709327697754, + "learning_rate": 5.097693781902286e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8431586921215057, + "num_tokens": 95537638.0, + "step": 79450 + }, + { + "entropy": 1.9185088947415352, + "epoch": 0.24631904764488022, + "grad_norm": 10.453644752502441, + "learning_rate": 5.097372996706177e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.84544235765934, + "num_tokens": 95549282.0, + "step": 79460 + }, + { + "entropy": 1.8807260736823082, + "epoch": 0.24635004676992991, + "grad_norm": 11.140459060668945, + "learning_rate": 5.09705227206109e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8533786773681641, + "num_tokens": 95561550.0, + "step": 79470 + }, + { + "entropy": 1.9539588153362275, + "epoch": 0.2463810458949796, + "grad_norm": 9.068120002746582, + "learning_rate": 5.096731607947981e-06, + "loss": 0.5577, + "mean_token_accuracy": 0.8324769794940948, + "num_tokens": 95572478.0, + "step": 79480 + }, + { + "entropy": 1.803953130543232, + "epoch": 0.2464120450200293, + "grad_norm": 3.0993998050689697, + "learning_rate": 5.096411004347811e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8539158910512924, + "num_tokens": 95585431.0, + "step": 79490 + }, + { + "entropy": 1.9023709833621978, + "epoch": 0.246443044145079, + "grad_norm": 8.446845054626465, + "learning_rate": 5.096090461241549e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8470258235931396, + "num_tokens": 95596888.0, + "step": 79500 + }, + { + "entropy": 1.9057755261659621, + "epoch": 0.2464740432701287, + "grad_norm": 3.741781711578369, + "learning_rate": 5.095769978610174e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8380037263035774, + "num_tokens": 95608611.0, + "step": 79510 + }, + { + "entropy": 1.8339520961046218, + "epoch": 0.2465050423951784, + "grad_norm": 3.7660844326019287, + "learning_rate": 5.095449556434673e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8561963886022568, + "num_tokens": 95621421.0, + "step": 79520 + }, + { + "entropy": 1.8732254639267922, + "epoch": 0.2465360415202281, + "grad_norm": 8.025121688842773, + "learning_rate": 5.09512919469604e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8454299658536911, + "num_tokens": 95633799.0, + "step": 79530 + }, + { + "entropy": 1.9583280727267265, + "epoch": 0.2465670406452778, + "grad_norm": 8.794914245605469, + "learning_rate": 5.0948088933752795e-06, + "loss": 0.5379, + "mean_token_accuracy": 0.8344408705830574, + "num_tokens": 95645176.0, + "step": 79540 + }, + { + "entropy": 1.8870036020874976, + "epoch": 0.2465980397703275, + "grad_norm": 3.5362777709960938, + "learning_rate": 5.094488652453403e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8496022373437881, + "num_tokens": 95657347.0, + "step": 79550 + }, + { + "entropy": 1.9044120475649833, + "epoch": 0.24662903889537718, + "grad_norm": 9.073451042175293, + "learning_rate": 5.094168471911431e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8413415655493737, + "num_tokens": 95670276.0, + "step": 79560 + }, + { + "entropy": 1.8711192563176156, + "epoch": 0.24666003802042688, + "grad_norm": 7.03518533706665, + "learning_rate": 5.0938483517303914e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8590395718812942, + "num_tokens": 95682420.0, + "step": 79570 + }, + { + "entropy": 1.8627604782581328, + "epoch": 0.24669103714547658, + "grad_norm": 8.095098495483398, + "learning_rate": 5.093528291891321e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8616518348455429, + "num_tokens": 95695131.0, + "step": 79580 + }, + { + "entropy": 1.8479859337210656, + "epoch": 0.24672203627052627, + "grad_norm": 10.04355525970459, + "learning_rate": 5.093208292375264e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8598237797617913, + "num_tokens": 95707350.0, + "step": 79590 + }, + { + "entropy": 1.8836325496435165, + "epoch": 0.24675303539557597, + "grad_norm": 7.801244735717773, + "learning_rate": 5.092888353163278e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8486521244049072, + "num_tokens": 95719130.0, + "step": 79600 + }, + { + "entropy": 1.8783444881439209, + "epoch": 0.24678403452062567, + "grad_norm": 7.576953411102295, + "learning_rate": 5.092568474236419e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8356266215443611, + "num_tokens": 95730894.0, + "step": 79610 + }, + { + "entropy": 1.778034082055092, + "epoch": 0.24681503364567536, + "grad_norm": 8.774757385253906, + "learning_rate": 5.0922486555757615e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8508021235466003, + "num_tokens": 95743797.0, + "step": 79620 + }, + { + "entropy": 1.893149121105671, + "epoch": 0.24684603277072506, + "grad_norm": 10.234265327453613, + "learning_rate": 5.09192889716238e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8557818323373795, + "num_tokens": 95756512.0, + "step": 79630 + }, + { + "entropy": 1.9457867011427878, + "epoch": 0.24687703189577473, + "grad_norm": 8.266569137573242, + "learning_rate": 5.091609198977366e-06, + "loss": 0.5249, + "mean_token_accuracy": 0.832012552022934, + "num_tokens": 95767848.0, + "step": 79640 + }, + { + "entropy": 1.8916153132915496, + "epoch": 0.24690803102082443, + "grad_norm": 8.928706169128418, + "learning_rate": 5.091289561001813e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8554543048143387, + "num_tokens": 95779825.0, + "step": 79650 + }, + { + "entropy": 1.843326412141323, + "epoch": 0.24693903014587412, + "grad_norm": 7.779528617858887, + "learning_rate": 5.090969983216823e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8484487190842629, + "num_tokens": 95792518.0, + "step": 79660 + }, + { + "entropy": 1.8858656302094459, + "epoch": 0.24697002927092382, + "grad_norm": 4.888106822967529, + "learning_rate": 5.090650465603507e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8411580830812454, + "num_tokens": 95805316.0, + "step": 79670 + }, + { + "entropy": 1.9305724531412125, + "epoch": 0.24700102839597352, + "grad_norm": 8.88187026977539, + "learning_rate": 5.090331008142988e-06, + "loss": 0.5174, + "mean_token_accuracy": 0.8380944326519966, + "num_tokens": 95816570.0, + "step": 79680 + }, + { + "entropy": 1.944003912806511, + "epoch": 0.2470320275210232, + "grad_norm": 8.070425033569336, + "learning_rate": 5.090011610816392e-06, + "loss": 0.5157, + "mean_token_accuracy": 0.8433306530117989, + "num_tokens": 95827116.0, + "step": 79690 + }, + { + "entropy": 1.8563833236694336, + "epoch": 0.2470630266460729, + "grad_norm": 9.2925386428833, + "learning_rate": 5.089692273604857e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8484684824943542, + "num_tokens": 95839368.0, + "step": 79700 + }, + { + "entropy": 1.8929186090826988, + "epoch": 0.2470940257711226, + "grad_norm": 7.603442668914795, + "learning_rate": 5.089372996489528e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.844200924038887, + "num_tokens": 95850482.0, + "step": 79710 + }, + { + "entropy": 1.9358657032251358, + "epoch": 0.2471250248961723, + "grad_norm": 10.141935348510742, + "learning_rate": 5.089053779451555e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8503325775265693, + "num_tokens": 95861937.0, + "step": 79720 + }, + { + "entropy": 1.7907779216766357, + "epoch": 0.247156024021222, + "grad_norm": 4.321413040161133, + "learning_rate": 5.088734622472102e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.846727529168129, + "num_tokens": 95875553.0, + "step": 79730 + }, + { + "entropy": 1.889219006896019, + "epoch": 0.2471870231462717, + "grad_norm": 4.462600231170654, + "learning_rate": 5.0884155255323405e-06, + "loss": 0.519, + "mean_token_accuracy": 0.8342496454715729, + "num_tokens": 95888042.0, + "step": 79740 + }, + { + "entropy": 1.9225512325763703, + "epoch": 0.2472180222713214, + "grad_norm": 8.681279182434082, + "learning_rate": 5.088096488613445e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8418059065937996, + "num_tokens": 95899096.0, + "step": 79750 + }, + { + "entropy": 1.9431275591254233, + "epoch": 0.2472490213963711, + "grad_norm": 10.247178077697754, + "learning_rate": 5.087777511696603e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8435952246189118, + "num_tokens": 95910960.0, + "step": 79760 + }, + { + "entropy": 1.9674682512879371, + "epoch": 0.24728002052142078, + "grad_norm": 10.39339542388916, + "learning_rate": 5.087458594763011e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8337034076452255, + "num_tokens": 95922652.0, + "step": 79770 + }, + { + "entropy": 1.9188352286815644, + "epoch": 0.24731101964647048, + "grad_norm": 8.393038749694824, + "learning_rate": 5.087139737793868e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8489988818764687, + "num_tokens": 95933179.0, + "step": 79780 + }, + { + "entropy": 1.9350088462233543, + "epoch": 0.24734201877152018, + "grad_norm": 7.100698471069336, + "learning_rate": 5.086820940770387e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8473016381263733, + "num_tokens": 95944650.0, + "step": 79790 + }, + { + "entropy": 1.9587112590670586, + "epoch": 0.24737301789656987, + "grad_norm": 10.05678653717041, + "learning_rate": 5.0865022036737876e-06, + "loss": 0.5308, + "mean_token_accuracy": 0.8289535716176033, + "num_tokens": 95956214.0, + "step": 79800 + }, + { + "entropy": 1.854060137271881, + "epoch": 0.24740401702161957, + "grad_norm": 7.256962776184082, + "learning_rate": 5.086183526485297e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8500014215707778, + "num_tokens": 95968951.0, + "step": 79810 + }, + { + "entropy": 1.9160088315606116, + "epoch": 0.24743501614666927, + "grad_norm": 8.998300552368164, + "learning_rate": 5.08586490918615e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8392581641674042, + "num_tokens": 95980577.0, + "step": 79820 + }, + { + "entropy": 1.88340106010437, + "epoch": 0.24746601527171896, + "grad_norm": 5.0286030769348145, + "learning_rate": 5.08554635175759e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8310941204428672, + "num_tokens": 95993600.0, + "step": 79830 + }, + { + "entropy": 1.831264679133892, + "epoch": 0.24749701439676866, + "grad_norm": 9.444331169128418, + "learning_rate": 5.085227854180872e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.847031046450138, + "num_tokens": 96006803.0, + "step": 79840 + }, + { + "entropy": 1.895230557024479, + "epoch": 0.24752801352181836, + "grad_norm": 9.098989486694336, + "learning_rate": 5.0849094164372525e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8438019439578056, + "num_tokens": 96018578.0, + "step": 79850 + }, + { + "entropy": 1.875000685453415, + "epoch": 0.24755901264686805, + "grad_norm": 8.089691162109375, + "learning_rate": 5.084591038508003e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8493760719895362, + "num_tokens": 96030419.0, + "step": 79860 + }, + { + "entropy": 1.9285678580403327, + "epoch": 0.24759001177191775, + "grad_norm": 8.030923843383789, + "learning_rate": 5.0842727203744e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8497920066118241, + "num_tokens": 96042124.0, + "step": 79870 + }, + { + "entropy": 1.9058194294571877, + "epoch": 0.24762101089696742, + "grad_norm": 6.725025653839111, + "learning_rate": 5.083954462017727e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8582265987992287, + "num_tokens": 96053579.0, + "step": 79880 + }, + { + "entropy": 1.9877204060554505, + "epoch": 0.24765201002201712, + "grad_norm": 7.7386555671691895, + "learning_rate": 5.083636263419278e-06, + "loss": 0.5676, + "mean_token_accuracy": 0.8407276496291161, + "num_tokens": 96064626.0, + "step": 79890 + }, + { + "entropy": 1.9019395023584367, + "epoch": 0.2476830091470668, + "grad_norm": 8.415245056152344, + "learning_rate": 5.083318124560355e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8463913604617119, + "num_tokens": 96075407.0, + "step": 79900 + }, + { + "entropy": 1.9611292690038682, + "epoch": 0.2477140082721165, + "grad_norm": 7.75224494934082, + "learning_rate": 5.083000045422266e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.8489187434315681, + "num_tokens": 96086058.0, + "step": 79910 + }, + { + "entropy": 1.8318257644772529, + "epoch": 0.2477450073971662, + "grad_norm": 4.178698539733887, + "learning_rate": 5.082682025986331e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8574068948626519, + "num_tokens": 96098208.0, + "step": 79920 + }, + { + "entropy": 1.8961550071835518, + "epoch": 0.2477760065222159, + "grad_norm": 9.65478515625, + "learning_rate": 5.082364066233872e-06, + "loss": 0.5372, + "mean_token_accuracy": 0.8370327904820443, + "num_tokens": 96109735.0, + "step": 79930 + }, + { + "entropy": 1.8611363500356675, + "epoch": 0.2478070056472656, + "grad_norm": 4.771914482116699, + "learning_rate": 5.082046166146227e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8574331164360046, + "num_tokens": 96121606.0, + "step": 79940 + }, + { + "entropy": 1.8779680162668229, + "epoch": 0.2478380047723153, + "grad_norm": 10.091143608093262, + "learning_rate": 5.0817283257047375e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8469004735350609, + "num_tokens": 96133832.0, + "step": 79950 + }, + { + "entropy": 1.8258475840091706, + "epoch": 0.247869003897365, + "grad_norm": 8.701608657836914, + "learning_rate": 5.081410544890754e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.84885775744915, + "num_tokens": 96146484.0, + "step": 79960 + }, + { + "entropy": 1.856001165509224, + "epoch": 0.2479000030224147, + "grad_norm": 7.74146842956543, + "learning_rate": 5.081092823685633e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8504522681236267, + "num_tokens": 96159276.0, + "step": 79970 + }, + { + "entropy": 1.8811848238110542, + "epoch": 0.24793100214746439, + "grad_norm": 4.684483051300049, + "learning_rate": 5.0807751620707425e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.841371999680996, + "num_tokens": 96171602.0, + "step": 79980 + }, + { + "entropy": 1.9496762931346894, + "epoch": 0.24796200127251408, + "grad_norm": 9.64576530456543, + "learning_rate": 5.0804575600274575e-06, + "loss": 0.5328, + "mean_token_accuracy": 0.8369770348072052, + "num_tokens": 96182516.0, + "step": 79990 + }, + { + "entropy": 1.8256424218416214, + "epoch": 0.24799300039756378, + "grad_norm": 7.734647750854492, + "learning_rate": 5.080140017537162e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8576287388801574, + "num_tokens": 96195002.0, + "step": 80000 + }, + { + "entropy": 1.8538661286234857, + "epoch": 0.24802399952261348, + "grad_norm": 4.36631441116333, + "learning_rate": 5.079822534581246e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8439467817544937, + "num_tokens": 96207489.0, + "step": 80010 + }, + { + "entropy": 1.9011688023805617, + "epoch": 0.24805499864766317, + "grad_norm": 10.34740924835205, + "learning_rate": 5.07950511114111e-06, + "loss": 0.5364, + "mean_token_accuracy": 0.8319029092788697, + "num_tokens": 96218430.0, + "step": 80020 + }, + { + "entropy": 1.8638526298105718, + "epoch": 0.24808599777271287, + "grad_norm": 3.3797831535339355, + "learning_rate": 5.07918774719816e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8378943756222725, + "num_tokens": 96231778.0, + "step": 80030 + }, + { + "entropy": 1.861025558412075, + "epoch": 0.24811699689776257, + "grad_norm": 8.062844276428223, + "learning_rate": 5.078870442733811e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8469403609633446, + "num_tokens": 96243932.0, + "step": 80040 + }, + { + "entropy": 1.835094903409481, + "epoch": 0.24814799602281226, + "grad_norm": 4.131919860839844, + "learning_rate": 5.07855319772949e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8548068985342979, + "num_tokens": 96256765.0, + "step": 80050 + }, + { + "entropy": 1.9781760573387146, + "epoch": 0.24817899514786196, + "grad_norm": 8.877656936645508, + "learning_rate": 5.078236012166626e-06, + "loss": 0.546, + "mean_token_accuracy": 0.8398910224437713, + "num_tokens": 96267560.0, + "step": 80060 + }, + { + "entropy": 1.8965528056025505, + "epoch": 0.24820999427291165, + "grad_norm": 9.912452697753906, + "learning_rate": 5.077918886026659e-06, + "loss": 0.521, + "mean_token_accuracy": 0.8338564082980156, + "num_tokens": 96279157.0, + "step": 80070 + }, + { + "entropy": 1.8625015437602996, + "epoch": 0.24824099339796135, + "grad_norm": 7.917440891265869, + "learning_rate": 5.077601819291041e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8532622784376145, + "num_tokens": 96291601.0, + "step": 80080 + }, + { + "entropy": 1.8780291944742202, + "epoch": 0.24827199252301105, + "grad_norm": 4.1151123046875, + "learning_rate": 5.077284811941222e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.837240794301033, + "num_tokens": 96303685.0, + "step": 80090 + }, + { + "entropy": 1.9204677224159241, + "epoch": 0.24830299164806074, + "grad_norm": 8.22902774810791, + "learning_rate": 5.076967863958671e-06, + "loss": 0.5369, + "mean_token_accuracy": 0.8477205768227577, + "num_tokens": 96314313.0, + "step": 80100 + }, + { + "entropy": 1.8754928812384606, + "epoch": 0.24833399077311044, + "grad_norm": 3.904203176498413, + "learning_rate": 5.076650975324857e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.852228082716465, + "num_tokens": 96326655.0, + "step": 80110 + }, + { + "entropy": 1.8840164422988892, + "epoch": 0.24836498989816014, + "grad_norm": 7.1997504234313965, + "learning_rate": 5.076334146021265e-06, + "loss": 0.459, + "mean_token_accuracy": 0.850933963060379, + "num_tokens": 96338425.0, + "step": 80120 + }, + { + "entropy": 1.923794236779213, + "epoch": 0.2483959890232098, + "grad_norm": 9.449333190917969, + "learning_rate": 5.076017376029378e-06, + "loss": 0.5339, + "mean_token_accuracy": 0.8383274003863335, + "num_tokens": 96349766.0, + "step": 80130 + }, + { + "entropy": 1.9393807545304298, + "epoch": 0.2484269881482595, + "grad_norm": 7.0489397048950195, + "learning_rate": 5.0757006653306975e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8506465151906013, + "num_tokens": 96360918.0, + "step": 80140 + }, + { + "entropy": 1.9307123482227326, + "epoch": 0.2484579872733092, + "grad_norm": 8.270514488220215, + "learning_rate": 5.075384013906726e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.8430813983082771, + "num_tokens": 96371709.0, + "step": 80150 + }, + { + "entropy": 1.8894588977098465, + "epoch": 0.2484889863983589, + "grad_norm": 8.323099136352539, + "learning_rate": 5.075067421738976e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8491836950182915, + "num_tokens": 96384161.0, + "step": 80160 + }, + { + "entropy": 1.9335136204957961, + "epoch": 0.2485199855234086, + "grad_norm": 8.298654556274414, + "learning_rate": 5.074750888808969e-06, + "loss": 0.5369, + "mean_token_accuracy": 0.8408285826444626, + "num_tokens": 96395080.0, + "step": 80170 + }, + { + "entropy": 1.9049769386649131, + "epoch": 0.2485509846484583, + "grad_norm": 8.685126304626465, + "learning_rate": 5.074434415098235e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8457057103514671, + "num_tokens": 96406602.0, + "step": 80180 + }, + { + "entropy": 1.8694998741149902, + "epoch": 0.248581983773508, + "grad_norm": 9.182705879211426, + "learning_rate": 5.07411800058831e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8451626107096673, + "num_tokens": 96418323.0, + "step": 80190 + }, + { + "entropy": 1.8484734997153283, + "epoch": 0.24861298289855768, + "grad_norm": 8.897099494934082, + "learning_rate": 5.0738016452607374e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8333059296011924, + "num_tokens": 96430631.0, + "step": 80200 + }, + { + "entropy": 1.9490864470601081, + "epoch": 0.24864398202360738, + "grad_norm": 8.925657272338867, + "learning_rate": 5.073485349097073e-06, + "loss": 0.5366, + "mean_token_accuracy": 0.8292359367012978, + "num_tokens": 96442572.0, + "step": 80210 + }, + { + "entropy": 1.9751644432544708, + "epoch": 0.24867498114865708, + "grad_norm": 8.735760688781738, + "learning_rate": 5.073169112078877e-06, + "loss": 0.5984, + "mean_token_accuracy": 0.8311158329248428, + "num_tokens": 96453557.0, + "step": 80220 + }, + { + "entropy": 1.8875069230794908, + "epoch": 0.24870598027370677, + "grad_norm": 3.8038547039031982, + "learning_rate": 5.072852934187719e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8413284301757813, + "num_tokens": 96465184.0, + "step": 80230 + }, + { + "entropy": 1.9452220991253852, + "epoch": 0.24873697939875647, + "grad_norm": 10.38296127319336, + "learning_rate": 5.072536815405176e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.8376503065228462, + "num_tokens": 96476422.0, + "step": 80240 + }, + { + "entropy": 1.9426960051059723, + "epoch": 0.24876797852380617, + "grad_norm": 8.761679649353027, + "learning_rate": 5.072220755712832e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8422357112169265, + "num_tokens": 96487607.0, + "step": 80250 + }, + { + "entropy": 1.968003484606743, + "epoch": 0.24879897764885586, + "grad_norm": 9.588860511779785, + "learning_rate": 5.071904755092282e-06, + "loss": 0.537, + "mean_token_accuracy": 0.8418572053313256, + "num_tokens": 96498517.0, + "step": 80260 + }, + { + "entropy": 1.798211957514286, + "epoch": 0.24882997677390556, + "grad_norm": 4.688083648681641, + "learning_rate": 5.071588813525126e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8539124757051468, + "num_tokens": 96511459.0, + "step": 80270 + }, + { + "entropy": 1.9249870374798774, + "epoch": 0.24886097589895526, + "grad_norm": 9.468367576599121, + "learning_rate": 5.071272930992976e-06, + "loss": 0.5748, + "mean_token_accuracy": 0.8211545005440712, + "num_tokens": 96523206.0, + "step": 80280 + }, + { + "entropy": 1.9651254445314408, + "epoch": 0.24889197502400495, + "grad_norm": 8.676048278808594, + "learning_rate": 5.070957107477445e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.8506427973508834, + "num_tokens": 96534006.0, + "step": 80290 + }, + { + "entropy": 1.9338118746876716, + "epoch": 0.24892297414905465, + "grad_norm": 10.065473556518555, + "learning_rate": 5.070641342960163e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.8366307452321052, + "num_tokens": 96545681.0, + "step": 80300 + }, + { + "entropy": 1.9056643381714822, + "epoch": 0.24895397327410435, + "grad_norm": 7.843869686126709, + "learning_rate": 5.070325637422762e-06, + "loss": 0.5366, + "mean_token_accuracy": 0.8389701470732689, + "num_tokens": 96557894.0, + "step": 80310 + }, + { + "entropy": 1.766680945456028, + "epoch": 0.24898497239915404, + "grad_norm": 8.891898155212402, + "learning_rate": 5.070009990846881e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8574616640806199, + "num_tokens": 96572023.0, + "step": 80320 + }, + { + "entropy": 1.873360113799572, + "epoch": 0.24901597152420374, + "grad_norm": 8.903762817382812, + "learning_rate": 5.069694403214172e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8422929286956787, + "num_tokens": 96584169.0, + "step": 80330 + }, + { + "entropy": 1.949498575925827, + "epoch": 0.24904697064925344, + "grad_norm": 5.78523063659668, + "learning_rate": 5.069378874506292e-06, + "loss": 0.5428, + "mean_token_accuracy": 0.8299229055643081, + "num_tokens": 96596779.0, + "step": 80340 + }, + { + "entropy": 1.8998542904853821, + "epoch": 0.24907796977430313, + "grad_norm": 8.483941078186035, + "learning_rate": 5.069063404704906e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8469171851873398, + "num_tokens": 96608569.0, + "step": 80350 + }, + { + "entropy": 1.8074628964066506, + "epoch": 0.24910896889935283, + "grad_norm": 8.821971893310547, + "learning_rate": 5.068747993791688e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.850117315351963, + "num_tokens": 96620814.0, + "step": 80360 + }, + { + "entropy": 1.818614599108696, + "epoch": 0.24913996802440252, + "grad_norm": 8.12628173828125, + "learning_rate": 5.068432641748318e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8480607718229294, + "num_tokens": 96634266.0, + "step": 80370 + }, + { + "entropy": 1.865596404671669, + "epoch": 0.2491709671494522, + "grad_norm": 7.75429630279541, + "learning_rate": 5.068117348556486e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8603037342429161, + "num_tokens": 96646716.0, + "step": 80380 + }, + { + "entropy": 1.9810962960124017, + "epoch": 0.2492019662745019, + "grad_norm": 9.098136901855469, + "learning_rate": 5.06780211419789e-06, + "loss": 0.508, + "mean_token_accuracy": 0.841071504354477, + "num_tokens": 96657927.0, + "step": 80390 + }, + { + "entropy": 1.8735760763287543, + "epoch": 0.2492329653995516, + "grad_norm": 8.33572769165039, + "learning_rate": 5.067486938654235e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8473378553986549, + "num_tokens": 96670616.0, + "step": 80400 + }, + { + "entropy": 1.9213013619184494, + "epoch": 0.24926396452460128, + "grad_norm": 8.421183586120605, + "learning_rate": 5.067171821907233e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8397013619542122, + "num_tokens": 96682228.0, + "step": 80410 + }, + { + "entropy": 1.8531699359416962, + "epoch": 0.24929496364965098, + "grad_norm": 7.683859825134277, + "learning_rate": 5.066856763938607e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8473792493343353, + "num_tokens": 96695485.0, + "step": 80420 + }, + { + "entropy": 1.8864355862140656, + "epoch": 0.24932596277470068, + "grad_norm": 8.114187240600586, + "learning_rate": 5.066541764730085e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8485756784677505, + "num_tokens": 96707903.0, + "step": 80430 + }, + { + "entropy": 1.9324491962790489, + "epoch": 0.24935696189975037, + "grad_norm": 7.229660987854004, + "learning_rate": 5.066226824263405e-06, + "loss": 0.524, + "mean_token_accuracy": 0.8466485366225243, + "num_tokens": 96719778.0, + "step": 80440 + }, + { + "entropy": 1.9333231955766679, + "epoch": 0.24938796102480007, + "grad_norm": 10.796730041503906, + "learning_rate": 5.0659119425203116e-06, + "loss": 0.5191, + "mean_token_accuracy": 0.8371896788477897, + "num_tokens": 96731297.0, + "step": 80450 + }, + { + "entropy": 1.8949188530445098, + "epoch": 0.24941896014984977, + "grad_norm": 8.327264785766602, + "learning_rate": 5.0655971194825586e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.840109933912754, + "num_tokens": 96743476.0, + "step": 80460 + }, + { + "entropy": 1.9443549513816833, + "epoch": 0.24944995927489946, + "grad_norm": 8.616156578063965, + "learning_rate": 5.065282355131904e-06, + "loss": 0.5149, + "mean_token_accuracy": 0.8376484885811806, + "num_tokens": 96755299.0, + "step": 80470 + }, + { + "entropy": 1.8526927456259727, + "epoch": 0.24948095839994916, + "grad_norm": 4.920670032501221, + "learning_rate": 5.06496764945012e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8482913061976433, + "num_tokens": 96768603.0, + "step": 80480 + }, + { + "entropy": 1.9240273088216782, + "epoch": 0.24951195752499886, + "grad_norm": 7.758111000061035, + "learning_rate": 5.064653002418982e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8401697605848313, + "num_tokens": 96780005.0, + "step": 80490 + }, + { + "entropy": 1.8932354971766472, + "epoch": 0.24954295665004855, + "grad_norm": 8.145684242248535, + "learning_rate": 5.064338414020274e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8508092865347863, + "num_tokens": 96792344.0, + "step": 80500 + }, + { + "entropy": 1.8841583669185638, + "epoch": 0.24957395577509825, + "grad_norm": 7.790008068084717, + "learning_rate": 5.064023884235791e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8376784399151802, + "num_tokens": 96804429.0, + "step": 80510 + }, + { + "entropy": 1.905587163567543, + "epoch": 0.24960495490014795, + "grad_norm": 8.785361289978027, + "learning_rate": 5.063709413047332e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8395880877971649, + "num_tokens": 96815913.0, + "step": 80520 + }, + { + "entropy": 1.884190782904625, + "epoch": 0.24963595402519764, + "grad_norm": 7.5868659019470215, + "learning_rate": 5.063395000436705e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8370744809508324, + "num_tokens": 96828062.0, + "step": 80530 + }, + { + "entropy": 1.9852372884750367, + "epoch": 0.24966695315024734, + "grad_norm": 7.919397354125977, + "learning_rate": 5.063080646385727e-06, + "loss": 0.5456, + "mean_token_accuracy": 0.8372212365269661, + "num_tokens": 96838879.0, + "step": 80540 + }, + { + "entropy": 1.8885437712073325, + "epoch": 0.24969795227529704, + "grad_norm": 8.829280853271484, + "learning_rate": 5.062766350876223e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8520119696855545, + "num_tokens": 96851447.0, + "step": 80550 + }, + { + "entropy": 1.9246813550591468, + "epoch": 0.24972895140034673, + "grad_norm": 9.468807220458984, + "learning_rate": 5.062452113890023e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8505311653017997, + "num_tokens": 96863095.0, + "step": 80560 + }, + { + "entropy": 1.9204016119241714, + "epoch": 0.24975995052539643, + "grad_norm": 9.094423294067383, + "learning_rate": 5.06213793540897e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8493364602327347, + "num_tokens": 96874536.0, + "step": 80570 + }, + { + "entropy": 1.8319402411580086, + "epoch": 0.24979094965044613, + "grad_norm": 6.896250247955322, + "learning_rate": 5.061823815414909e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8573963329195976, + "num_tokens": 96887175.0, + "step": 80580 + }, + { + "entropy": 1.8221399798989295, + "epoch": 0.24982194877549582, + "grad_norm": 8.107512474060059, + "learning_rate": 5.061509753889697e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8606969341635704, + "num_tokens": 96899578.0, + "step": 80590 + }, + { + "entropy": 1.8691432937979697, + "epoch": 0.24985294790054552, + "grad_norm": 4.902885913848877, + "learning_rate": 5.0611957508152e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8467679604887962, + "num_tokens": 96911585.0, + "step": 80600 + }, + { + "entropy": 1.8758586004376412, + "epoch": 0.24988394702559522, + "grad_norm": 9.201493263244629, + "learning_rate": 5.0608818061732855e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8440716713666916, + "num_tokens": 96923853.0, + "step": 80610 + }, + { + "entropy": 1.9490404814481734, + "epoch": 0.24991494615064488, + "grad_norm": 9.697155952453613, + "learning_rate": 5.0605679199458365e-06, + "loss": 0.5383, + "mean_token_accuracy": 0.8389123737812042, + "num_tokens": 96935124.0, + "step": 80620 + }, + { + "entropy": 1.86717938631773, + "epoch": 0.24994594527569458, + "grad_norm": 8.20489501953125, + "learning_rate": 5.060254092114738e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8462685376405716, + "num_tokens": 96946886.0, + "step": 80630 + }, + { + "entropy": 1.856597825884819, + "epoch": 0.24997694440074428, + "grad_norm": 8.653462409973145, + "learning_rate": 5.059940322661886e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8464651450514793, + "num_tokens": 96958754.0, + "step": 80640 + }, + { + "entropy": 1.9769204616546632, + "epoch": 0.250007943525794, + "grad_norm": 8.563497543334961, + "learning_rate": 5.059626611569183e-06, + "loss": 0.527, + "mean_token_accuracy": 0.8379195779561996, + "num_tokens": 96969763.0, + "step": 80650 + }, + { + "entropy": 1.8927036389708518, + "epoch": 0.2500389426508437, + "grad_norm": 10.38947868347168, + "learning_rate": 5.059312958818542e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8479386597871781, + "num_tokens": 96981550.0, + "step": 80660 + }, + { + "entropy": 1.9301312297582627, + "epoch": 0.25006994177589337, + "grad_norm": 10.17356014251709, + "learning_rate": 5.058999364391879e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8410406053066254, + "num_tokens": 96992490.0, + "step": 80670 + }, + { + "entropy": 1.89289371073246, + "epoch": 0.2501009409009431, + "grad_norm": 7.985718250274658, + "learning_rate": 5.058685828271122e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8415110245347023, + "num_tokens": 97004316.0, + "step": 80680 + }, + { + "entropy": 1.9630913496017457, + "epoch": 0.25013194002599276, + "grad_norm": 7.961121559143066, + "learning_rate": 5.0583723504382044e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8376610234379769, + "num_tokens": 97015485.0, + "step": 80690 + }, + { + "entropy": 1.934187889099121, + "epoch": 0.2501629391510425, + "grad_norm": 8.046591758728027, + "learning_rate": 5.05805893087507e-06, + "loss": 0.5516, + "mean_token_accuracy": 0.8271699488162995, + "num_tokens": 97027583.0, + "step": 80700 + }, + { + "entropy": 1.9274443075060845, + "epoch": 0.25019393827609215, + "grad_norm": 3.1628236770629883, + "learning_rate": 5.057745569563669e-06, + "loss": 0.5329, + "mean_token_accuracy": 0.8287164881825447, + "num_tokens": 97039330.0, + "step": 80710 + }, + { + "entropy": 1.9359917491674423, + "epoch": 0.2502249374011419, + "grad_norm": 7.84744119644165, + "learning_rate": 5.057432266485958e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8385084256529808, + "num_tokens": 97050574.0, + "step": 80720 + }, + { + "entropy": 1.9594187870621682, + "epoch": 0.25025593652619155, + "grad_norm": 7.170385837554932, + "learning_rate": 5.057119021623903e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8456411883234978, + "num_tokens": 97062756.0, + "step": 80730 + }, + { + "entropy": 1.9243200287222861, + "epoch": 0.25028693565124127, + "grad_norm": 7.072485446929932, + "learning_rate": 5.056805834959478e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.8350550681352615, + "num_tokens": 97074540.0, + "step": 80740 + }, + { + "entropy": 1.9237784013152122, + "epoch": 0.25031793477629094, + "grad_norm": 8.005240440368652, + "learning_rate": 5.056492706474664e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.8389381229877472, + "num_tokens": 97086382.0, + "step": 80750 + }, + { + "entropy": 1.939412146806717, + "epoch": 0.2503489339013406, + "grad_norm": 8.643763542175293, + "learning_rate": 5.056179636151449e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.847502426803112, + "num_tokens": 97097391.0, + "step": 80760 + }, + { + "entropy": 1.9007414281368256, + "epoch": 0.25037993302639033, + "grad_norm": 5.617809295654297, + "learning_rate": 5.055866623971834e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8450678214430809, + "num_tokens": 97109904.0, + "step": 80770 + }, + { + "entropy": 1.9128052070736885, + "epoch": 0.25041093215144, + "grad_norm": 10.466955184936523, + "learning_rate": 5.05555366991782e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8468824103474617, + "num_tokens": 97121730.0, + "step": 80780 + }, + { + "entropy": 1.9782243341207504, + "epoch": 0.2504419312764897, + "grad_norm": 9.422632217407227, + "learning_rate": 5.0552407739714205e-06, + "loss": 0.5243, + "mean_token_accuracy": 0.8376658350229264, + "num_tokens": 97133221.0, + "step": 80790 + }, + { + "entropy": 1.8325415551662445, + "epoch": 0.2504729304015394, + "grad_norm": 4.565164089202881, + "learning_rate": 5.0549279361146554e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8581367552280426, + "num_tokens": 97145983.0, + "step": 80800 + }, + { + "entropy": 1.8578097239136695, + "epoch": 0.2505039295265891, + "grad_norm": 8.166582107543945, + "learning_rate": 5.0546151563295545e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8440769106149674, + "num_tokens": 97159728.0, + "step": 80810 + }, + { + "entropy": 1.8590758338570594, + "epoch": 0.2505349286516388, + "grad_norm": 10.394712448120117, + "learning_rate": 5.054302434598153e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8476429939270019, + "num_tokens": 97172401.0, + "step": 80820 + }, + { + "entropy": 1.8516904532909393, + "epoch": 0.2505659277766885, + "grad_norm": 4.466270923614502, + "learning_rate": 5.053989770902494e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8435142487287521, + "num_tokens": 97184703.0, + "step": 80830 + }, + { + "entropy": 1.9154213652014733, + "epoch": 0.2505969269017382, + "grad_norm": 3.9221031665802, + "learning_rate": 5.053677165224629e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8629103854298592, + "num_tokens": 97195885.0, + "step": 80840 + }, + { + "entropy": 1.8582031592726707, + "epoch": 0.2506279260267879, + "grad_norm": 4.536048412322998, + "learning_rate": 5.053364617546619e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8564950287342071, + "num_tokens": 97207732.0, + "step": 80850 + }, + { + "entropy": 1.953849881887436, + "epoch": 0.2506589251518376, + "grad_norm": 7.936028003692627, + "learning_rate": 5.05305212785053e-06, + "loss": 0.5772, + "mean_token_accuracy": 0.838682298362255, + "num_tokens": 97218464.0, + "step": 80860 + }, + { + "entropy": 1.91454386562109, + "epoch": 0.2506899242768873, + "grad_norm": 10.065215110778809, + "learning_rate": 5.052739696118435e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.837799771130085, + "num_tokens": 97230250.0, + "step": 80870 + }, + { + "entropy": 1.8838393643498421, + "epoch": 0.25072092340193697, + "grad_norm": 9.204728126525879, + "learning_rate": 5.05242732233242e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8544922456145286, + "num_tokens": 97242703.0, + "step": 80880 + }, + { + "entropy": 1.7693653479218483, + "epoch": 0.2507519225269867, + "grad_norm": 2.456925392150879, + "learning_rate": 5.052115006474571e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8611262261867523, + "num_tokens": 97256526.0, + "step": 80890 + }, + { + "entropy": 1.830974417924881, + "epoch": 0.25078292165203636, + "grad_norm": 9.351210594177246, + "learning_rate": 5.051802748526991e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8481344610452652, + "num_tokens": 97268711.0, + "step": 80900 + }, + { + "entropy": 1.8810481294989585, + "epoch": 0.2508139207770861, + "grad_norm": 7.888736724853516, + "learning_rate": 5.051490548471781e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8467276513576507, + "num_tokens": 97281167.0, + "step": 80910 + }, + { + "entropy": 1.9074201628565788, + "epoch": 0.25084491990213575, + "grad_norm": 9.632227897644043, + "learning_rate": 5.051178406291058e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8443691492080688, + "num_tokens": 97293034.0, + "step": 80920 + }, + { + "entropy": 1.9555223256349563, + "epoch": 0.2508759190271855, + "grad_norm": 9.613581657409668, + "learning_rate": 5.050866321966943e-06, + "loss": 0.5608, + "mean_token_accuracy": 0.8227898702025414, + "num_tokens": 97304238.0, + "step": 80930 + }, + { + "entropy": 1.852898570895195, + "epoch": 0.25090691815223515, + "grad_norm": 4.1173553466796875, + "learning_rate": 5.050554295481563e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8532208919525146, + "num_tokens": 97317260.0, + "step": 80940 + }, + { + "entropy": 1.9421288311481475, + "epoch": 0.25093791727728487, + "grad_norm": 8.63619613647461, + "learning_rate": 5.0502423268170556e-06, + "loss": 0.5326, + "mean_token_accuracy": 0.8310228928923606, + "num_tokens": 97329358.0, + "step": 80950 + }, + { + "entropy": 1.9374548494815826, + "epoch": 0.25096891640233454, + "grad_norm": 8.348370552062988, + "learning_rate": 5.049930415955566e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8462329894304276, + "num_tokens": 97340759.0, + "step": 80960 + }, + { + "entropy": 1.9339235559105874, + "epoch": 0.25099991552738427, + "grad_norm": 3.9944283962249756, + "learning_rate": 5.049618562879247e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.8473946884274483, + "num_tokens": 97351791.0, + "step": 80970 + }, + { + "entropy": 1.936272232234478, + "epoch": 0.25103091465243393, + "grad_norm": 3.7326107025146484, + "learning_rate": 5.049306767570257e-06, + "loss": 0.5225, + "mean_token_accuracy": 0.8310480803251267, + "num_tokens": 97363291.0, + "step": 80980 + }, + { + "entropy": 1.924396450817585, + "epoch": 0.2510619137774836, + "grad_norm": 8.854395866394043, + "learning_rate": 5.048995030010763e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8389732867479325, + "num_tokens": 97374667.0, + "step": 80990 + }, + { + "entropy": 1.9151296749711038, + "epoch": 0.2510929129025333, + "grad_norm": 8.511651992797852, + "learning_rate": 5.048683350182941e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8510990381240845, + "num_tokens": 97385441.0, + "step": 81000 + }, + { + "entropy": 1.9651855736970902, + "epoch": 0.251123912027583, + "grad_norm": 8.566028594970703, + "learning_rate": 5.048371728068976e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8424704790115356, + "num_tokens": 97396228.0, + "step": 81010 + }, + { + "entropy": 1.8919865861535072, + "epoch": 0.2511549111526327, + "grad_norm": 7.970651149749756, + "learning_rate": 5.048060163651056e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8576287001371383, + "num_tokens": 97408337.0, + "step": 81020 + }, + { + "entropy": 1.8696946695446968, + "epoch": 0.2511859102776824, + "grad_norm": 8.631657600402832, + "learning_rate": 5.047748656911381e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8475007191300392, + "num_tokens": 97420330.0, + "step": 81030 + }, + { + "entropy": 1.8537717416882515, + "epoch": 0.2512169094027321, + "grad_norm": 8.173912048339844, + "learning_rate": 5.047437207832157e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8481451213359833, + "num_tokens": 97432563.0, + "step": 81040 + }, + { + "entropy": 1.8485816575586795, + "epoch": 0.2512479085277818, + "grad_norm": 9.571538925170898, + "learning_rate": 5.047125816395597e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8497818827629089, + "num_tokens": 97445205.0, + "step": 81050 + }, + { + "entropy": 1.9077823638916016, + "epoch": 0.2512789076528315, + "grad_norm": 9.712011337280273, + "learning_rate": 5.046814482583923e-06, + "loss": 0.532, + "mean_token_accuracy": 0.8364897713065147, + "num_tokens": 97456191.0, + "step": 81060 + }, + { + "entropy": 1.9123283997178078, + "epoch": 0.2513099067778812, + "grad_norm": 3.9969263076782227, + "learning_rate": 5.046503206379363e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8428677946329117, + "num_tokens": 97468188.0, + "step": 81070 + }, + { + "entropy": 1.8334277987480163, + "epoch": 0.2513409059029309, + "grad_norm": 8.850257873535156, + "learning_rate": 5.046191987764155e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8447373628616333, + "num_tokens": 97481082.0, + "step": 81080 + }, + { + "entropy": 1.9028025731444358, + "epoch": 0.25137190502798057, + "grad_norm": 8.100180625915527, + "learning_rate": 5.045880826720544e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8357649400830269, + "num_tokens": 97492640.0, + "step": 81090 + }, + { + "entropy": 1.9044124081730842, + "epoch": 0.2514029041530303, + "grad_norm": 7.797017574310303, + "learning_rate": 5.045569723230781e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8423758924007416, + "num_tokens": 97503638.0, + "step": 81100 + }, + { + "entropy": 1.9882340669631957, + "epoch": 0.25143390327807996, + "grad_norm": 7.203917980194092, + "learning_rate": 5.045258677277125e-06, + "loss": 0.557, + "mean_token_accuracy": 0.8339864879846572, + "num_tokens": 97514886.0, + "step": 81110 + }, + { + "entropy": 1.9591509833931924, + "epoch": 0.2514649024031297, + "grad_norm": 7.691939830780029, + "learning_rate": 5.044947688841846e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8347377508878708, + "num_tokens": 97526189.0, + "step": 81120 + }, + { + "entropy": 1.8668823108077048, + "epoch": 0.25149590152817936, + "grad_norm": 4.000504970550537, + "learning_rate": 5.044636757907217e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8517090499401092, + "num_tokens": 97538527.0, + "step": 81130 + }, + { + "entropy": 1.8323442935943604, + "epoch": 0.2515269006532291, + "grad_norm": 9.106334686279297, + "learning_rate": 5.044325884455522e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8402392759919166, + "num_tokens": 97551259.0, + "step": 81140 + }, + { + "entropy": 1.8727603957057, + "epoch": 0.25155789977827875, + "grad_norm": 4.572244167327881, + "learning_rate": 5.04401506846905e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8374785885214806, + "num_tokens": 97563511.0, + "step": 81150 + }, + { + "entropy": 1.858438329398632, + "epoch": 0.2515888989033285, + "grad_norm": 7.100371360778809, + "learning_rate": 5.0437043099301006e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8520154163241387, + "num_tokens": 97575837.0, + "step": 81160 + }, + { + "entropy": 1.8653832495212554, + "epoch": 0.25161989802837814, + "grad_norm": 7.727576732635498, + "learning_rate": 5.043393608820979e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8588355794548989, + "num_tokens": 97587041.0, + "step": 81170 + }, + { + "entropy": 1.8832016855478286, + "epoch": 0.25165089715342787, + "grad_norm": 9.95567512512207, + "learning_rate": 5.043082965123996e-06, + "loss": 0.5331, + "mean_token_accuracy": 0.8414867803454399, + "num_tokens": 97598721.0, + "step": 81180 + }, + { + "entropy": 1.9306304231286049, + "epoch": 0.25168189627847753, + "grad_norm": 9.012680053710938, + "learning_rate": 5.042772378821477e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8460228756070137, + "num_tokens": 97609705.0, + "step": 81190 + }, + { + "entropy": 1.8687114715576172, + "epoch": 0.25171289540352726, + "grad_norm": 7.753897190093994, + "learning_rate": 5.042461849895747e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8424293950200081, + "num_tokens": 97622075.0, + "step": 81200 + }, + { + "entropy": 1.8677958205342293, + "epoch": 0.25174389452857693, + "grad_norm": 9.426398277282715, + "learning_rate": 5.0421513783291445e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8376675888895988, + "num_tokens": 97634437.0, + "step": 81210 + }, + { + "entropy": 1.85673858076334, + "epoch": 0.25177489365362665, + "grad_norm": 3.3894803524017334, + "learning_rate": 5.04184096410401e-06, + "loss": 0.449, + "mean_token_accuracy": 0.849704897403717, + "num_tokens": 97647188.0, + "step": 81220 + }, + { + "entropy": 1.8845996797084807, + "epoch": 0.2518058927786763, + "grad_norm": 4.1136322021484375, + "learning_rate": 5.041530607202698e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8388149186968803, + "num_tokens": 97659181.0, + "step": 81230 + }, + { + "entropy": 1.8818348929286004, + "epoch": 0.251836891903726, + "grad_norm": 8.5837984085083, + "learning_rate": 5.041220307607568e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8552572011947632, + "num_tokens": 97670943.0, + "step": 81240 + }, + { + "entropy": 1.9214624166488647, + "epoch": 0.2518678910287757, + "grad_norm": 8.553311347961426, + "learning_rate": 5.040910065300984e-06, + "loss": 0.5291, + "mean_token_accuracy": 0.8412210613489151, + "num_tokens": 97682965.0, + "step": 81250 + }, + { + "entropy": 1.7995470568537713, + "epoch": 0.2518988901538254, + "grad_norm": 10.36379623413086, + "learning_rate": 5.04059988026532e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8609731838107109, + "num_tokens": 97696313.0, + "step": 81260 + }, + { + "entropy": 1.9689121127128602, + "epoch": 0.2519298892788751, + "grad_norm": 7.462158679962158, + "learning_rate": 5.0402897524829595e-06, + "loss": 0.5265, + "mean_token_accuracy": 0.8380709275603294, + "num_tokens": 97708096.0, + "step": 81270 + }, + { + "entropy": 1.9260016784071923, + "epoch": 0.2519608884039248, + "grad_norm": 3.709622859954834, + "learning_rate": 5.039979681936291e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8301331534981727, + "num_tokens": 97720285.0, + "step": 81280 + }, + { + "entropy": 1.868030358850956, + "epoch": 0.2519918875289745, + "grad_norm": 9.611808776855469, + "learning_rate": 5.039669668607713e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8606329753994941, + "num_tokens": 97733109.0, + "step": 81290 + }, + { + "entropy": 1.8924655795097352, + "epoch": 0.25202288665402417, + "grad_norm": 9.830230712890625, + "learning_rate": 5.039359712479628e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8400240853428841, + "num_tokens": 97745924.0, + "step": 81300 + }, + { + "entropy": 1.900289523601532, + "epoch": 0.2520538857790739, + "grad_norm": 7.719475746154785, + "learning_rate": 5.039049813534448e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.84294343739748, + "num_tokens": 97758200.0, + "step": 81310 + }, + { + "entropy": 1.8474449053406716, + "epoch": 0.25208488490412356, + "grad_norm": 4.5430684089660645, + "learning_rate": 5.0387399717545945e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8552279889583587, + "num_tokens": 97770861.0, + "step": 81320 + }, + { + "entropy": 1.965096142888069, + "epoch": 0.2521158840291733, + "grad_norm": 10.388503074645996, + "learning_rate": 5.038430187122494e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8290927574038506, + "num_tokens": 97783190.0, + "step": 81330 + }, + { + "entropy": 1.9893094927072525, + "epoch": 0.25214688315422296, + "grad_norm": 7.405788898468018, + "learning_rate": 5.03812045962058e-06, + "loss": 0.5311, + "mean_token_accuracy": 0.8353447288274765, + "num_tokens": 97794623.0, + "step": 81340 + }, + { + "entropy": 1.9728745833039283, + "epoch": 0.2521778822792727, + "grad_norm": 7.6129045486450195, + "learning_rate": 5.037810789231295e-06, + "loss": 0.5364, + "mean_token_accuracy": 0.8378026425838471, + "num_tokens": 97806542.0, + "step": 81350 + }, + { + "entropy": 1.9820994019508362, + "epoch": 0.25220888140432235, + "grad_norm": 8.426169395446777, + "learning_rate": 5.0375011759370905e-06, + "loss": 0.5264, + "mean_token_accuracy": 0.8454423397779465, + "num_tokens": 97817791.0, + "step": 81360 + }, + { + "entropy": 1.9337128177285194, + "epoch": 0.2522398805293721, + "grad_norm": 9.23006534576416, + "learning_rate": 5.037191619720424e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.8424638077616692, + "num_tokens": 97830448.0, + "step": 81370 + }, + { + "entropy": 1.8902158245444298, + "epoch": 0.25227087965442174, + "grad_norm": 8.688264846801758, + "learning_rate": 5.036882120563758e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8471032664179802, + "num_tokens": 97842561.0, + "step": 81380 + }, + { + "entropy": 1.7683439910411836, + "epoch": 0.25230187877947147, + "grad_norm": 7.763362407684326, + "learning_rate": 5.036572678449568e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8493596822023392, + "num_tokens": 97856622.0, + "step": 81390 + }, + { + "entropy": 1.9423942849040032, + "epoch": 0.25233287790452114, + "grad_norm": 7.845071792602539, + "learning_rate": 5.036263293360331e-06, + "loss": 0.5274, + "mean_token_accuracy": 0.8349541559815407, + "num_tokens": 97867976.0, + "step": 81400 + }, + { + "entropy": 1.9156456768512726, + "epoch": 0.25236387702957086, + "grad_norm": 9.051328659057617, + "learning_rate": 5.035953965278539e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.8346843421459198, + "num_tokens": 97880411.0, + "step": 81410 + }, + { + "entropy": 1.9632582008838653, + "epoch": 0.25239487615462053, + "grad_norm": 8.78569507598877, + "learning_rate": 5.035644694186681e-06, + "loss": 0.5321, + "mean_token_accuracy": 0.8315117686986924, + "num_tokens": 97891776.0, + "step": 81420 + }, + { + "entropy": 1.9228404372930528, + "epoch": 0.25242587527967025, + "grad_norm": 3.864513397216797, + "learning_rate": 5.035335480067265e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8493173897266388, + "num_tokens": 97903613.0, + "step": 81430 + }, + { + "entropy": 1.8714887380599976, + "epoch": 0.2524568744047199, + "grad_norm": 2.324655294418335, + "learning_rate": 5.035026322902799e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8534365460276604, + "num_tokens": 97915252.0, + "step": 81440 + }, + { + "entropy": 1.9306543365120887, + "epoch": 0.25248787352976965, + "grad_norm": 3.5631608963012695, + "learning_rate": 5.0347172226758e-06, + "loss": 0.5313, + "mean_token_accuracy": 0.8418522760272026, + "num_tokens": 97926415.0, + "step": 81450 + }, + { + "entropy": 1.9040459290146827, + "epoch": 0.2525188726548193, + "grad_norm": 8.15766429901123, + "learning_rate": 5.034408179368794e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.8509362369775773, + "num_tokens": 97937394.0, + "step": 81460 + }, + { + "entropy": 1.8772794365882874, + "epoch": 0.25254987177986904, + "grad_norm": 4.18629789352417, + "learning_rate": 5.034099192964314e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8516110569238663, + "num_tokens": 97949121.0, + "step": 81470 + }, + { + "entropy": 1.9287880495190621, + "epoch": 0.2525808709049187, + "grad_norm": 9.347918510437012, + "learning_rate": 5.033790263444901e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8376461073756218, + "num_tokens": 97960754.0, + "step": 81480 + }, + { + "entropy": 1.9006264075636863, + "epoch": 0.2526118700299684, + "grad_norm": 4.023174285888672, + "learning_rate": 5.0334813907931005e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8348308220505715, + "num_tokens": 97972812.0, + "step": 81490 + }, + { + "entropy": 1.983270612359047, + "epoch": 0.2526428691550181, + "grad_norm": 10.707830429077148, + "learning_rate": 5.033172574991469e-06, + "loss": 0.5652, + "mean_token_accuracy": 0.8273524597287178, + "num_tokens": 97984573.0, + "step": 81500 + }, + { + "entropy": 1.8726325988769532, + "epoch": 0.25267386828006777, + "grad_norm": 9.031349182128906, + "learning_rate": 5.03286381602257e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8485777363181114, + "num_tokens": 97996152.0, + "step": 81510 + }, + { + "entropy": 1.9527307838201522, + "epoch": 0.2527048674051175, + "grad_norm": 8.993306159973145, + "learning_rate": 5.032555113868971e-06, + "loss": 0.5688, + "mean_token_accuracy": 0.8380294933915138, + "num_tokens": 98007120.0, + "step": 81520 + }, + { + "entropy": 1.9475104868412019, + "epoch": 0.25273586653016716, + "grad_norm": 8.436033248901367, + "learning_rate": 5.032246468513252e-06, + "loss": 0.5843, + "mean_token_accuracy": 0.8303594037890434, + "num_tokens": 98018955.0, + "step": 81530 + }, + { + "entropy": 1.8729032024741172, + "epoch": 0.2527668656552169, + "grad_norm": 7.274577617645264, + "learning_rate": 5.031937879937998e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8430945709347725, + "num_tokens": 98030512.0, + "step": 81540 + }, + { + "entropy": 1.8622186571359634, + "epoch": 0.25279786478026656, + "grad_norm": 7.648394584655762, + "learning_rate": 5.031629348125801e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8509231805801392, + "num_tokens": 98043454.0, + "step": 81550 + }, + { + "entropy": 1.8595874547958373, + "epoch": 0.2528288639053163, + "grad_norm": 8.356622695922852, + "learning_rate": 5.031320873059261e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8488967612385749, + "num_tokens": 98055991.0, + "step": 81560 + }, + { + "entropy": 1.8365911930799483, + "epoch": 0.25285986303036595, + "grad_norm": 3.591733932495117, + "learning_rate": 5.031012454720986e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8576977401971817, + "num_tokens": 98067770.0, + "step": 81570 + }, + { + "entropy": 1.7719722762703896, + "epoch": 0.2528908621554157, + "grad_norm": 3.6505837440490723, + "learning_rate": 5.03070409309359e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8484556913375855, + "num_tokens": 98081361.0, + "step": 81580 + }, + { + "entropy": 1.801680639386177, + "epoch": 0.25292186128046534, + "grad_norm": 8.050759315490723, + "learning_rate": 5.030395788159697e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8547103390097618, + "num_tokens": 98094649.0, + "step": 81590 + }, + { + "entropy": 1.9035234346985817, + "epoch": 0.25295286040551507, + "grad_norm": 9.848466873168945, + "learning_rate": 5.030087539901935e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.8474085479974747, + "num_tokens": 98106101.0, + "step": 81600 + }, + { + "entropy": 1.925628274679184, + "epoch": 0.25298385953056474, + "grad_norm": 7.4657206535339355, + "learning_rate": 5.0297793483029445e-06, + "loss": 0.5281, + "mean_token_accuracy": 0.8374746307730675, + "num_tokens": 98116953.0, + "step": 81610 + }, + { + "entropy": 1.8128420755267143, + "epoch": 0.25301485865561446, + "grad_norm": 8.32465934753418, + "learning_rate": 5.029471213345367e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.856282414495945, + "num_tokens": 98129159.0, + "step": 81620 + }, + { + "entropy": 1.8045406460762023, + "epoch": 0.25304585778066413, + "grad_norm": 8.122908592224121, + "learning_rate": 5.029163135011857e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8475161448121071, + "num_tokens": 98142941.0, + "step": 81630 + }, + { + "entropy": 1.9032706007361413, + "epoch": 0.25307685690571385, + "grad_norm": 7.461269855499268, + "learning_rate": 5.028855113285072e-06, + "loss": 0.5337, + "mean_token_accuracy": 0.8353031173348426, + "num_tokens": 98154626.0, + "step": 81640 + }, + { + "entropy": 1.8948104843497275, + "epoch": 0.2531078560307635, + "grad_norm": 3.860278606414795, + "learning_rate": 5.02854714814768e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8362452149391174, + "num_tokens": 98166557.0, + "step": 81650 + }, + { + "entropy": 1.9045625925064087, + "epoch": 0.25313885515581325, + "grad_norm": 8.716605186462402, + "learning_rate": 5.028239239582357e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8562442421913147, + "num_tokens": 98177738.0, + "step": 81660 + }, + { + "entropy": 1.7965163722634316, + "epoch": 0.2531698542808629, + "grad_norm": 7.601503849029541, + "learning_rate": 5.027931387571784e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8531433448195458, + "num_tokens": 98190832.0, + "step": 81670 + }, + { + "entropy": 1.9607613369822503, + "epoch": 0.25320085340591264, + "grad_norm": 10.182971000671387, + "learning_rate": 5.0276235920986505e-06, + "loss": 0.5278, + "mean_token_accuracy": 0.8343812167644501, + "num_tokens": 98202527.0, + "step": 81680 + }, + { + "entropy": 1.8433429718017578, + "epoch": 0.2532318525309623, + "grad_norm": 10.645054817199707, + "learning_rate": 5.027315853145653e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8503381237387657, + "num_tokens": 98215514.0, + "step": 81690 + }, + { + "entropy": 1.8184980183839798, + "epoch": 0.25326285165601203, + "grad_norm": 8.565644264221191, + "learning_rate": 5.0270081706954955e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8563423335552216, + "num_tokens": 98228449.0, + "step": 81700 + }, + { + "entropy": 1.9226618602871894, + "epoch": 0.2532938507810617, + "grad_norm": 8.709757804870605, + "learning_rate": 5.02670054473089e-06, + "loss": 0.5179, + "mean_token_accuracy": 0.841064678132534, + "num_tokens": 98240774.0, + "step": 81710 + }, + { + "entropy": 1.8866528853774072, + "epoch": 0.2533248499061114, + "grad_norm": 8.839398384094238, + "learning_rate": 5.0263929752345564e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8352084383368492, + "num_tokens": 98253172.0, + "step": 81720 + }, + { + "entropy": 1.9001985654234885, + "epoch": 0.2533558490311611, + "grad_norm": 7.948025226593018, + "learning_rate": 5.0260854621892196e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8508359596133233, + "num_tokens": 98265024.0, + "step": 81730 + }, + { + "entropy": 1.9735320836305619, + "epoch": 0.25338684815621076, + "grad_norm": 8.330632209777832, + "learning_rate": 5.0257780055776154e-06, + "loss": 0.5311, + "mean_token_accuracy": 0.8348188728094101, + "num_tokens": 98275695.0, + "step": 81740 + }, + { + "entropy": 1.852024681866169, + "epoch": 0.2534178472812605, + "grad_norm": 7.069023132324219, + "learning_rate": 5.025470605382483e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8471727296710014, + "num_tokens": 98288269.0, + "step": 81750 + }, + { + "entropy": 1.8877501636743546, + "epoch": 0.25344884640631016, + "grad_norm": 7.931328296661377, + "learning_rate": 5.0251632615865705e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8467486530542374, + "num_tokens": 98299962.0, + "step": 81760 + }, + { + "entropy": 1.9214854270219803, + "epoch": 0.2534798455313599, + "grad_norm": 8.707962989807129, + "learning_rate": 5.024855974172638e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.8390247240662575, + "num_tokens": 98311678.0, + "step": 81770 + }, + { + "entropy": 1.88422030210495, + "epoch": 0.25351084465640955, + "grad_norm": 8.946457862854004, + "learning_rate": 5.024548743123444e-06, + "loss": 0.5201, + "mean_token_accuracy": 0.8396681264042855, + "num_tokens": 98324228.0, + "step": 81780 + }, + { + "entropy": 1.8994328901171684, + "epoch": 0.2535418437814593, + "grad_norm": 8.10692024230957, + "learning_rate": 5.024241568421762e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8410519272089004, + "num_tokens": 98335294.0, + "step": 81790 + }, + { + "entropy": 1.9097337901592255, + "epoch": 0.25357284290650894, + "grad_norm": 3.829202175140381, + "learning_rate": 5.02393445005037e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8425879299640655, + "num_tokens": 98347548.0, + "step": 81800 + }, + { + "entropy": 1.864240688085556, + "epoch": 0.25360384203155867, + "grad_norm": 4.576921463012695, + "learning_rate": 5.0236273879920534e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8558398082852363, + "num_tokens": 98359543.0, + "step": 81810 + }, + { + "entropy": 1.885629440844059, + "epoch": 0.25363484115660834, + "grad_norm": 8.460371017456055, + "learning_rate": 5.023320382229604e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8493375033140182, + "num_tokens": 98370400.0, + "step": 81820 + }, + { + "entropy": 1.953327089548111, + "epoch": 0.25366584028165806, + "grad_norm": 9.160079956054688, + "learning_rate": 5.023013432745823e-06, + "loss": 0.532, + "mean_token_accuracy": 0.8363123252987862, + "num_tokens": 98381007.0, + "step": 81830 + }, + { + "entropy": 1.9075039952993393, + "epoch": 0.25369683940670773, + "grad_norm": 8.570027351379395, + "learning_rate": 5.022706539523518e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.8331753626465798, + "num_tokens": 98393129.0, + "step": 81840 + }, + { + "entropy": 1.9241829916834832, + "epoch": 0.25372783853175745, + "grad_norm": 3.9442126750946045, + "learning_rate": 5.022399702545504e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8449306398630142, + "num_tokens": 98405137.0, + "step": 81850 + }, + { + "entropy": 1.9484668985009193, + "epoch": 0.2537588376568071, + "grad_norm": 9.32621955871582, + "learning_rate": 5.022092921794602e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8393065080046653, + "num_tokens": 98416316.0, + "step": 81860 + }, + { + "entropy": 1.8186310246586799, + "epoch": 0.25378983678185685, + "grad_norm": 8.844595909118652, + "learning_rate": 5.021786197253644e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.84593296200037, + "num_tokens": 98429654.0, + "step": 81870 + }, + { + "entropy": 1.8217352986335755, + "epoch": 0.2538208359069065, + "grad_norm": 3.006070137023926, + "learning_rate": 5.021479528905465e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8578614339232444, + "num_tokens": 98442831.0, + "step": 81880 + }, + { + "entropy": 1.9646976083517074, + "epoch": 0.25385183503195624, + "grad_norm": 8.035730361938477, + "learning_rate": 5.02117291673291e-06, + "loss": 0.5245, + "mean_token_accuracy": 0.8446223840117455, + "num_tokens": 98454107.0, + "step": 81890 + }, + { + "entropy": 1.8987499296665191, + "epoch": 0.2538828341570059, + "grad_norm": 7.47988748550415, + "learning_rate": 5.02086636071883e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8420893490314484, + "num_tokens": 98466334.0, + "step": 81900 + }, + { + "entropy": 1.9272898733615875, + "epoch": 0.25391383328205563, + "grad_norm": 9.14543342590332, + "learning_rate": 5.020559860846086e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.841991500556469, + "num_tokens": 98477479.0, + "step": 81910 + }, + { + "entropy": 1.929322722554207, + "epoch": 0.2539448324071053, + "grad_norm": 10.226633071899414, + "learning_rate": 5.020253417097542e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.8391476318240165, + "num_tokens": 98489083.0, + "step": 81920 + }, + { + "entropy": 1.9254152104258537, + "epoch": 0.253975831532155, + "grad_norm": 4.747035026550293, + "learning_rate": 5.019947029456072e-06, + "loss": 0.5488, + "mean_token_accuracy": 0.841267442703247, + "num_tokens": 98500874.0, + "step": 81930 + }, + { + "entropy": 1.8812214568257333, + "epoch": 0.2540068306572047, + "grad_norm": 8.595871925354004, + "learning_rate": 5.019640697904557e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8475135773420334, + "num_tokens": 98513127.0, + "step": 81940 + }, + { + "entropy": 1.8378457948565483, + "epoch": 0.2540378297822544, + "grad_norm": 8.983741760253906, + "learning_rate": 5.019334422425887e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8528070077300072, + "num_tokens": 98526234.0, + "step": 81950 + }, + { + "entropy": 1.953259851038456, + "epoch": 0.2540688289073041, + "grad_norm": 8.572074890136719, + "learning_rate": 5.019028203002956e-06, + "loss": 0.5288, + "mean_token_accuracy": 0.8342129945755005, + "num_tokens": 98537899.0, + "step": 81960 + }, + { + "entropy": 1.9012918874621392, + "epoch": 0.2540998280323538, + "grad_norm": 7.741192817687988, + "learning_rate": 5.018722039618667e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8414129927754402, + "num_tokens": 98549927.0, + "step": 81970 + }, + { + "entropy": 1.890120567381382, + "epoch": 0.2541308271574035, + "grad_norm": 9.252531051635742, + "learning_rate": 5.018415932255929e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8448996976017952, + "num_tokens": 98562215.0, + "step": 81980 + }, + { + "entropy": 1.8428632244467735, + "epoch": 0.25416182628245315, + "grad_norm": 3.551053762435913, + "learning_rate": 5.0181098808976615e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.843362420797348, + "num_tokens": 98574911.0, + "step": 81990 + }, + { + "entropy": 1.9683369904756547, + "epoch": 0.2541928254075029, + "grad_norm": 7.981224060058594, + "learning_rate": 5.0178038855267885e-06, + "loss": 0.5373, + "mean_token_accuracy": 0.841914513707161, + "num_tokens": 98585178.0, + "step": 82000 + }, + { + "entropy": 1.8377211585640907, + "epoch": 0.25422382453255254, + "grad_norm": 3.9185397624969482, + "learning_rate": 5.017497946126241e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8608157888054848, + "num_tokens": 98598026.0, + "step": 82010 + }, + { + "entropy": 1.882440300285816, + "epoch": 0.25425482365760227, + "grad_norm": 10.865336418151855, + "learning_rate": 5.01719206267896e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8479091212153435, + "num_tokens": 98609992.0, + "step": 82020 + }, + { + "entropy": 1.8768093585968018, + "epoch": 0.25428582278265194, + "grad_norm": 9.238245010375977, + "learning_rate": 5.016886235167892e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8360858023166656, + "num_tokens": 98622049.0, + "step": 82030 + }, + { + "entropy": 1.9196750313043593, + "epoch": 0.25431682190770166, + "grad_norm": 10.01637077331543, + "learning_rate": 5.016580463575987e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8404786124825477, + "num_tokens": 98633923.0, + "step": 82040 + }, + { + "entropy": 1.9348924160003662, + "epoch": 0.25434782103275133, + "grad_norm": 8.29347038269043, + "learning_rate": 5.016274747886213e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.8385466530919075, + "num_tokens": 98646426.0, + "step": 82050 + }, + { + "entropy": 1.8654908925294875, + "epoch": 0.25437882015780106, + "grad_norm": 7.821435451507568, + "learning_rate": 5.0159690880815324e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8516849011182785, + "num_tokens": 98659280.0, + "step": 82060 + }, + { + "entropy": 1.9111699253320693, + "epoch": 0.2544098192828507, + "grad_norm": 7.601536273956299, + "learning_rate": 5.015663484144925e-06, + "loss": 0.5258, + "mean_token_accuracy": 0.83255735039711, + "num_tokens": 98671363.0, + "step": 82070 + }, + { + "entropy": 1.9060380011796951, + "epoch": 0.25444081840790045, + "grad_norm": 7.3169965744018555, + "learning_rate": 5.0153579360593696e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.8374595761299133, + "num_tokens": 98683802.0, + "step": 82080 + }, + { + "entropy": 1.93545441031456, + "epoch": 0.2544718175329501, + "grad_norm": 4.195225238800049, + "learning_rate": 5.015052443807861e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.846612386405468, + "num_tokens": 98695666.0, + "step": 82090 + }, + { + "entropy": 1.8932590246200562, + "epoch": 0.25450281665799984, + "grad_norm": 4.190147399902344, + "learning_rate": 5.014747007373392e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8346954330801963, + "num_tokens": 98707862.0, + "step": 82100 + }, + { + "entropy": 1.8419014051556588, + "epoch": 0.2545338157830495, + "grad_norm": 9.007070541381836, + "learning_rate": 5.014441626738971e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8538463622331619, + "num_tokens": 98719963.0, + "step": 82110 + }, + { + "entropy": 1.9747213378548623, + "epoch": 0.25456481490809924, + "grad_norm": 7.8509297370910645, + "learning_rate": 5.014136301887608e-06, + "loss": 0.5592, + "mean_token_accuracy": 0.8374496519565582, + "num_tokens": 98731159.0, + "step": 82120 + }, + { + "entropy": 1.950367696583271, + "epoch": 0.2545958140331489, + "grad_norm": 9.41358470916748, + "learning_rate": 5.013831032802323e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.847783799469471, + "num_tokens": 98742604.0, + "step": 82130 + }, + { + "entropy": 1.967973504960537, + "epoch": 0.25462681315819863, + "grad_norm": 8.383604049682617, + "learning_rate": 5.013525819466142e-06, + "loss": 0.51, + "mean_token_accuracy": 0.8421026647090912, + "num_tokens": 98754125.0, + "step": 82140 + }, + { + "entropy": 1.984065267443657, + "epoch": 0.2546578122832483, + "grad_norm": 8.877246856689453, + "learning_rate": 5.013220661862098e-06, + "loss": 0.5354, + "mean_token_accuracy": 0.8417667001485825, + "num_tokens": 98764605.0, + "step": 82150 + }, + { + "entropy": 1.878348208218813, + "epoch": 0.254688811408298, + "grad_norm": 3.9120850563049316, + "learning_rate": 5.012915559973233e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8392788946628571, + "num_tokens": 98777263.0, + "step": 82160 + }, + { + "entropy": 2.001929074525833, + "epoch": 0.2547198105333477, + "grad_norm": 8.413508415222168, + "learning_rate": 5.012610513782595e-06, + "loss": 0.5951, + "mean_token_accuracy": 0.8199850216507911, + "num_tokens": 98788877.0, + "step": 82170 + }, + { + "entropy": 1.9609450832009316, + "epoch": 0.2547508096583974, + "grad_norm": 9.111734390258789, + "learning_rate": 5.01230552327324e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.844335462152958, + "num_tokens": 98800367.0, + "step": 82180 + }, + { + "entropy": 1.9912359118461609, + "epoch": 0.2547818087834471, + "grad_norm": 9.116207122802734, + "learning_rate": 5.012000588428227e-06, + "loss": 0.5586, + "mean_token_accuracy": 0.8303110048174858, + "num_tokens": 98811373.0, + "step": 82190 + }, + { + "entropy": 1.8973192408680917, + "epoch": 0.2548128079084968, + "grad_norm": 8.382734298706055, + "learning_rate": 5.01169570923063e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8509496867656707, + "num_tokens": 98823915.0, + "step": 82200 + }, + { + "entropy": 1.888469786942005, + "epoch": 0.2548438070335465, + "grad_norm": 4.078553676605225, + "learning_rate": 5.011390885663524e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8516004219651222, + "num_tokens": 98835873.0, + "step": 82210 + }, + { + "entropy": 1.9228701919317246, + "epoch": 0.2548748061585962, + "grad_norm": 9.359138488769531, + "learning_rate": 5.011086117709992e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.8420409306883812, + "num_tokens": 98846828.0, + "step": 82220 + }, + { + "entropy": 1.9370484218001365, + "epoch": 0.25490580528364587, + "grad_norm": 9.458020210266113, + "learning_rate": 5.010781405353129e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.8457821294665336, + "num_tokens": 98858261.0, + "step": 82230 + }, + { + "entropy": 1.815031287074089, + "epoch": 0.25493680440869554, + "grad_norm": 7.948138236999512, + "learning_rate": 5.010476748576029e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8458429276943207, + "num_tokens": 98870539.0, + "step": 82240 + }, + { + "entropy": 1.9159539863467216, + "epoch": 0.25496780353374526, + "grad_norm": 4.097768306732178, + "learning_rate": 5.010172147361801e-06, + "loss": 0.542, + "mean_token_accuracy": 0.8348632201552391, + "num_tokens": 98882616.0, + "step": 82250 + }, + { + "entropy": 1.8219385787844657, + "epoch": 0.25499880265879493, + "grad_norm": 4.100628852844238, + "learning_rate": 5.009867601693556e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8387814164161682, + "num_tokens": 98895560.0, + "step": 82260 + }, + { + "entropy": 1.8934132039546967, + "epoch": 0.25502980178384466, + "grad_norm": 9.86856746673584, + "learning_rate": 5.009563111554415e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8422671303153038, + "num_tokens": 98906966.0, + "step": 82270 + }, + { + "entropy": 1.9088118925690651, + "epoch": 0.2550608009088943, + "grad_norm": 4.092050075531006, + "learning_rate": 5.009258676927506e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.845549589395523, + "num_tokens": 98918825.0, + "step": 82280 + }, + { + "entropy": 1.9305264785885812, + "epoch": 0.25509180003394405, + "grad_norm": 9.015393257141113, + "learning_rate": 5.008954297795962e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8419323205947876, + "num_tokens": 98930506.0, + "step": 82290 + }, + { + "entropy": 1.9115617856383325, + "epoch": 0.2551227991589937, + "grad_norm": 9.485702514648438, + "learning_rate": 5.008649974142926e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8377510070800781, + "num_tokens": 98941230.0, + "step": 82300 + }, + { + "entropy": 1.8585439771413803, + "epoch": 0.25515379828404344, + "grad_norm": 8.97275447845459, + "learning_rate": 5.0083457059515476e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8494518890976905, + "num_tokens": 98953271.0, + "step": 82310 + }, + { + "entropy": 1.781735722720623, + "epoch": 0.2551847974090931, + "grad_norm": 3.6371991634368896, + "learning_rate": 5.008041493204978e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8600785121321678, + "num_tokens": 98966568.0, + "step": 82320 + }, + { + "entropy": 1.9445871606469154, + "epoch": 0.25521579653414284, + "grad_norm": 7.9240899085998535, + "learning_rate": 5.007737335886387e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8479147583246232, + "num_tokens": 98977867.0, + "step": 82330 + }, + { + "entropy": 1.9001877725124359, + "epoch": 0.2552467956591925, + "grad_norm": 9.06245231628418, + "learning_rate": 5.007433233978939e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8466285720467568, + "num_tokens": 98990050.0, + "step": 82340 + }, + { + "entropy": 1.96756109893322, + "epoch": 0.25527779478424223, + "grad_norm": 7.101724624633789, + "learning_rate": 5.007129187465815e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8483997210860252, + "num_tokens": 99001143.0, + "step": 82350 + }, + { + "entropy": 1.9655298113822937, + "epoch": 0.2553087939092919, + "grad_norm": 11.712712287902832, + "learning_rate": 5.006825196330199e-06, + "loss": 0.5492, + "mean_token_accuracy": 0.8361737057566643, + "num_tokens": 99012612.0, + "step": 82360 + }, + { + "entropy": 1.8864617615938186, + "epoch": 0.2553397930343416, + "grad_norm": 9.521370887756348, + "learning_rate": 5.006521260555282e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8489149019122124, + "num_tokens": 99024254.0, + "step": 82370 + }, + { + "entropy": 1.9336427330970765, + "epoch": 0.2553707921593913, + "grad_norm": 8.10055923461914, + "learning_rate": 5.006217380124263e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8430611327290535, + "num_tokens": 99035921.0, + "step": 82380 + }, + { + "entropy": 1.8473542481660843, + "epoch": 0.255401791284441, + "grad_norm": 3.776082992553711, + "learning_rate": 5.005913555020348e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8381215184926987, + "num_tokens": 99048817.0, + "step": 82390 + }, + { + "entropy": 1.9615378364920617, + "epoch": 0.2554327904094907, + "grad_norm": 4.9302215576171875, + "learning_rate": 5.00560978522675e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8320355072617531, + "num_tokens": 99060522.0, + "step": 82400 + }, + { + "entropy": 1.8505050733685493, + "epoch": 0.2554637895345404, + "grad_norm": 10.341068267822266, + "learning_rate": 5.0053060707266894e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.852343961596489, + "num_tokens": 99073004.0, + "step": 82410 + }, + { + "entropy": 1.918088473379612, + "epoch": 0.2554947886595901, + "grad_norm": 7.087062835693359, + "learning_rate": 5.005002411503397e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.836826567351818, + "num_tokens": 99084688.0, + "step": 82420 + }, + { + "entropy": 1.8801950231194495, + "epoch": 0.2555257877846398, + "grad_norm": 8.337331771850586, + "learning_rate": 5.004698807540101e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8451901748776436, + "num_tokens": 99097086.0, + "step": 82430 + }, + { + "entropy": 1.9792321264743804, + "epoch": 0.25555678690968947, + "grad_norm": 9.403802871704102, + "learning_rate": 5.004395258820048e-06, + "loss": 0.5651, + "mean_token_accuracy": 0.8320185244083405, + "num_tokens": 99108547.0, + "step": 82440 + }, + { + "entropy": 1.88391984552145, + "epoch": 0.2555877860347392, + "grad_norm": 7.921189308166504, + "learning_rate": 5.004091765326484e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8392195641994477, + "num_tokens": 99121168.0, + "step": 82450 + }, + { + "entropy": 1.936862115561962, + "epoch": 0.25561878515978886, + "grad_norm": 8.814833641052246, + "learning_rate": 5.003788327042666e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.842677928507328, + "num_tokens": 99133107.0, + "step": 82460 + }, + { + "entropy": 1.900815899670124, + "epoch": 0.25564978428483853, + "grad_norm": 8.44135570526123, + "learning_rate": 5.003484943951856e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8460040614008904, + "num_tokens": 99145316.0, + "step": 82470 + }, + { + "entropy": 1.9658162996172905, + "epoch": 0.25568078340988826, + "grad_norm": 9.297836303710938, + "learning_rate": 5.003181616037326e-06, + "loss": 0.5277, + "mean_token_accuracy": 0.842269380390644, + "num_tokens": 99157003.0, + "step": 82480 + }, + { + "entropy": 1.8745719879865645, + "epoch": 0.2557117825349379, + "grad_norm": 6.9280805587768555, + "learning_rate": 5.002878343282352e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8584736675024033, + "num_tokens": 99168906.0, + "step": 82490 + }, + { + "entropy": 1.8955281019210815, + "epoch": 0.25574278165998765, + "grad_norm": 4.146218299865723, + "learning_rate": 5.0025751256702195e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8466937229037285, + "num_tokens": 99181020.0, + "step": 82500 + }, + { + "entropy": 1.9218576028943062, + "epoch": 0.2557737807850373, + "grad_norm": 7.757521629333496, + "learning_rate": 5.0022719631842165e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8493802517652511, + "num_tokens": 99192352.0, + "step": 82510 + }, + { + "entropy": 1.9906231686472893, + "epoch": 0.25580477991008704, + "grad_norm": 8.75942611694336, + "learning_rate": 5.001968855807645e-06, + "loss": 0.5567, + "mean_token_accuracy": 0.8316405296325684, + "num_tokens": 99203591.0, + "step": 82520 + }, + { + "entropy": 1.9479998797178268, + "epoch": 0.2558357790351367, + "grad_norm": 8.903733253479004, + "learning_rate": 5.001665803523808e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8445803746581078, + "num_tokens": 99214411.0, + "step": 82530 + }, + { + "entropy": 1.920227263867855, + "epoch": 0.25586677816018644, + "grad_norm": 8.966920852661133, + "learning_rate": 5.001362806316021e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.8462096214294433, + "num_tokens": 99226223.0, + "step": 82540 + }, + { + "entropy": 1.914782066643238, + "epoch": 0.2558977772852361, + "grad_norm": 9.925724983215332, + "learning_rate": 5.001059864167602e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8446723312139511, + "num_tokens": 99238079.0, + "step": 82550 + }, + { + "entropy": 1.9496339425444602, + "epoch": 0.25592877641028583, + "grad_norm": 7.7270612716674805, + "learning_rate": 5.000756977061877e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8507271915674209, + "num_tokens": 99249265.0, + "step": 82560 + }, + { + "entropy": 1.9208646342158318, + "epoch": 0.2559597755353355, + "grad_norm": 7.940464019775391, + "learning_rate": 5.000454144982181e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8384231016039848, + "num_tokens": 99261128.0, + "step": 82570 + }, + { + "entropy": 1.9947969660162925, + "epoch": 0.2559907746603852, + "grad_norm": 8.898752212524414, + "learning_rate": 5.000151367911854e-06, + "loss": 0.5323, + "mean_token_accuracy": 0.8397686898708343, + "num_tokens": 99273022.0, + "step": 82580 + }, + { + "entropy": 2.0049765020608903, + "epoch": 0.2560217737854349, + "grad_norm": 8.215011596679688, + "learning_rate": 4.999848645834245e-06, + "loss": 0.5369, + "mean_token_accuracy": 0.8358174741268158, + "num_tokens": 99283993.0, + "step": 82590 + }, + { + "entropy": 1.9047711238265037, + "epoch": 0.2560527729104846, + "grad_norm": 7.696551322937012, + "learning_rate": 4.999545978732709e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8554992079734802, + "num_tokens": 99295756.0, + "step": 82600 + }, + { + "entropy": 1.7249569281935693, + "epoch": 0.2560837720355343, + "grad_norm": 3.8466684818267822, + "learning_rate": 4.999243366590607e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8684180244803429, + "num_tokens": 99310148.0, + "step": 82610 + }, + { + "entropy": 1.8993670761585235, + "epoch": 0.256114771160584, + "grad_norm": 4.150681972503662, + "learning_rate": 4.998940809391308e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8339141398668289, + "num_tokens": 99322579.0, + "step": 82620 + }, + { + "entropy": 1.8982570886611938, + "epoch": 0.2561457702856337, + "grad_norm": 7.722603797912598, + "learning_rate": 4.998638307118189e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8633927822113037, + "num_tokens": 99334404.0, + "step": 82630 + }, + { + "entropy": 1.8842903196811676, + "epoch": 0.2561767694106834, + "grad_norm": 8.307390213012695, + "learning_rate": 4.998335859754634e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8525113880634307, + "num_tokens": 99347056.0, + "step": 82640 + }, + { + "entropy": 1.9822999000549317, + "epoch": 0.25620776853573307, + "grad_norm": 3.5337090492248535, + "learning_rate": 4.998033467284031e-06, + "loss": 0.5451, + "mean_token_accuracy": 0.8361595958471298, + "num_tokens": 99359039.0, + "step": 82650 + }, + { + "entropy": 1.8225185006856919, + "epoch": 0.2562387676607828, + "grad_norm": 8.926518440246582, + "learning_rate": 4.997731129689778e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8412432596087456, + "num_tokens": 99373001.0, + "step": 82660 + }, + { + "entropy": 1.9116671606898308, + "epoch": 0.25626976678583246, + "grad_norm": 9.220154762268066, + "learning_rate": 4.997428846955282e-06, + "loss": 0.6017, + "mean_token_accuracy": 0.8289077386260033, + "num_tokens": 99385753.0, + "step": 82670 + }, + { + "entropy": 1.981253059208393, + "epoch": 0.2563007659108822, + "grad_norm": 9.764948844909668, + "learning_rate": 4.99712661906395e-06, + "loss": 0.5408, + "mean_token_accuracy": 0.8350262567400932, + "num_tokens": 99396793.0, + "step": 82680 + }, + { + "entropy": 1.922914719581604, + "epoch": 0.25633176503593186, + "grad_norm": 9.412257194519043, + "learning_rate": 4.996824445999205e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8538893148303032, + "num_tokens": 99408399.0, + "step": 82690 + }, + { + "entropy": 1.9097683414816857, + "epoch": 0.2563627641609816, + "grad_norm": 3.221794366836548, + "learning_rate": 4.996522327744468e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8507839366793633, + "num_tokens": 99420256.0, + "step": 82700 + }, + { + "entropy": 1.8777303710579871, + "epoch": 0.25639376328603125, + "grad_norm": 8.001666069030762, + "learning_rate": 4.996220264283173e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8601910769939423, + "num_tokens": 99433400.0, + "step": 82710 + }, + { + "entropy": 1.9291005671024322, + "epoch": 0.2564247624110809, + "grad_norm": 10.794572830200195, + "learning_rate": 4.99591825559876e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.8337120041251183, + "num_tokens": 99445875.0, + "step": 82720 + }, + { + "entropy": 1.8854432210326195, + "epoch": 0.25645576153613064, + "grad_norm": 3.7990822792053223, + "learning_rate": 4.995616301674676e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8595590248703957, + "num_tokens": 99458264.0, + "step": 82730 + }, + { + "entropy": 1.9592019245028496, + "epoch": 0.2564867606611803, + "grad_norm": 8.916022300720215, + "learning_rate": 4.995314402494373e-06, + "loss": 0.5263, + "mean_token_accuracy": 0.8408660888671875, + "num_tokens": 99469951.0, + "step": 82740 + }, + { + "entropy": 1.9431997925043105, + "epoch": 0.25651775978623004, + "grad_norm": 6.906679153442383, + "learning_rate": 4.99501255804131e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8524601250886917, + "num_tokens": 99482201.0, + "step": 82750 + }, + { + "entropy": 1.91104666441679, + "epoch": 0.2565487589112797, + "grad_norm": 7.423837184906006, + "learning_rate": 4.9947107682989585e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8510598838329315, + "num_tokens": 99494427.0, + "step": 82760 + }, + { + "entropy": 1.9561400279402732, + "epoch": 0.25657975803632943, + "grad_norm": 9.361892700195312, + "learning_rate": 4.994409033250789e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.8367945715785027, + "num_tokens": 99506187.0, + "step": 82770 + }, + { + "entropy": 1.9289663940668107, + "epoch": 0.2566107571613791, + "grad_norm": 8.348969459533691, + "learning_rate": 4.9941073528802855e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8491498351097106, + "num_tokens": 99518185.0, + "step": 82780 + }, + { + "entropy": 1.9189318656921386, + "epoch": 0.2566417562864288, + "grad_norm": 10.50049877166748, + "learning_rate": 4.993805727170934e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8476282939314842, + "num_tokens": 99531191.0, + "step": 82790 + }, + { + "entropy": 1.9094542279839515, + "epoch": 0.2566727554114785, + "grad_norm": 8.101419448852539, + "learning_rate": 4.993504156106232e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8485571637749671, + "num_tokens": 99542937.0, + "step": 82800 + }, + { + "entropy": 1.880283573269844, + "epoch": 0.2567037545365282, + "grad_norm": 7.880818843841553, + "learning_rate": 4.99320263966968e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8481839671730995, + "num_tokens": 99555383.0, + "step": 82810 + }, + { + "entropy": 1.8960697516798972, + "epoch": 0.2567347536615779, + "grad_norm": Infinity, + "learning_rate": 4.992901177844789e-06, + "loss": 0.419, + "mean_token_accuracy": 0.857147465646267, + "num_tokens": 99567485.0, + "step": 82820 + }, + { + "entropy": 1.992298111319542, + "epoch": 0.2567657527866276, + "grad_norm": 7.422923564910889, + "learning_rate": 4.992599770615074e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8521854832768441, + "num_tokens": 99578298.0, + "step": 82830 + }, + { + "entropy": 1.9098379969596864, + "epoch": 0.2567967519116773, + "grad_norm": 7.3638739585876465, + "learning_rate": 4.992298417964059e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8458767265081406, + "num_tokens": 99590734.0, + "step": 82840 + }, + { + "entropy": 1.8104907512664794, + "epoch": 0.256827751036727, + "grad_norm": 4.290040493011475, + "learning_rate": 4.991997119875274e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8509851738810539, + "num_tokens": 99603863.0, + "step": 82850 + }, + { + "entropy": 1.9496165543794632, + "epoch": 0.25685875016177667, + "grad_norm": 8.491985321044922, + "learning_rate": 4.991695876332256e-06, + "loss": 0.5497, + "mean_token_accuracy": 0.8299289435148239, + "num_tokens": 99614829.0, + "step": 82860 + }, + { + "entropy": 1.7824837416410446, + "epoch": 0.2568897492868264, + "grad_norm": 10.624833106994629, + "learning_rate": 4.991394687318549e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8593706652522087, + "num_tokens": 99627639.0, + "step": 82870 + }, + { + "entropy": 1.8935058623552323, + "epoch": 0.25692074841187607, + "grad_norm": 3.4996988773345947, + "learning_rate": 4.9910935528177034e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8467808067798615, + "num_tokens": 99639515.0, + "step": 82880 + }, + { + "entropy": 1.9544168338179588, + "epoch": 0.2569517475369258, + "grad_norm": 8.53878402709961, + "learning_rate": 4.990792472813278e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.846103484928608, + "num_tokens": 99651187.0, + "step": 82890 + }, + { + "entropy": 1.9207098990678788, + "epoch": 0.25698274666197546, + "grad_norm": 9.49964427947998, + "learning_rate": 4.990491447288838e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8317215725779533, + "num_tokens": 99662564.0, + "step": 82900 + }, + { + "entropy": 1.876941241323948, + "epoch": 0.2570137457870252, + "grad_norm": 7.434007167816162, + "learning_rate": 4.990190476227954e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8501386985182762, + "num_tokens": 99674973.0, + "step": 82910 + }, + { + "entropy": 1.9188931852579116, + "epoch": 0.25704474491207485, + "grad_norm": 7.2484331130981445, + "learning_rate": 4.989889559614206e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8477975860238075, + "num_tokens": 99686447.0, + "step": 82920 + }, + { + "entropy": 1.8817449413239955, + "epoch": 0.2570757440371246, + "grad_norm": 4.548129081726074, + "learning_rate": 4.98958869743118e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8410625770688057, + "num_tokens": 99698742.0, + "step": 82930 + }, + { + "entropy": 1.8077333301305771, + "epoch": 0.25710674316217424, + "grad_norm": 8.342117309570312, + "learning_rate": 4.989287889662468e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8545293301343918, + "num_tokens": 99712362.0, + "step": 82940 + }, + { + "entropy": 1.9139455810189248, + "epoch": 0.25713774228722397, + "grad_norm": 5.589305400848389, + "learning_rate": 4.988987136291668e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8507546544075012, + "num_tokens": 99724409.0, + "step": 82950 + }, + { + "entropy": 1.8849102705717087, + "epoch": 0.25716874141227364, + "grad_norm": 8.964681625366211, + "learning_rate": 4.98868643730239e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.8342619195580483, + "num_tokens": 99736094.0, + "step": 82960 + }, + { + "entropy": 1.9025137901306153, + "epoch": 0.2571997405373233, + "grad_norm": 4.793386936187744, + "learning_rate": 4.9883857926782435e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8469575837254524, + "num_tokens": 99748673.0, + "step": 82970 + }, + { + "entropy": 1.9453041523694992, + "epoch": 0.25723073966237303, + "grad_norm": 6.930491924285889, + "learning_rate": 4.988085202402852e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8533410027623176, + "num_tokens": 99760599.0, + "step": 82980 + }, + { + "entropy": 1.9440397009253503, + "epoch": 0.2572617387874227, + "grad_norm": 10.45510196685791, + "learning_rate": 4.987784666459842e-06, + "loss": 0.5538, + "mean_token_accuracy": 0.8280631914734841, + "num_tokens": 99771974.0, + "step": 82990 + }, + { + "entropy": 1.9070080369710922, + "epoch": 0.2572927379124724, + "grad_norm": 10.232966423034668, + "learning_rate": 4.987484184832846e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8449847355484963, + "num_tokens": 99784534.0, + "step": 83000 + }, + { + "entropy": 1.9375251710414887, + "epoch": 0.2573237370375221, + "grad_norm": 3.9058806896209717, + "learning_rate": 4.9871837575055064e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8462153524160385, + "num_tokens": 99796298.0, + "step": 83010 + }, + { + "entropy": 1.9144327610731124, + "epoch": 0.2573547361625718, + "grad_norm": 10.11092472076416, + "learning_rate": 4.9868833844614715e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8435170888900757, + "num_tokens": 99807740.0, + "step": 83020 + }, + { + "entropy": 1.8731989122927188, + "epoch": 0.2573857352876215, + "grad_norm": 7.537945747375488, + "learning_rate": 4.986583065684396e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8457079976797104, + "num_tokens": 99819888.0, + "step": 83030 + }, + { + "entropy": 1.9202357351779937, + "epoch": 0.2574167344126712, + "grad_norm": 8.85262393951416, + "learning_rate": 4.986282801157941e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.844878327846527, + "num_tokens": 99831444.0, + "step": 83040 + }, + { + "entropy": 1.8694255024194717, + "epoch": 0.2574477335377209, + "grad_norm": 5.111952304840088, + "learning_rate": 4.9859825908657735e-06, + "loss": 0.512, + "mean_token_accuracy": 0.835330268740654, + "num_tokens": 99844184.0, + "step": 83050 + }, + { + "entropy": 1.9209684014320374, + "epoch": 0.2574787326627706, + "grad_norm": 6.903238773345947, + "learning_rate": 4.985682434791573e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.845658715069294, + "num_tokens": 99856287.0, + "step": 83060 + }, + { + "entropy": 1.8327378258109093, + "epoch": 0.2575097317878203, + "grad_norm": 9.568243026733398, + "learning_rate": 4.9853823329190185e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8382641449570656, + "num_tokens": 99869468.0, + "step": 83070 + }, + { + "entropy": 1.9063044503331183, + "epoch": 0.25754073091287, + "grad_norm": 3.617058515548706, + "learning_rate": 4.985082285231801e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8415888145565986, + "num_tokens": 99881146.0, + "step": 83080 + }, + { + "entropy": 1.7789159893989563, + "epoch": 0.25757173003791967, + "grad_norm": 7.718278884887695, + "learning_rate": 4.9847822917136154e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8528930768370628, + "num_tokens": 99894684.0, + "step": 83090 + }, + { + "entropy": 1.946078921854496, + "epoch": 0.2576027291629694, + "grad_norm": 8.70456600189209, + "learning_rate": 4.984482352348164e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8418860971927643, + "num_tokens": 99905660.0, + "step": 83100 + }, + { + "entropy": 1.9768987134099008, + "epoch": 0.25763372828801906, + "grad_norm": 9.16089153289795, + "learning_rate": 4.9841824671191594e-06, + "loss": 0.5825, + "mean_token_accuracy": 0.8343336895108223, + "num_tokens": 99917120.0, + "step": 83110 + }, + { + "entropy": 1.889198412001133, + "epoch": 0.2576647274130688, + "grad_norm": 7.599856853485107, + "learning_rate": 4.983882636010317e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8526261046528816, + "num_tokens": 99928830.0, + "step": 83120 + }, + { + "entropy": 1.9562062606215478, + "epoch": 0.25769572653811845, + "grad_norm": 8.07655143737793, + "learning_rate": 4.983582859005359e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.831449817121029, + "num_tokens": 99940719.0, + "step": 83130 + }, + { + "entropy": 1.896943534910679, + "epoch": 0.2577267256631682, + "grad_norm": 8.602059364318848, + "learning_rate": 4.983283136088018e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8445844024419784, + "num_tokens": 99953117.0, + "step": 83140 + }, + { + "entropy": 1.9443399429321289, + "epoch": 0.25775772478821785, + "grad_norm": 7.0254693031311035, + "learning_rate": 4.982983467242029e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.8422896787524223, + "num_tokens": 99964325.0, + "step": 83150 + }, + { + "entropy": 1.9491709470748901, + "epoch": 0.25778872391326757, + "grad_norm": 11.82594108581543, + "learning_rate": 4.982683852451138e-06, + "loss": 0.5412, + "mean_token_accuracy": 0.8369784370064736, + "num_tokens": 99975197.0, + "step": 83160 + }, + { + "entropy": 1.8532451003789903, + "epoch": 0.25781972303831724, + "grad_norm": 3.9568448066711426, + "learning_rate": 4.982384291699096e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8396353736519814, + "num_tokens": 99987783.0, + "step": 83170 + }, + { + "entropy": 1.9464505165815353, + "epoch": 0.25785072216336696, + "grad_norm": 10.583782196044922, + "learning_rate": 4.982084784969659e-06, + "loss": 0.515, + "mean_token_accuracy": 0.8377991825342178, + "num_tokens": 100000020.0, + "step": 83180 + }, + { + "entropy": 1.9166669502854348, + "epoch": 0.25788172128841663, + "grad_norm": 9.20700740814209, + "learning_rate": 4.981785332246592e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8458949193358422, + "num_tokens": 100011762.0, + "step": 83190 + }, + { + "entropy": 1.9713429033756256, + "epoch": 0.25791272041346636, + "grad_norm": 9.420412063598633, + "learning_rate": 4.981485933513668e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8544860139489174, + "num_tokens": 100023022.0, + "step": 83200 + }, + { + "entropy": 1.9141755923628807, + "epoch": 0.257943719538516, + "grad_norm": 9.200664520263672, + "learning_rate": 4.981186588754664e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.8375491261482239, + "num_tokens": 100034493.0, + "step": 83210 + }, + { + "entropy": 1.8373278394341468, + "epoch": 0.2579747186635657, + "grad_norm": 3.6070780754089355, + "learning_rate": 4.980887297953366e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.853793577849865, + "num_tokens": 100047907.0, + "step": 83220 + }, + { + "entropy": 1.905658522248268, + "epoch": 0.2580057177886154, + "grad_norm": 33.18852996826172, + "learning_rate": 4.980588061093565e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.844824655354023, + "num_tokens": 100059902.0, + "step": 83230 + }, + { + "entropy": 1.9518269658088685, + "epoch": 0.2580367169136651, + "grad_norm": 7.96235990524292, + "learning_rate": 4.980288878159059e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8474616572260857, + "num_tokens": 100070453.0, + "step": 83240 + }, + { + "entropy": 1.971319329738617, + "epoch": 0.2580677160387148, + "grad_norm": 8.494059562683105, + "learning_rate": 4.9799897491336564e-06, + "loss": 0.5307, + "mean_token_accuracy": 0.8315307438373566, + "num_tokens": 100082070.0, + "step": 83250 + }, + { + "entropy": 1.9625032275915146, + "epoch": 0.2580987151637645, + "grad_norm": 7.6323466300964355, + "learning_rate": 4.979690674001167e-06, + "loss": 0.5205, + "mean_token_accuracy": 0.8371849060058594, + "num_tokens": 100093397.0, + "step": 83260 + }, + { + "entropy": 1.8805483281612396, + "epoch": 0.2581297142888142, + "grad_norm": 8.261248588562012, + "learning_rate": 4.979391652745411e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8561758771538734, + "num_tokens": 100105997.0, + "step": 83270 + }, + { + "entropy": 1.8767956405878068, + "epoch": 0.2581607134138639, + "grad_norm": 3.6510441303253174, + "learning_rate": 4.9790926853502125e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8472672030329704, + "num_tokens": 100117793.0, + "step": 83280 + }, + { + "entropy": 1.7951404377818108, + "epoch": 0.2581917125389136, + "grad_norm": 4.375608921051025, + "learning_rate": 4.978793771799407e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8583429366350174, + "num_tokens": 100130835.0, + "step": 83290 + }, + { + "entropy": 1.971182020008564, + "epoch": 0.25822271166396327, + "grad_norm": 10.078766822814941, + "learning_rate": 4.978494912076833e-06, + "loss": 0.5194, + "mean_token_accuracy": 0.8344864457845688, + "num_tokens": 100141909.0, + "step": 83300 + }, + { + "entropy": 1.9942039638757705, + "epoch": 0.258253710789013, + "grad_norm": 8.207865715026855, + "learning_rate": 4.978196106166336e-06, + "loss": 0.574, + "mean_token_accuracy": 0.828905712068081, + "num_tokens": 100152944.0, + "step": 83310 + }, + { + "entropy": 1.8606795385479926, + "epoch": 0.25828470991406266, + "grad_norm": 4.853648662567139, + "learning_rate": 4.97789735405177e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.849544158577919, + "num_tokens": 100166124.0, + "step": 83320 + }, + { + "entropy": 1.9509618058800697, + "epoch": 0.2583157090391124, + "grad_norm": 6.577378273010254, + "learning_rate": 4.977598655716994e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8365138009190559, + "num_tokens": 100177997.0, + "step": 83330 + }, + { + "entropy": 1.9413925766944886, + "epoch": 0.25834670816416205, + "grad_norm": 7.664201736450195, + "learning_rate": 4.977300011145877e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8374240696430206, + "num_tokens": 100189432.0, + "step": 83340 + }, + { + "entropy": 1.9520146340131759, + "epoch": 0.2583777072892118, + "grad_norm": 3.7987093925476074, + "learning_rate": 4.97700142032229e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8407413721084595, + "num_tokens": 100201276.0, + "step": 83350 + }, + { + "entropy": 1.9646510049700736, + "epoch": 0.25840870641426145, + "grad_norm": 8.533371925354004, + "learning_rate": 4.976702883230112e-06, + "loss": 0.5696, + "mean_token_accuracy": 0.8276545464992523, + "num_tokens": 100212137.0, + "step": 83360 + }, + { + "entropy": 1.8566021844744682, + "epoch": 0.25843970553931117, + "grad_norm": 9.078433990478516, + "learning_rate": 4.976404399853234e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8363482743501663, + "num_tokens": 100224796.0, + "step": 83370 + }, + { + "entropy": 1.9266142144799232, + "epoch": 0.25847070466436084, + "grad_norm": 8.397682189941406, + "learning_rate": 4.9761059701755475e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8483969137072563, + "num_tokens": 100236575.0, + "step": 83380 + }, + { + "entropy": 1.9420603141188622, + "epoch": 0.25850170378941056, + "grad_norm": 8.181031227111816, + "learning_rate": 4.975807594180953e-06, + "loss": 0.5213, + "mean_token_accuracy": 0.844786761701107, + "num_tokens": 100248133.0, + "step": 83390 + }, + { + "entropy": 1.7950069807469844, + "epoch": 0.25853270291446023, + "grad_norm": 7.972817420959473, + "learning_rate": 4.975509271853358e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8578445136547088, + "num_tokens": 100261552.0, + "step": 83400 + }, + { + "entropy": 1.9505249321460725, + "epoch": 0.25856370203950996, + "grad_norm": 3.8303351402282715, + "learning_rate": 4.975211003176676e-06, + "loss": 0.5553, + "mean_token_accuracy": 0.8300097927451133, + "num_tokens": 100272407.0, + "step": 83410 + }, + { + "entropy": 1.9518933594226837, + "epoch": 0.2585947011645596, + "grad_norm": 8.778966903686523, + "learning_rate": 4.97491278813483e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8443615302443505, + "num_tokens": 100284483.0, + "step": 83420 + }, + { + "entropy": 1.825792530924082, + "epoch": 0.25862570028960935, + "grad_norm": 8.191307067871094, + "learning_rate": 4.974614626711745e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8553913697600365, + "num_tokens": 100298331.0, + "step": 83430 + }, + { + "entropy": 1.9290891095995903, + "epoch": 0.258656699414659, + "grad_norm": 8.639996528625488, + "learning_rate": 4.9743165188913564e-06, + "loss": 0.5654, + "mean_token_accuracy": 0.8340361103415489, + "num_tokens": 100309563.0, + "step": 83440 + }, + { + "entropy": 1.8901295900344848, + "epoch": 0.25868769853970874, + "grad_norm": 4.191493988037109, + "learning_rate": 4.9740184646576036e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8442709073424339, + "num_tokens": 100321894.0, + "step": 83450 + }, + { + "entropy": 1.8269127234816551, + "epoch": 0.2587186976647584, + "grad_norm": 4.574268817901611, + "learning_rate": 4.9737204639944376e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8548773661255836, + "num_tokens": 100334501.0, + "step": 83460 + }, + { + "entropy": 1.8532571867108345, + "epoch": 0.2587496967898081, + "grad_norm": 3.918750762939453, + "learning_rate": 4.973422516885809e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.846004131436348, + "num_tokens": 100346656.0, + "step": 83470 + }, + { + "entropy": 1.8245261393487453, + "epoch": 0.2587806959148578, + "grad_norm": 3.927574872970581, + "learning_rate": 4.973124623315682e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8500414654612541, + "num_tokens": 100359699.0, + "step": 83480 + }, + { + "entropy": 1.77280533015728, + "epoch": 0.2588116950399075, + "grad_norm": 4.003964424133301, + "learning_rate": 4.972826783268022e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8541926577687263, + "num_tokens": 100373537.0, + "step": 83490 + }, + { + "entropy": 1.894656127691269, + "epoch": 0.2588426941649572, + "grad_norm": 8.42563247680664, + "learning_rate": 4.972528996726807e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8406502559781075, + "num_tokens": 100385609.0, + "step": 83500 + }, + { + "entropy": 1.9357464522123338, + "epoch": 0.25887369329000687, + "grad_norm": 9.4671049118042, + "learning_rate": 4.972231263676015e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8405605420470238, + "num_tokens": 100397032.0, + "step": 83510 + }, + { + "entropy": 1.9452675521373748, + "epoch": 0.2589046924150566, + "grad_norm": 12.05567455291748, + "learning_rate": 4.971933584099637e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.832793453335762, + "num_tokens": 100408610.0, + "step": 83520 + }, + { + "entropy": 1.9751826629042626, + "epoch": 0.25893569154010626, + "grad_norm": 8.384647369384766, + "learning_rate": 4.971635957981665e-06, + "loss": 0.513, + "mean_token_accuracy": 0.8382089316844941, + "num_tokens": 100420251.0, + "step": 83530 + }, + { + "entropy": 1.911265040934086, + "epoch": 0.258966690665156, + "grad_norm": 8.947986602783203, + "learning_rate": 4.971338385306102e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.853412701189518, + "num_tokens": 100432233.0, + "step": 83540 + }, + { + "entropy": 1.9375739723443985, + "epoch": 0.25899768979020565, + "grad_norm": 7.377249240875244, + "learning_rate": 4.9710408660569555e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8441775634884834, + "num_tokens": 100443781.0, + "step": 83550 + }, + { + "entropy": 1.9460734203457832, + "epoch": 0.2590286889152554, + "grad_norm": 8.327649116516113, + "learning_rate": 4.970743400218241e-06, + "loss": 0.5368, + "mean_token_accuracy": 0.8340301439166069, + "num_tokens": 100454515.0, + "step": 83560 + }, + { + "entropy": 1.9006448999047278, + "epoch": 0.25905968804030505, + "grad_norm": 4.181915760040283, + "learning_rate": 4.9704459877739805e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8439620837569237, + "num_tokens": 100466532.0, + "step": 83570 + }, + { + "entropy": 2.0110969811677935, + "epoch": 0.25909068716535477, + "grad_norm": 7.133867263793945, + "learning_rate": 4.9701486287082e-06, + "loss": 0.5219, + "mean_token_accuracy": 0.84031240940094, + "num_tokens": 100477071.0, + "step": 83580 + }, + { + "entropy": 1.8151050120592118, + "epoch": 0.25912168629040444, + "grad_norm": 4.122489929199219, + "learning_rate": 4.9698513230049375e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8427058979868889, + "num_tokens": 100490782.0, + "step": 83590 + }, + { + "entropy": 1.8606194391846658, + "epoch": 0.25915268541545416, + "grad_norm": 3.6981208324432373, + "learning_rate": 4.969554070648234e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8553460776805878, + "num_tokens": 100503176.0, + "step": 83600 + }, + { + "entropy": 1.9358109071850778, + "epoch": 0.25918368454050383, + "grad_norm": 4.142949104309082, + "learning_rate": 4.9692568716221355e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.8340194925665856, + "num_tokens": 100514798.0, + "step": 83610 + }, + { + "entropy": 1.8827938050031663, + "epoch": 0.25921468366555356, + "grad_norm": 8.898327827453613, + "learning_rate": 4.968959725910699e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8450506746768951, + "num_tokens": 100526985.0, + "step": 83620 + }, + { + "entropy": 1.9060676455497743, + "epoch": 0.2592456827906032, + "grad_norm": 9.307757377624512, + "learning_rate": 4.968662633497986e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8441753014922142, + "num_tokens": 100539529.0, + "step": 83630 + }, + { + "entropy": 1.8917318254709243, + "epoch": 0.25927668191565295, + "grad_norm": 7.30048942565918, + "learning_rate": 4.968365594368065e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.849764208495617, + "num_tokens": 100551756.0, + "step": 83640 + }, + { + "entropy": 1.8924952074885368, + "epoch": 0.2593076810407026, + "grad_norm": 4.036615371704102, + "learning_rate": 4.968068608505009e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8556442365050316, + "num_tokens": 100564203.0, + "step": 83650 + }, + { + "entropy": 1.8785409778356552, + "epoch": 0.25933868016575234, + "grad_norm": 4.065882682800293, + "learning_rate": 4.967771675892903e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8419948130846023, + "num_tokens": 100575948.0, + "step": 83660 + }, + { + "entropy": 1.92302625477314, + "epoch": 0.259369679290802, + "grad_norm": 5.085048675537109, + "learning_rate": 4.967474796515832e-06, + "loss": 0.54, + "mean_token_accuracy": 0.8281594708561897, + "num_tokens": 100588049.0, + "step": 83670 + }, + { + "entropy": 1.8481286302208901, + "epoch": 0.25940067841585174, + "grad_norm": 10.417789459228516, + "learning_rate": 4.9671779703578934e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.848712483048439, + "num_tokens": 100600632.0, + "step": 83680 + }, + { + "entropy": 1.8013351663947106, + "epoch": 0.2594316775409014, + "grad_norm": 8.973665237426758, + "learning_rate": 4.966881197403189e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.83788081407547, + "num_tokens": 100613842.0, + "step": 83690 + }, + { + "entropy": 1.8483421131968498, + "epoch": 0.25946267666595113, + "grad_norm": 7.858611583709717, + "learning_rate": 4.966584477635825e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8433775931596756, + "num_tokens": 100626204.0, + "step": 83700 + }, + { + "entropy": 1.8308247201144696, + "epoch": 0.2594936757910008, + "grad_norm": 8.385130882263184, + "learning_rate": 4.966287811039917e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8558708354830742, + "num_tokens": 100639013.0, + "step": 83710 + }, + { + "entropy": 1.9155432254076004, + "epoch": 0.25952467491605047, + "grad_norm": 9.558300971984863, + "learning_rate": 4.965991197599587e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8412322223186492, + "num_tokens": 100649836.0, + "step": 83720 + }, + { + "entropy": 1.8477103784680367, + "epoch": 0.2595556740411002, + "grad_norm": 3.852271556854248, + "learning_rate": 4.965694637298964e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.848893304169178, + "num_tokens": 100662993.0, + "step": 83730 + }, + { + "entropy": 1.9299596205353737, + "epoch": 0.25958667316614986, + "grad_norm": 8.642110824584961, + "learning_rate": 4.9653981301221825e-06, + "loss": 0.5333, + "mean_token_accuracy": 0.8376413077116013, + "num_tokens": 100674572.0, + "step": 83740 + }, + { + "entropy": 1.8460013434290885, + "epoch": 0.2596176722911996, + "grad_norm": 10.720597267150879, + "learning_rate": 4.9651016760533816e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8525181710720062, + "num_tokens": 100687154.0, + "step": 83750 + }, + { + "entropy": 1.8346559152007103, + "epoch": 0.25964867141624925, + "grad_norm": 7.259960651397705, + "learning_rate": 4.964805275076713e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8434622973203659, + "num_tokens": 100699665.0, + "step": 83760 + }, + { + "entropy": 1.9489260196685791, + "epoch": 0.259679670541299, + "grad_norm": 8.808201789855957, + "learning_rate": 4.96450892717633e-06, + "loss": 0.5478, + "mean_token_accuracy": 0.8384017795324326, + "num_tokens": 100710924.0, + "step": 83770 + }, + { + "entropy": 1.771421131491661, + "epoch": 0.25971066966634865, + "grad_norm": 7.782841682434082, + "learning_rate": 4.964212632336392e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8582013815641403, + "num_tokens": 100724256.0, + "step": 83780 + }, + { + "entropy": 1.8205000385642052, + "epoch": 0.2597416687913984, + "grad_norm": 7.884888648986816, + "learning_rate": 4.963916390541071e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8426259756088257, + "num_tokens": 100737066.0, + "step": 83790 + }, + { + "entropy": 1.9249374747276307, + "epoch": 0.25977266791644804, + "grad_norm": 8.629660606384277, + "learning_rate": 4.963620201774537e-06, + "loss": 0.5384, + "mean_token_accuracy": 0.8426753237843514, + "num_tokens": 100748371.0, + "step": 83800 + }, + { + "entropy": 1.869993396103382, + "epoch": 0.25980366704149777, + "grad_norm": 7.948463439941406, + "learning_rate": 4.963324066020974e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8404388725757599, + "num_tokens": 100760062.0, + "step": 83810 + }, + { + "entropy": 1.8810945719480514, + "epoch": 0.25983466616654743, + "grad_norm": 8.10258960723877, + "learning_rate": 4.963027983264569e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.85190070271492, + "num_tokens": 100771799.0, + "step": 83820 + }, + { + "entropy": 1.932461032271385, + "epoch": 0.25986566529159716, + "grad_norm": 10.165177345275879, + "learning_rate": 4.9627319534895166e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.8437979429960251, + "num_tokens": 100783029.0, + "step": 83830 + }, + { + "entropy": 1.8803840085864068, + "epoch": 0.2598966644166468, + "grad_norm": 3.2739675045013428, + "learning_rate": 4.962435976680019e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.853711499273777, + "num_tokens": 100794075.0, + "step": 83840 + }, + { + "entropy": 1.879416285455227, + "epoch": 0.25992766354169655, + "grad_norm": 8.004817008972168, + "learning_rate": 4.9621400528202814e-06, + "loss": 0.5403, + "mean_token_accuracy": 0.829861244559288, + "num_tokens": 100805286.0, + "step": 83850 + }, + { + "entropy": 1.8929785311222076, + "epoch": 0.2599586626667462, + "grad_norm": 8.447615623474121, + "learning_rate": 4.96184418189452e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.8426691144704819, + "num_tokens": 100816771.0, + "step": 83860 + }, + { + "entropy": 1.8302429109811782, + "epoch": 0.25998966179179595, + "grad_norm": 6.880725860595703, + "learning_rate": 4.9615483638869545e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8440744116902351, + "num_tokens": 100829800.0, + "step": 83870 + }, + { + "entropy": 1.947381141781807, + "epoch": 0.2600206609168456, + "grad_norm": 8.487883567810059, + "learning_rate": 4.961252598781812e-06, + "loss": 0.5444, + "mean_token_accuracy": 0.8282511562108994, + "num_tokens": 100840370.0, + "step": 83880 + }, + { + "entropy": 1.8844257071614265, + "epoch": 0.26005166004189534, + "grad_norm": 5.970224380493164, + "learning_rate": 4.960956886563326e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8489205956459045, + "num_tokens": 100852053.0, + "step": 83890 + }, + { + "entropy": 1.783471368253231, + "epoch": 0.260082659166945, + "grad_norm": 9.093459129333496, + "learning_rate": 4.960661227215739e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.858287438750267, + "num_tokens": 100865934.0, + "step": 83900 + }, + { + "entropy": 1.9448846295475959, + "epoch": 0.26011365829199473, + "grad_norm": 4.641866683959961, + "learning_rate": 4.960365620723297e-06, + "loss": 0.5616, + "mean_token_accuracy": 0.8277464538812638, + "num_tokens": 100876883.0, + "step": 83910 + }, + { + "entropy": 1.8289760783314706, + "epoch": 0.2601446574170444, + "grad_norm": 4.197445392608643, + "learning_rate": 4.960070067070255e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8453979283571244, + "num_tokens": 100889433.0, + "step": 83920 + }, + { + "entropy": 1.7806072607636452, + "epoch": 0.2601756565420941, + "grad_norm": 3.8781611919403076, + "learning_rate": 4.95977456624087e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.85598254352808, + "num_tokens": 100902238.0, + "step": 83930 + }, + { + "entropy": 1.8489570021629333, + "epoch": 0.2602066556671438, + "grad_norm": 4.011428356170654, + "learning_rate": 4.95947911821941e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8497495472431182, + "num_tokens": 100914487.0, + "step": 83940 + }, + { + "entropy": 1.8972808972001076, + "epoch": 0.2602376547921935, + "grad_norm": 7.408459663391113, + "learning_rate": 4.959183722990151e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8526468485593796, + "num_tokens": 100925789.0, + "step": 83950 + }, + { + "entropy": 1.8424657888710498, + "epoch": 0.2602686539172432, + "grad_norm": 8.171313285827637, + "learning_rate": 4.9588883805373686e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8596995994448662, + "num_tokens": 100937994.0, + "step": 83960 + }, + { + "entropy": 1.8914167776703834, + "epoch": 0.26029965304229286, + "grad_norm": 7.714399814605713, + "learning_rate": 4.958593090845352e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8574990868568421, + "num_tokens": 100949553.0, + "step": 83970 + }, + { + "entropy": 1.8375475853681564, + "epoch": 0.2603306521673426, + "grad_norm": 9.123080253601074, + "learning_rate": 4.958297853898395e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8516534611582756, + "num_tokens": 100962264.0, + "step": 83980 + }, + { + "entropy": 1.9303360790014268, + "epoch": 0.26036165129239225, + "grad_norm": 9.462535858154297, + "learning_rate": 4.958002669680794e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8448026522994041, + "num_tokens": 100972790.0, + "step": 83990 + }, + { + "entropy": 1.9033367425203322, + "epoch": 0.260392650417442, + "grad_norm": 6.338131427764893, + "learning_rate": 4.957707538176859e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.837484510242939, + "num_tokens": 100984286.0, + "step": 84000 + }, + { + "entropy": 1.9744462817907333, + "epoch": 0.26042364954249164, + "grad_norm": 8.549060821533203, + "learning_rate": 4.9574124593708985e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.8344295799732209, + "num_tokens": 100995371.0, + "step": 84010 + }, + { + "entropy": 1.7369410261511802, + "epoch": 0.26045464866754137, + "grad_norm": 8.24365520477295, + "learning_rate": 4.957117433247236e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8557371169328689, + "num_tokens": 101009845.0, + "step": 84020 + }, + { + "entropy": 1.9079995438456536, + "epoch": 0.26048564779259104, + "grad_norm": 9.164064407348633, + "learning_rate": 4.956822459790193e-06, + "loss": 0.5175, + "mean_token_accuracy": 0.8357024490833282, + "num_tokens": 101021686.0, + "step": 84030 + }, + { + "entropy": 1.8953372284770011, + "epoch": 0.26051664691764076, + "grad_norm": 9.431413650512695, + "learning_rate": 4.956527538984104e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8483018189668655, + "num_tokens": 101033454.0, + "step": 84040 + }, + { + "entropy": 1.8954351365566253, + "epoch": 0.26054764604269043, + "grad_norm": 3.8915398120880127, + "learning_rate": 4.956232670813308e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8407752260565757, + "num_tokens": 101044798.0, + "step": 84050 + }, + { + "entropy": 1.8448843389749527, + "epoch": 0.26057864516774015, + "grad_norm": 4.189465045928955, + "learning_rate": 4.955937855262149e-06, + "loss": 0.436, + "mean_token_accuracy": 0.85302524715662, + "num_tokens": 101057618.0, + "step": 84060 + }, + { + "entropy": 1.832655143737793, + "epoch": 0.2606096442927898, + "grad_norm": 7.561342239379883, + "learning_rate": 4.955643092314979e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8396886929869651, + "num_tokens": 101069666.0, + "step": 84070 + }, + { + "entropy": 1.898691761493683, + "epoch": 0.26064064341783955, + "grad_norm": 8.714390754699707, + "learning_rate": 4.955348381956157e-06, + "loss": 0.5377, + "mean_token_accuracy": 0.827970777451992, + "num_tokens": 101081285.0, + "step": 84080 + }, + { + "entropy": 1.7665072850883008, + "epoch": 0.2606716425428892, + "grad_norm": 8.24121379852295, + "learning_rate": 4.9550537241700474e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8553405046463013, + "num_tokens": 101094537.0, + "step": 84090 + }, + { + "entropy": 1.8713910579681396, + "epoch": 0.26070264166793894, + "grad_norm": 9.43482780456543, + "learning_rate": 4.954759118941024e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.845010656118393, + "num_tokens": 101106520.0, + "step": 84100 + }, + { + "entropy": 1.8151496931910516, + "epoch": 0.2607336407929886, + "grad_norm": 9.085859298706055, + "learning_rate": 4.954464566253459e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8474389865994454, + "num_tokens": 101119130.0, + "step": 84110 + }, + { + "entropy": 1.9014802396297454, + "epoch": 0.26076463991803833, + "grad_norm": 3.9436869621276855, + "learning_rate": 4.95417006609174e-06, + "loss": 0.5645, + "mean_token_accuracy": 0.8273421004414558, + "num_tokens": 101130816.0, + "step": 84120 + }, + { + "entropy": 1.9717437848448753, + "epoch": 0.260795639043088, + "grad_norm": 8.339731216430664, + "learning_rate": 4.953875618440259e-06, + "loss": 0.5301, + "mean_token_accuracy": 0.8363761708140374, + "num_tokens": 101142058.0, + "step": 84130 + }, + { + "entropy": 1.8874025538563728, + "epoch": 0.2608266381681377, + "grad_norm": 5.998051643371582, + "learning_rate": 4.953581223283413e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8477589651942253, + "num_tokens": 101153159.0, + "step": 84140 + }, + { + "entropy": 1.7967896267771721, + "epoch": 0.2608576372931874, + "grad_norm": 8.227131843566895, + "learning_rate": 4.953286880605603e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8455402344465256, + "num_tokens": 101165998.0, + "step": 84150 + }, + { + "entropy": 1.8968286886811256, + "epoch": 0.2608886364182371, + "grad_norm": 7.774280071258545, + "learning_rate": 4.952992590391242e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8559000626206398, + "num_tokens": 101177800.0, + "step": 84160 + }, + { + "entropy": 1.7744996875524521, + "epoch": 0.2609196355432868, + "grad_norm": 8.268373489379883, + "learning_rate": 4.952698352624746e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8595765858888627, + "num_tokens": 101190665.0, + "step": 84170 + }, + { + "entropy": 1.8480232998728752, + "epoch": 0.2609506346683365, + "grad_norm": 7.481622219085693, + "learning_rate": 4.952404167290538e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.849323433637619, + "num_tokens": 101203178.0, + "step": 84180 + }, + { + "entropy": 1.8875873282551765, + "epoch": 0.2609816337933862, + "grad_norm": 4.315346717834473, + "learning_rate": 4.952110034373047e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8455449759960174, + "num_tokens": 101214641.0, + "step": 84190 + }, + { + "entropy": 1.7977977305650712, + "epoch": 0.26101263291843585, + "grad_norm": 7.563590049743652, + "learning_rate": 4.9518159538567115e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8515077039599419, + "num_tokens": 101227190.0, + "step": 84200 + }, + { + "entropy": 1.8671275533735752, + "epoch": 0.2610436320434856, + "grad_norm": 8.546191215515137, + "learning_rate": 4.951521925725971e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8378291219472885, + "num_tokens": 101239940.0, + "step": 84210 + }, + { + "entropy": 1.9210672780871392, + "epoch": 0.26107463116853524, + "grad_norm": 8.786494255065918, + "learning_rate": 4.951227949965277e-06, + "loss": 0.5202, + "mean_token_accuracy": 0.8403745800256729, + "num_tokens": 101251411.0, + "step": 84220 + }, + { + "entropy": 1.870493806898594, + "epoch": 0.26110563029358497, + "grad_norm": 7.429269313812256, + "learning_rate": 4.9509340265590865e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8402852207422257, + "num_tokens": 101263421.0, + "step": 84230 + }, + { + "entropy": 1.8900667324662208, + "epoch": 0.26113662941863464, + "grad_norm": 9.321823120117188, + "learning_rate": 4.950640155491857e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8420423567295074, + "num_tokens": 101275836.0, + "step": 84240 + }, + { + "entropy": 1.6973878130316735, + "epoch": 0.26116762854368436, + "grad_norm": 7.488030910491943, + "learning_rate": 4.95034633674806e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.868793734908104, + "num_tokens": 101289722.0, + "step": 84250 + }, + { + "entropy": 1.8956893503665924, + "epoch": 0.26119862766873403, + "grad_norm": 8.01175594329834, + "learning_rate": 4.950052570312171e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.8392576932907104, + "num_tokens": 101301555.0, + "step": 84260 + }, + { + "entropy": 1.9166808992624282, + "epoch": 0.26122962679378375, + "grad_norm": 6.5919928550720215, + "learning_rate": 4.94975885616867e-06, + "loss": 0.5282, + "mean_token_accuracy": 0.8426938772201538, + "num_tokens": 101312392.0, + "step": 84270 + }, + { + "entropy": 1.9476675420999527, + "epoch": 0.2612606259188334, + "grad_norm": 9.677026748657227, + "learning_rate": 4.9494651943020455e-06, + "loss": 0.5275, + "mean_token_accuracy": 0.8333592966198922, + "num_tokens": 101323006.0, + "step": 84280 + }, + { + "entropy": 1.818124982714653, + "epoch": 0.26129162504388315, + "grad_norm": 4.290999889373779, + "learning_rate": 4.94917158469679e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8480872571468353, + "num_tokens": 101335938.0, + "step": 84290 + }, + { + "entropy": 1.8131288312375546, + "epoch": 0.2613226241689328, + "grad_norm": 8.009013175964355, + "learning_rate": 4.948878027337407e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.839995177090168, + "num_tokens": 101349763.0, + "step": 84300 + }, + { + "entropy": 1.8972848400473594, + "epoch": 0.26135362329398254, + "grad_norm": 8.380783081054688, + "learning_rate": 4.948584522208402e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8484173402190208, + "num_tokens": 101361738.0, + "step": 84310 + }, + { + "entropy": 1.91866305321455, + "epoch": 0.2613846224190322, + "grad_norm": 5.500089645385742, + "learning_rate": 4.948291069294289e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.8474255546927452, + "num_tokens": 101373866.0, + "step": 84320 + }, + { + "entropy": 1.777488799393177, + "epoch": 0.26141562154408193, + "grad_norm": 8.77873706817627, + "learning_rate": 4.947997668579589e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8580469012260437, + "num_tokens": 101387715.0, + "step": 84330 + }, + { + "entropy": 1.9092938348650932, + "epoch": 0.2614466206691316, + "grad_norm": 8.913098335266113, + "learning_rate": 4.947704320048827e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.8450995326042176, + "num_tokens": 101399203.0, + "step": 84340 + }, + { + "entropy": 1.8070560455322267, + "epoch": 0.2614776197941813, + "grad_norm": 8.269847869873047, + "learning_rate": 4.947411023686535e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8473980888724327, + "num_tokens": 101411534.0, + "step": 84350 + }, + { + "entropy": 1.8068130150437356, + "epoch": 0.261508618919231, + "grad_norm": 4.175093173980713, + "learning_rate": 4.9471177794772555e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8587175875902175, + "num_tokens": 101425216.0, + "step": 84360 + }, + { + "entropy": 1.906698650121689, + "epoch": 0.2615396180442807, + "grad_norm": 9.216361999511719, + "learning_rate": 4.946824587405532e-06, + "loss": 0.5222, + "mean_token_accuracy": 0.8331103935837746, + "num_tokens": 101436502.0, + "step": 84370 + }, + { + "entropy": 1.7797369614243508, + "epoch": 0.2615706171693304, + "grad_norm": 2.6891257762908936, + "learning_rate": 4.946531447455915e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8665702223777771, + "num_tokens": 101449912.0, + "step": 84380 + }, + { + "entropy": 1.8645511791110039, + "epoch": 0.2616016162943801, + "grad_norm": 4.295123100280762, + "learning_rate": 4.946238359612967e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8417528823018074, + "num_tokens": 101463078.0, + "step": 84390 + }, + { + "entropy": 1.8012978717684747, + "epoch": 0.2616326154194298, + "grad_norm": 7.812502861022949, + "learning_rate": 4.945945323861249e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8614039555191993, + "num_tokens": 101475914.0, + "step": 84400 + }, + { + "entropy": 1.886396862566471, + "epoch": 0.2616636145444795, + "grad_norm": 7.294875144958496, + "learning_rate": 4.945652340185336e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8441071733832359, + "num_tokens": 101487890.0, + "step": 84410 + }, + { + "entropy": 1.7817871391773223, + "epoch": 0.2616946136695292, + "grad_norm": 10.357475280761719, + "learning_rate": 4.9453594085698036e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8648449271917343, + "num_tokens": 101501358.0, + "step": 84420 + }, + { + "entropy": 1.9272992476820945, + "epoch": 0.2617256127945789, + "grad_norm": 9.080680847167969, + "learning_rate": 4.9450665289992355e-06, + "loss": 0.5201, + "mean_token_accuracy": 0.8325537592172623, + "num_tokens": 101513086.0, + "step": 84430 + }, + { + "entropy": 1.9224831491708756, + "epoch": 0.26175661191962857, + "grad_norm": 8.535758972167969, + "learning_rate": 4.9447737014582235e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8473838672041893, + "num_tokens": 101524876.0, + "step": 84440 + }, + { + "entropy": 1.8453692942857742, + "epoch": 0.26178761104467824, + "grad_norm": 4.549447059631348, + "learning_rate": 4.9444809259313635e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8549572035670281, + "num_tokens": 101537098.0, + "step": 84450 + }, + { + "entropy": 1.9371949970722198, + "epoch": 0.26181861016972796, + "grad_norm": 8.131068229675293, + "learning_rate": 4.944188202403261e-06, + "loss": 0.5529, + "mean_token_accuracy": 0.8304179921746254, + "num_tokens": 101548246.0, + "step": 84460 + }, + { + "entropy": 1.8725028142333031, + "epoch": 0.26184960929477763, + "grad_norm": 7.878946781158447, + "learning_rate": 4.943895530858521e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.840125747025013, + "num_tokens": 101560349.0, + "step": 84470 + }, + { + "entropy": 1.86340638846159, + "epoch": 0.26188060841982735, + "grad_norm": 7.6368818283081055, + "learning_rate": 4.943602911281764e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8389788880944252, + "num_tokens": 101572940.0, + "step": 84480 + }, + { + "entropy": 1.9475991562008859, + "epoch": 0.261911607544877, + "grad_norm": 8.88925552368164, + "learning_rate": 4.943310343657611e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8385830983519554, + "num_tokens": 101584643.0, + "step": 84490 + }, + { + "entropy": 1.9739305421710014, + "epoch": 0.26194260666992675, + "grad_norm": 6.695905685424805, + "learning_rate": 4.943017827970689e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8335414931178093, + "num_tokens": 101595908.0, + "step": 84500 + }, + { + "entropy": 1.9505710929632187, + "epoch": 0.2619736057949764, + "grad_norm": 8.216191291809082, + "learning_rate": 4.942725364205635e-06, + "loss": 0.5215, + "mean_token_accuracy": 0.8390222921967506, + "num_tokens": 101606676.0, + "step": 84510 + }, + { + "entropy": 1.932286986708641, + "epoch": 0.26200460492002614, + "grad_norm": 7.449299335479736, + "learning_rate": 4.942432952347092e-06, + "loss": 0.5658, + "mean_token_accuracy": 0.8347490966320038, + "num_tokens": 101618109.0, + "step": 84520 + }, + { + "entropy": 1.9106614410877227, + "epoch": 0.2620356040450758, + "grad_norm": 9.041545867919922, + "learning_rate": 4.942140592379704e-06, + "loss": 0.5185, + "mean_token_accuracy": 0.8461478322744369, + "num_tokens": 101629907.0, + "step": 84530 + }, + { + "entropy": 1.8061072051525116, + "epoch": 0.26206660317012553, + "grad_norm": 9.392251968383789, + "learning_rate": 4.941848284288129e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8683505341410637, + "num_tokens": 101642763.0, + "step": 84540 + }, + { + "entropy": 1.8147373288869857, + "epoch": 0.2620976022951752, + "grad_norm": 3.8025999069213867, + "learning_rate": 4.941556028057024e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8555989354848862, + "num_tokens": 101655606.0, + "step": 84550 + }, + { + "entropy": 1.9344567239284516, + "epoch": 0.2621286014202249, + "grad_norm": 9.05136489868164, + "learning_rate": 4.9412638236710595e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8368930295109749, + "num_tokens": 101667711.0, + "step": 84560 + }, + { + "entropy": 1.882543933391571, + "epoch": 0.2621596005452746, + "grad_norm": 7.798995018005371, + "learning_rate": 4.940971671114905e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.852934005856514, + "num_tokens": 101678953.0, + "step": 84570 + }, + { + "entropy": 1.9046371474862098, + "epoch": 0.2621905996703243, + "grad_norm": 8.572905540466309, + "learning_rate": 4.9406795703732436e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8400033757090568, + "num_tokens": 101690994.0, + "step": 84580 + }, + { + "entropy": 1.955969288945198, + "epoch": 0.262221598795374, + "grad_norm": 6.911218166351318, + "learning_rate": 4.94038752143076e-06, + "loss": 0.5409, + "mean_token_accuracy": 0.8287007570266723, + "num_tokens": 101702443.0, + "step": 84590 + }, + { + "entropy": 1.8909879684448243, + "epoch": 0.2622525979204237, + "grad_norm": 9.11408519744873, + "learning_rate": 4.940095524272145e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8456226110458374, + "num_tokens": 101714218.0, + "step": 84600 + }, + { + "entropy": 1.9254190102219582, + "epoch": 0.2622835970454734, + "grad_norm": 6.650926113128662, + "learning_rate": 4.939803578882099e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8574258908629417, + "num_tokens": 101725211.0, + "step": 84610 + }, + { + "entropy": 1.9131127685308456, + "epoch": 0.2623145961705231, + "grad_norm": 8.146158218383789, + "learning_rate": 4.939511685245327e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8508924171328545, + "num_tokens": 101736251.0, + "step": 84620 + }, + { + "entropy": 1.9253426373004914, + "epoch": 0.2623455952955728, + "grad_norm": 6.606905937194824, + "learning_rate": 4.939219843346538e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8473613321781158, + "num_tokens": 101748009.0, + "step": 84630 + }, + { + "entropy": 1.9077803820371628, + "epoch": 0.2623765944206225, + "grad_norm": 9.684782981872559, + "learning_rate": 4.938928053170453e-06, + "loss": 0.5242, + "mean_token_accuracy": 0.8346043735742569, + "num_tokens": 101759254.0, + "step": 84640 + }, + { + "entropy": 1.9003472611308099, + "epoch": 0.26240759354567217, + "grad_norm": 4.827385425567627, + "learning_rate": 4.938636314701793e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.8391556084156037, + "num_tokens": 101770387.0, + "step": 84650 + }, + { + "entropy": 1.911148864030838, + "epoch": 0.2624385926707219, + "grad_norm": 8.909619331359863, + "learning_rate": 4.9383446279252895e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8415949329733848, + "num_tokens": 101781627.0, + "step": 84660 + }, + { + "entropy": 1.8494424894452095, + "epoch": 0.26246959179577156, + "grad_norm": 4.314811706542969, + "learning_rate": 4.93805299282568e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8588860586285592, + "num_tokens": 101794354.0, + "step": 84670 + }, + { + "entropy": 1.9854395300149918, + "epoch": 0.2625005909208213, + "grad_norm": 4.768241882324219, + "learning_rate": 4.937761409387704e-06, + "loss": 0.5498, + "mean_token_accuracy": 0.8315106898546218, + "num_tokens": 101805875.0, + "step": 84680 + }, + { + "entropy": 1.8694578632712364, + "epoch": 0.26253159004587096, + "grad_norm": 11.459022521972656, + "learning_rate": 4.937469877596115e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8435337409377098, + "num_tokens": 101818131.0, + "step": 84690 + }, + { + "entropy": 1.9157974675297738, + "epoch": 0.2625625891709206, + "grad_norm": 10.549239158630371, + "learning_rate": 4.9371783974356665e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8447314709424972, + "num_tokens": 101830440.0, + "step": 84700 + }, + { + "entropy": 1.9070345923304557, + "epoch": 0.26259358829597035, + "grad_norm": 7.496677398681641, + "learning_rate": 4.93688696889112e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8451457783579827, + "num_tokens": 101841964.0, + "step": 84710 + }, + { + "entropy": 1.8439010679721832, + "epoch": 0.26262458742102, + "grad_norm": 3.86055850982666, + "learning_rate": 4.936595591947242e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8469921678304673, + "num_tokens": 101854984.0, + "step": 84720 + }, + { + "entropy": 1.8208919912576675, + "epoch": 0.26265558654606974, + "grad_norm": 8.848628044128418, + "learning_rate": 4.93630426658881e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8524587988853455, + "num_tokens": 101868205.0, + "step": 84730 + }, + { + "entropy": 1.9341197982430458, + "epoch": 0.2626865856711194, + "grad_norm": 9.021467208862305, + "learning_rate": 4.936012992800602e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.8401493951678276, + "num_tokens": 101879993.0, + "step": 84740 + }, + { + "entropy": 1.900348238646984, + "epoch": 0.26271758479616913, + "grad_norm": 9.046464920043945, + "learning_rate": 4.935721770567406e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8420239523053169, + "num_tokens": 101891699.0, + "step": 84750 + }, + { + "entropy": 2.011109836399555, + "epoch": 0.2627485839212188, + "grad_norm": 9.504515647888184, + "learning_rate": 4.935430599874017e-06, + "loss": 0.5426, + "mean_token_accuracy": 0.8297847151756287, + "num_tokens": 101902570.0, + "step": 84760 + }, + { + "entropy": 1.8008052319288255, + "epoch": 0.26277958304626853, + "grad_norm": 7.699909687042236, + "learning_rate": 4.9351394807052325e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8641871735453606, + "num_tokens": 101915635.0, + "step": 84770 + }, + { + "entropy": 1.8245060086250304, + "epoch": 0.2628105821713182, + "grad_norm": 8.3301362991333, + "learning_rate": 4.934848413045857e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8514155328273774, + "num_tokens": 101928385.0, + "step": 84780 + }, + { + "entropy": 1.8403597161173821, + "epoch": 0.2628415812963679, + "grad_norm": 3.8224892616271973, + "learning_rate": 4.934557396880704e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8499826923012733, + "num_tokens": 101941790.0, + "step": 84790 + }, + { + "entropy": 1.950501237809658, + "epoch": 0.2628725804214176, + "grad_norm": 6.607431411743164, + "learning_rate": 4.934266432194593e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.8484028398990631, + "num_tokens": 101953387.0, + "step": 84800 + }, + { + "entropy": 1.913737191259861, + "epoch": 0.2629035795464673, + "grad_norm": 7.8601508140563965, + "learning_rate": 4.933975518972347e-06, + "loss": 0.485, + "mean_token_accuracy": 0.850081168115139, + "num_tokens": 101964983.0, + "step": 84810 + }, + { + "entropy": 1.9299700886011124, + "epoch": 0.262934578671517, + "grad_norm": 8.866097450256348, + "learning_rate": 4.9336846571987965e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8501459851861, + "num_tokens": 101976619.0, + "step": 84820 + }, + { + "entropy": 1.9162129878997802, + "epoch": 0.2629655777965667, + "grad_norm": 8.77447509765625, + "learning_rate": 4.93339384685878e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8534630805253982, + "num_tokens": 101988887.0, + "step": 84830 + }, + { + "entropy": 1.9051405146718026, + "epoch": 0.2629965769216164, + "grad_norm": 7.113283157348633, + "learning_rate": 4.93310308793714e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8431653559207917, + "num_tokens": 102000822.0, + "step": 84840 + }, + { + "entropy": 1.8918147072196008, + "epoch": 0.2630275760466661, + "grad_norm": 7.229731559753418, + "learning_rate": 4.9328123804187265e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8591841533780098, + "num_tokens": 102012280.0, + "step": 84850 + }, + { + "entropy": 1.84690400660038, + "epoch": 0.26305857517171577, + "grad_norm": 4.94510555267334, + "learning_rate": 4.932521724288395e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8478389665484428, + "num_tokens": 102025734.0, + "step": 84860 + }, + { + "entropy": 1.8911878764629364, + "epoch": 0.2630895742967655, + "grad_norm": 4.1434783935546875, + "learning_rate": 4.932231119531007e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8523555234074592, + "num_tokens": 102038473.0, + "step": 84870 + }, + { + "entropy": 1.9033065840601922, + "epoch": 0.26312057342181516, + "grad_norm": 3.400432586669922, + "learning_rate": 4.9319405661314326e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8344430983066559, + "num_tokens": 102050229.0, + "step": 84880 + }, + { + "entropy": 1.9568065509200097, + "epoch": 0.2631515725468649, + "grad_norm": 10.253649711608887, + "learning_rate": 4.931650064074543e-06, + "loss": 0.5204, + "mean_token_accuracy": 0.8353859558701515, + "num_tokens": 102062301.0, + "step": 84890 + }, + { + "entropy": 1.8090133965015411, + "epoch": 0.26318257167191456, + "grad_norm": 9.690705299377441, + "learning_rate": 4.931359613345223e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8670252472162246, + "num_tokens": 102076084.0, + "step": 84900 + }, + { + "entropy": 1.881265440583229, + "epoch": 0.2632135707969643, + "grad_norm": 8.040104866027832, + "learning_rate": 4.9310692139283576e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.848574922978878, + "num_tokens": 102088089.0, + "step": 84910 + }, + { + "entropy": 1.823999959230423, + "epoch": 0.26324456992201395, + "grad_norm": 4.09398078918457, + "learning_rate": 4.9307788658088396e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8424740388989449, + "num_tokens": 102101088.0, + "step": 84920 + }, + { + "entropy": 1.8733570471405983, + "epoch": 0.2632755690470637, + "grad_norm": 7.2603230476379395, + "learning_rate": 4.93048856897157e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8479010179638863, + "num_tokens": 102113485.0, + "step": 84930 + }, + { + "entropy": 1.8576104506850242, + "epoch": 0.26330656817211334, + "grad_norm": 8.178943634033203, + "learning_rate": 4.930198323401454e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8650277197360993, + "num_tokens": 102126971.0, + "step": 84940 + }, + { + "entropy": 1.819041645526886, + "epoch": 0.263337567297163, + "grad_norm": 10.852204322814941, + "learning_rate": 4.929908129083402e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.84122084826231, + "num_tokens": 102140395.0, + "step": 84950 + }, + { + "entropy": 1.8517862141132355, + "epoch": 0.26336856642221274, + "grad_norm": 7.924666404724121, + "learning_rate": 4.929617986002334e-06, + "loss": 0.5, + "mean_token_accuracy": 0.841532975435257, + "num_tokens": 102153419.0, + "step": 84960 + }, + { + "entropy": 1.9836448311805726, + "epoch": 0.2633995655472624, + "grad_norm": 7.1847004890441895, + "learning_rate": 4.9293278941431724e-06, + "loss": 0.5787, + "mean_token_accuracy": 0.830291448533535, + "num_tokens": 102164560.0, + "step": 84970 + }, + { + "entropy": 1.9570752799510955, + "epoch": 0.26343056467231213, + "grad_norm": 7.852522850036621, + "learning_rate": 4.929037853490851e-06, + "loss": 0.5222, + "mean_token_accuracy": 0.8373702332377434, + "num_tokens": 102176029.0, + "step": 84980 + }, + { + "entropy": 1.9419500917196273, + "epoch": 0.2634615637973618, + "grad_norm": 8.110418319702148, + "learning_rate": 4.928747864030304e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.8400467559695244, + "num_tokens": 102188021.0, + "step": 84990 + }, + { + "entropy": 1.9458017632365228, + "epoch": 0.2634925629224115, + "grad_norm": 10.08768081665039, + "learning_rate": 4.928457925746475e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8457490563392639, + "num_tokens": 102199697.0, + "step": 85000 + }, + { + "entropy": 1.9600411295890807, + "epoch": 0.2635235620474612, + "grad_norm": 8.410282135009766, + "learning_rate": 4.928168038624313e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8549587532877923, + "num_tokens": 102210184.0, + "step": 85010 + }, + { + "entropy": 1.7980479046702385, + "epoch": 0.2635545611725109, + "grad_norm": 7.882757663726807, + "learning_rate": 4.927878202648774e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8440237864851952, + "num_tokens": 102223273.0, + "step": 85020 + }, + { + "entropy": 1.9615017265081405, + "epoch": 0.2635855602975606, + "grad_norm": 9.58923053741455, + "learning_rate": 4.927588417804819e-06, + "loss": 0.5578, + "mean_token_accuracy": 0.8358608677983284, + "num_tokens": 102234501.0, + "step": 85030 + }, + { + "entropy": 1.864892715215683, + "epoch": 0.2636165594226103, + "grad_norm": 7.730620384216309, + "learning_rate": 4.9272986840774155e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8360424354672432, + "num_tokens": 102247359.0, + "step": 85040 + }, + { + "entropy": 1.8440838590264321, + "epoch": 0.26364755854766, + "grad_norm": 3.6796610355377197, + "learning_rate": 4.927009001451538e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8569944024085998, + "num_tokens": 102260021.0, + "step": 85050 + }, + { + "entropy": 1.905400250852108, + "epoch": 0.2636785576727097, + "grad_norm": 3.788116931915283, + "learning_rate": 4.926719369912167e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8591458886861801, + "num_tokens": 102271674.0, + "step": 85060 + }, + { + "entropy": 1.8443732798099517, + "epoch": 0.26370955679775937, + "grad_norm": 7.661864757537842, + "learning_rate": 4.926429789444288e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8470209822058677, + "num_tokens": 102284578.0, + "step": 85070 + }, + { + "entropy": 1.8414702624082566, + "epoch": 0.2637405559228091, + "grad_norm": 7.228357791900635, + "learning_rate": 4.926140260032895e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8367054253816605, + "num_tokens": 102296979.0, + "step": 85080 + }, + { + "entropy": 1.9466857939958573, + "epoch": 0.26377155504785876, + "grad_norm": 11.023140907287598, + "learning_rate": 4.925850781662983e-06, + "loss": 0.5557, + "mean_token_accuracy": 0.8282995998859406, + "num_tokens": 102308493.0, + "step": 85090 + }, + { + "entropy": 1.963821244239807, + "epoch": 0.2638025541729085, + "grad_norm": 11.524614334106445, + "learning_rate": 4.92556135431956e-06, + "loss": 0.5303, + "mean_token_accuracy": 0.8317586749792099, + "num_tokens": 102320319.0, + "step": 85100 + }, + { + "entropy": 1.9346794202923774, + "epoch": 0.26383355329795816, + "grad_norm": 7.5674214363098145, + "learning_rate": 4.9252719779876374e-06, + "loss": 0.5261, + "mean_token_accuracy": 0.8399398401379585, + "num_tokens": 102331510.0, + "step": 85110 + }, + { + "entropy": 1.8758284986019134, + "epoch": 0.2638645524230079, + "grad_norm": 3.6614625453948975, + "learning_rate": 4.92498265265223e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8341593369841576, + "num_tokens": 102344007.0, + "step": 85120 + }, + { + "entropy": 1.9138728231191635, + "epoch": 0.26389555154805755, + "grad_norm": 4.82459831237793, + "learning_rate": 4.924693378298362e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8473570510745049, + "num_tokens": 102355863.0, + "step": 85130 + }, + { + "entropy": 1.9841463685035705, + "epoch": 0.2639265506731073, + "grad_norm": 7.99963903427124, + "learning_rate": 4.924404154911063e-06, + "loss": 0.5413, + "mean_token_accuracy": 0.8389569863677024, + "num_tokens": 102366684.0, + "step": 85140 + }, + { + "entropy": 1.8069503650069236, + "epoch": 0.26395754979815694, + "grad_norm": 7.8926568031311035, + "learning_rate": 4.92411498247537e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8335721820592881, + "num_tokens": 102380841.0, + "step": 85150 + }, + { + "entropy": 1.9148933947086335, + "epoch": 0.26398854892320667, + "grad_norm": 7.529796600341797, + "learning_rate": 4.923825860976324e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8480879321694375, + "num_tokens": 102392714.0, + "step": 85160 + }, + { + "entropy": 1.9118664294481278, + "epoch": 0.26401954804825634, + "grad_norm": 9.968255996704102, + "learning_rate": 4.9235367903989705e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8447333693504333, + "num_tokens": 102404766.0, + "step": 85170 + }, + { + "entropy": 1.9002967342734336, + "epoch": 0.26405054717330606, + "grad_norm": 9.95118236541748, + "learning_rate": 4.923247770728366e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.855803194642067, + "num_tokens": 102416668.0, + "step": 85180 + }, + { + "entropy": 1.8763260886073112, + "epoch": 0.26408154629835573, + "grad_norm": 7.586075305938721, + "learning_rate": 4.9229588019495714e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.8425167426466942, + "num_tokens": 102429365.0, + "step": 85190 + }, + { + "entropy": 2.0041319727897644, + "epoch": 0.2641125454234054, + "grad_norm": 9.366758346557617, + "learning_rate": 4.922669884047651e-06, + "loss": 0.5717, + "mean_token_accuracy": 0.8297568678855896, + "num_tokens": 102440584.0, + "step": 85200 + }, + { + "entropy": 1.9379279300570489, + "epoch": 0.2641435445484551, + "grad_norm": 8.975931167602539, + "learning_rate": 4.922381017007679e-06, + "loss": 0.5412, + "mean_token_accuracy": 0.8403313905000687, + "num_tokens": 102452027.0, + "step": 85210 + }, + { + "entropy": 1.9602735459804534, + "epoch": 0.2641745436735048, + "grad_norm": 8.235448837280273, + "learning_rate": 4.9220922008147325e-06, + "loss": 0.5294, + "mean_token_accuracy": 0.8311275467276573, + "num_tokens": 102463662.0, + "step": 85220 + }, + { + "entropy": 1.8260437846183777, + "epoch": 0.2642055427985545, + "grad_norm": 9.317481994628906, + "learning_rate": 4.921803435453896e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8505254268646241, + "num_tokens": 102476893.0, + "step": 85230 + }, + { + "entropy": 1.8744411259889602, + "epoch": 0.2642365419236042, + "grad_norm": 7.509456634521484, + "learning_rate": 4.921514720910262e-06, + "loss": 0.5191, + "mean_token_accuracy": 0.8349858596920967, + "num_tokens": 102489557.0, + "step": 85240 + }, + { + "entropy": 1.8670647412538528, + "epoch": 0.2642675410486539, + "grad_norm": 3.518993377685547, + "learning_rate": 4.921226057168927e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8494436904788017, + "num_tokens": 102502419.0, + "step": 85250 + }, + { + "entropy": 1.9050493866205216, + "epoch": 0.2642985401737036, + "grad_norm": 8.357398986816406, + "learning_rate": 4.920937444214995e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8412919878959656, + "num_tokens": 102514450.0, + "step": 85260 + }, + { + "entropy": 1.79098000228405, + "epoch": 0.2643295392987533, + "grad_norm": 4.370672702789307, + "learning_rate": 4.920648882033572e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8489180520176888, + "num_tokens": 102527984.0, + "step": 85270 + }, + { + "entropy": 1.9773102968931198, + "epoch": 0.26436053842380297, + "grad_norm": 7.729239463806152, + "learning_rate": 4.920360370609777e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8391767382621765, + "num_tokens": 102538749.0, + "step": 85280 + }, + { + "entropy": 1.935066755115986, + "epoch": 0.2643915375488527, + "grad_norm": 8.969949722290039, + "learning_rate": 4.920071909928729e-06, + "loss": 0.5233, + "mean_token_accuracy": 0.842116117477417, + "num_tokens": 102550262.0, + "step": 85290 + }, + { + "entropy": 1.8114179536700248, + "epoch": 0.26442253667390236, + "grad_norm": 7.595683574676514, + "learning_rate": 4.919783499975556e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8623112186789512, + "num_tokens": 102563124.0, + "step": 85300 + }, + { + "entropy": 1.8905057892203332, + "epoch": 0.2644535357989521, + "grad_norm": 7.8862199783325195, + "learning_rate": 4.919495140735392e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8516170725226402, + "num_tokens": 102575911.0, + "step": 85310 + }, + { + "entropy": 1.7555623829364777, + "epoch": 0.26448453492400176, + "grad_norm": 7.201220989227295, + "learning_rate": 4.919206832193378e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8540243923664093, + "num_tokens": 102589847.0, + "step": 85320 + }, + { + "entropy": 1.9273912906646729, + "epoch": 0.2645155340490515, + "grad_norm": 8.060796737670898, + "learning_rate": 4.918918574334659e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.848893666267395, + "num_tokens": 102601212.0, + "step": 85330 + }, + { + "entropy": 1.8354052126407623, + "epoch": 0.26454653317410115, + "grad_norm": 8.087235450744629, + "learning_rate": 4.918630367144384e-06, + "loss": 0.543, + "mean_token_accuracy": 0.8323055505752563, + "num_tokens": 102614529.0, + "step": 85340 + }, + { + "entropy": 1.9022644311189651, + "epoch": 0.2645775322991509, + "grad_norm": 9.103813171386719, + "learning_rate": 4.918342210607715e-06, + "loss": 0.5306, + "mean_token_accuracy": 0.828366307914257, + "num_tokens": 102627163.0, + "step": 85350 + }, + { + "entropy": 1.8333250015974045, + "epoch": 0.26460853142420054, + "grad_norm": 9.28365707397461, + "learning_rate": 4.918054104709815e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8535311058163643, + "num_tokens": 102639926.0, + "step": 85360 + }, + { + "entropy": 1.8946420654654503, + "epoch": 0.26463953054925027, + "grad_norm": 7.266073703765869, + "learning_rate": 4.917766049435854e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8455268830060959, + "num_tokens": 102652224.0, + "step": 85370 + }, + { + "entropy": 1.9376175180077553, + "epoch": 0.26467052967429994, + "grad_norm": 7.632312297821045, + "learning_rate": 4.917478044771007e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8479833841323853, + "num_tokens": 102664350.0, + "step": 85380 + }, + { + "entropy": 1.9229725405573845, + "epoch": 0.26470152879934966, + "grad_norm": 7.888539791107178, + "learning_rate": 4.9171900907004585e-06, + "loss": 0.5669, + "mean_token_accuracy": 0.8408885851502419, + "num_tokens": 102675841.0, + "step": 85390 + }, + { + "entropy": 1.8650186344981194, + "epoch": 0.26473252792439933, + "grad_norm": 7.847538471221924, + "learning_rate": 4.916902187209395e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8481592908501625, + "num_tokens": 102687826.0, + "step": 85400 + }, + { + "entropy": 1.8229721501469611, + "epoch": 0.26476352704944905, + "grad_norm": 7.471333026885986, + "learning_rate": 4.916614334283012e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8582251816987991, + "num_tokens": 102700892.0, + "step": 85410 + }, + { + "entropy": 1.934105758368969, + "epoch": 0.2647945261744987, + "grad_norm": 3.659540891647339, + "learning_rate": 4.91632653190651e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8533608466386795, + "num_tokens": 102712704.0, + "step": 85420 + }, + { + "entropy": 1.9130233809351922, + "epoch": 0.26482552529954845, + "grad_norm": 9.22014331817627, + "learning_rate": 4.916038780065096e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8448394879698753, + "num_tokens": 102724684.0, + "step": 85430 + }, + { + "entropy": 1.9155945912003518, + "epoch": 0.2648565244245981, + "grad_norm": 8.854886054992676, + "learning_rate": 4.9157510787439814e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8422875538468361, + "num_tokens": 102736595.0, + "step": 85440 + }, + { + "entropy": 1.8779947102069854, + "epoch": 0.2648875235496478, + "grad_norm": 7.41634464263916, + "learning_rate": 4.9154634279283864e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8430365577340126, + "num_tokens": 102749126.0, + "step": 85450 + }, + { + "entropy": 1.8960552558302879, + "epoch": 0.2649185226746975, + "grad_norm": 10.775057792663574, + "learning_rate": 4.915175827603535e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8464867159724235, + "num_tokens": 102761000.0, + "step": 85460 + }, + { + "entropy": 1.8219962686300277, + "epoch": 0.2649495217997472, + "grad_norm": 4.283982753753662, + "learning_rate": 4.914888277754658e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8421214401721955, + "num_tokens": 102774313.0, + "step": 85470 + }, + { + "entropy": 1.9417266234755517, + "epoch": 0.2649805209247969, + "grad_norm": 3.7757527828216553, + "learning_rate": 4.914600778366993e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.8345933735370636, + "num_tokens": 102785961.0, + "step": 85480 + }, + { + "entropy": 1.9103323504328729, + "epoch": 0.26501152004984657, + "grad_norm": 3.362926721572876, + "learning_rate": 4.9143133294257815e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8477786153554916, + "num_tokens": 102798060.0, + "step": 85490 + }, + { + "entropy": 1.943329544365406, + "epoch": 0.2650425191748963, + "grad_norm": 9.327890396118164, + "learning_rate": 4.914025930916273e-06, + "loss": 0.5187, + "mean_token_accuracy": 0.8373201444745064, + "num_tokens": 102809320.0, + "step": 85500 + }, + { + "entropy": 1.881453277170658, + "epoch": 0.26507351829994596, + "grad_norm": 3.5227839946746826, + "learning_rate": 4.913738582823723e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8572242721915245, + "num_tokens": 102820907.0, + "step": 85510 + }, + { + "entropy": 1.8902042105793952, + "epoch": 0.2651045174249957, + "grad_norm": 7.871299743652344, + "learning_rate": 4.913451285133394e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8486515626311302, + "num_tokens": 102832171.0, + "step": 85520 + }, + { + "entropy": 1.9584900826215743, + "epoch": 0.26513551655004536, + "grad_norm": 3.7320501804351807, + "learning_rate": 4.91316403783055e-06, + "loss": 0.5608, + "mean_token_accuracy": 0.8346157044172287, + "num_tokens": 102843429.0, + "step": 85530 + }, + { + "entropy": 1.8431198254227639, + "epoch": 0.2651665156750951, + "grad_norm": 3.781135320663452, + "learning_rate": 4.912876840900466e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8510622009634972, + "num_tokens": 102855860.0, + "step": 85540 + }, + { + "entropy": 1.9608791798353196, + "epoch": 0.26519751480014475, + "grad_norm": 9.239676475524902, + "learning_rate": 4.91258969432842e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.8367750391364097, + "num_tokens": 102867302.0, + "step": 85550 + }, + { + "entropy": 1.8789279222488404, + "epoch": 0.2652285139251945, + "grad_norm": 3.7677621841430664, + "learning_rate": 4.912302598099698e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8507544234395027, + "num_tokens": 102879643.0, + "step": 85560 + }, + { + "entropy": 1.9066937759518623, + "epoch": 0.26525951305024414, + "grad_norm": 10.09460163116455, + "learning_rate": 4.9120155521995925e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8424730479717255, + "num_tokens": 102892044.0, + "step": 85570 + }, + { + "entropy": 1.8171610802412033, + "epoch": 0.26529051217529387, + "grad_norm": 7.337011337280273, + "learning_rate": 4.911728556613397e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8553290620446206, + "num_tokens": 102904614.0, + "step": 85580 + }, + { + "entropy": 1.8738331109285356, + "epoch": 0.26532151130034354, + "grad_norm": 9.422406196594238, + "learning_rate": 4.911441611326418e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8550050288438797, + "num_tokens": 102916052.0, + "step": 85590 + }, + { + "entropy": 1.776879619061947, + "epoch": 0.26535251042539326, + "grad_norm": 7.145488262176514, + "learning_rate": 4.911154716323966e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8612538367509842, + "num_tokens": 102929553.0, + "step": 85600 + }, + { + "entropy": 1.9105664394795894, + "epoch": 0.26538350955044293, + "grad_norm": 12.720219612121582, + "learning_rate": 4.91086787159135e-06, + "loss": 0.5435, + "mean_token_accuracy": 0.8275511726737023, + "num_tokens": 102940839.0, + "step": 85610 + }, + { + "entropy": 1.8572928413748742, + "epoch": 0.26541450867549266, + "grad_norm": 2.5268802642822266, + "learning_rate": 4.910581077113897e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8346114948391914, + "num_tokens": 102953161.0, + "step": 85620 + }, + { + "entropy": 1.8211661458015442, + "epoch": 0.2654455078005423, + "grad_norm": 4.548563480377197, + "learning_rate": 4.910294332876931e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8422662571072579, + "num_tokens": 102966676.0, + "step": 85630 + }, + { + "entropy": 1.874080342054367, + "epoch": 0.26547650692559205, + "grad_norm": 9.284010887145996, + "learning_rate": 4.910007638865787e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8418890178203583, + "num_tokens": 102978834.0, + "step": 85640 + }, + { + "entropy": 1.9538467079401016, + "epoch": 0.2655075060506417, + "grad_norm": 7.954508304595947, + "learning_rate": 4.909720995065805e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8483369365334511, + "num_tokens": 102989479.0, + "step": 85650 + }, + { + "entropy": 1.9834106057882308, + "epoch": 0.26553850517569144, + "grad_norm": 9.530325889587402, + "learning_rate": 4.909434401462327e-06, + "loss": 0.5648, + "mean_token_accuracy": 0.8310058429837227, + "num_tokens": 103000295.0, + "step": 85660 + }, + { + "entropy": 1.8795513778924942, + "epoch": 0.2655695043007411, + "grad_norm": 9.067574501037598, + "learning_rate": 4.9091478580407075e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8638965725898743, + "num_tokens": 103012052.0, + "step": 85670 + }, + { + "entropy": 1.8716339603066445, + "epoch": 0.2656005034257908, + "grad_norm": 11.293997764587402, + "learning_rate": 4.908861364786301e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8458243295550346, + "num_tokens": 103023681.0, + "step": 85680 + }, + { + "entropy": 1.8574876859784126, + "epoch": 0.2656315025508405, + "grad_norm": 8.510407447814941, + "learning_rate": 4.908574921684474e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8516736879944802, + "num_tokens": 103035416.0, + "step": 85690 + }, + { + "entropy": 1.8569199055433274, + "epoch": 0.2656625016758902, + "grad_norm": 4.086944103240967, + "learning_rate": 4.908288528720592e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8491792261600495, + "num_tokens": 103046882.0, + "step": 85700 + }, + { + "entropy": 1.9134871244430542, + "epoch": 0.2656935008009399, + "grad_norm": 7.807006359100342, + "learning_rate": 4.908002185880031e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.847848904132843, + "num_tokens": 103058705.0, + "step": 85710 + }, + { + "entropy": 1.911807608604431, + "epoch": 0.26572449992598957, + "grad_norm": 9.756507873535156, + "learning_rate": 4.907715893148174e-06, + "loss": 0.5384, + "mean_token_accuracy": 0.8321962848305702, + "num_tokens": 103071199.0, + "step": 85720 + }, + { + "entropy": 1.883806975185871, + "epoch": 0.2657554990510393, + "grad_norm": 6.826350212097168, + "learning_rate": 4.9074296505104055e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8427292719483376, + "num_tokens": 103082635.0, + "step": 85730 + }, + { + "entropy": 1.9495513945817948, + "epoch": 0.26578649817608896, + "grad_norm": 10.174969673156738, + "learning_rate": 4.9071434579521205e-06, + "loss": 0.5348, + "mean_token_accuracy": 0.8444296821951867, + "num_tokens": 103093525.0, + "step": 85740 + }, + { + "entropy": 1.934284047782421, + "epoch": 0.2658174973011387, + "grad_norm": 9.920119285583496, + "learning_rate": 4.9068573154587165e-06, + "loss": 0.5163, + "mean_token_accuracy": 0.8450621247291565, + "num_tokens": 103105373.0, + "step": 85750 + }, + { + "entropy": 1.7569790095090867, + "epoch": 0.26584849642618835, + "grad_norm": 7.43773078918457, + "learning_rate": 4.9065712230156e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8663360059261322, + "num_tokens": 103118573.0, + "step": 85760 + }, + { + "entropy": 1.8341956838965416, + "epoch": 0.2658794955512381, + "grad_norm": 7.597226619720459, + "learning_rate": 4.906285180608181e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8481891751289368, + "num_tokens": 103131304.0, + "step": 85770 + }, + { + "entropy": 1.9248315215110778, + "epoch": 0.26591049467628775, + "grad_norm": 8.323822021484375, + "learning_rate": 4.905999188221875e-06, + "loss": 0.519, + "mean_token_accuracy": 0.8360444948077201, + "num_tokens": 103142765.0, + "step": 85780 + }, + { + "entropy": 1.892122246325016, + "epoch": 0.26594149380133747, + "grad_norm": 8.809585571289062, + "learning_rate": 4.905713245842107e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.834871856868267, + "num_tokens": 103154444.0, + "step": 85790 + }, + { + "entropy": 1.9127579972147941, + "epoch": 0.26597249292638714, + "grad_norm": 8.541977882385254, + "learning_rate": 4.905427353454305e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.839243420958519, + "num_tokens": 103166217.0, + "step": 85800 + }, + { + "entropy": 1.8976189360022544, + "epoch": 0.26600349205143686, + "grad_norm": 8.723061561584473, + "learning_rate": 4.905141511043905e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8352738916873932, + "num_tokens": 103178533.0, + "step": 85810 + }, + { + "entropy": 1.9483844459056854, + "epoch": 0.26603449117648653, + "grad_norm": 8.283819198608398, + "learning_rate": 4.904855718596345e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.8419344946742058, + "num_tokens": 103188784.0, + "step": 85820 + }, + { + "entropy": 1.8302007086575032, + "epoch": 0.26606549030153626, + "grad_norm": 7.834589004516602, + "learning_rate": 4.9045699760970725e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8487353563308716, + "num_tokens": 103201469.0, + "step": 85830 + }, + { + "entropy": 1.9522307723760606, + "epoch": 0.2660964894265859, + "grad_norm": 11.1272554397583, + "learning_rate": 4.904284283531541e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8433923795819283, + "num_tokens": 103212554.0, + "step": 85840 + }, + { + "entropy": 1.9034996896982193, + "epoch": 0.26612748855163565, + "grad_norm": 8.404441833496094, + "learning_rate": 4.903998640885207e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8423371136188507, + "num_tokens": 103223756.0, + "step": 85850 + }, + { + "entropy": 1.8009837806224822, + "epoch": 0.2661584876766853, + "grad_norm": 9.704791069030762, + "learning_rate": 4.903713048143537e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.851232835650444, + "num_tokens": 103237242.0, + "step": 85860 + }, + { + "entropy": 1.8711876735091209, + "epoch": 0.26618948680173504, + "grad_norm": 8.211481094360352, + "learning_rate": 4.903427505292001e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.842264424264431, + "num_tokens": 103249121.0, + "step": 85870 + }, + { + "entropy": 1.7938581064343453, + "epoch": 0.2662204859267847, + "grad_norm": 3.8567850589752197, + "learning_rate": 4.903142012316073e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8489397838711739, + "num_tokens": 103261025.0, + "step": 85880 + }, + { + "entropy": 1.9541205585002899, + "epoch": 0.26625148505183444, + "grad_norm": 10.45516300201416, + "learning_rate": 4.902856569201237e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.8429744258522988, + "num_tokens": 103271985.0, + "step": 85890 + }, + { + "entropy": 1.7870109647512435, + "epoch": 0.2662824841768841, + "grad_norm": 2.5378193855285645, + "learning_rate": 4.90257117593298e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8521895721554756, + "num_tokens": 103285897.0, + "step": 85900 + }, + { + "entropy": 1.769122688472271, + "epoch": 0.26631348330193383, + "grad_norm": 8.693305969238281, + "learning_rate": 4.902285832496798e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8523093193769455, + "num_tokens": 103299434.0, + "step": 85910 + }, + { + "entropy": 1.8774210557341575, + "epoch": 0.2663444824269835, + "grad_norm": 7.8302507400512695, + "learning_rate": 4.902000538878188e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8508623972535133, + "num_tokens": 103312561.0, + "step": 85920 + }, + { + "entropy": 1.8622436970472336, + "epoch": 0.26637548155203317, + "grad_norm": 8.453393936157227, + "learning_rate": 4.9017152950626585e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8456780537962914, + "num_tokens": 103324471.0, + "step": 85930 + }, + { + "entropy": 1.9303288459777832, + "epoch": 0.2664064806770829, + "grad_norm": 9.193410873413086, + "learning_rate": 4.901430101035719e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8492061391472816, + "num_tokens": 103336441.0, + "step": 85940 + }, + { + "entropy": 1.9568013072013855, + "epoch": 0.26643747980213256, + "grad_norm": 8.285205841064453, + "learning_rate": 4.901144956782889e-06, + "loss": 0.5306, + "mean_token_accuracy": 0.8406900480389595, + "num_tokens": 103347523.0, + "step": 85950 + }, + { + "entropy": 1.9028612434864045, + "epoch": 0.2664684789271823, + "grad_norm": 8.355502128601074, + "learning_rate": 4.900859862289691e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8526147872209549, + "num_tokens": 103358369.0, + "step": 85960 + }, + { + "entropy": 1.929163258522749, + "epoch": 0.26649947805223195, + "grad_norm": 8.101810455322266, + "learning_rate": 4.900574817541653e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.8357960835099221, + "num_tokens": 103369659.0, + "step": 85970 + }, + { + "entropy": 1.910251635313034, + "epoch": 0.2665304771772817, + "grad_norm": 8.729976654052734, + "learning_rate": 4.900289822524311e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8469779431819916, + "num_tokens": 103381688.0, + "step": 85980 + }, + { + "entropy": 1.9128791213035583, + "epoch": 0.26656147630233135, + "grad_norm": 4.791084289550781, + "learning_rate": 4.900004877223208e-06, + "loss": 0.5225, + "mean_token_accuracy": 0.8399335280060768, + "num_tokens": 103393309.0, + "step": 85990 + }, + { + "entropy": 1.8070705935359002, + "epoch": 0.26659247542738107, + "grad_norm": 4.363813877105713, + "learning_rate": 4.899719981623888e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8653892949223518, + "num_tokens": 103406198.0, + "step": 86000 + }, + { + "entropy": 1.858493185043335, + "epoch": 0.26662347455243074, + "grad_norm": 8.77523422241211, + "learning_rate": 4.899435135711908e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.853272658586502, + "num_tokens": 103418547.0, + "step": 86010 + }, + { + "entropy": 1.8741918861865998, + "epoch": 0.26665447367748046, + "grad_norm": 8.620582580566406, + "learning_rate": 4.899150339472823e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8357633009552956, + "num_tokens": 103430557.0, + "step": 86020 + }, + { + "entropy": 1.7336426332592965, + "epoch": 0.26668547280253013, + "grad_norm": 3.6474039554595947, + "learning_rate": 4.898865592892199e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8620437294244766, + "num_tokens": 103444908.0, + "step": 86030 + }, + { + "entropy": 1.8742715016007423, + "epoch": 0.26671647192757986, + "grad_norm": 9.908348083496094, + "learning_rate": 4.8985808959556055e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8436573952436447, + "num_tokens": 103457439.0, + "step": 86040 + }, + { + "entropy": 1.8521887600421905, + "epoch": 0.2667474710526295, + "grad_norm": 9.044089317321777, + "learning_rate": 4.8982962486486215e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8412206932902336, + "num_tokens": 103469444.0, + "step": 86050 + }, + { + "entropy": 1.892989605665207, + "epoch": 0.26677847017767925, + "grad_norm": 9.907114028930664, + "learning_rate": 4.898011650956826e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.8311768040060997, + "num_tokens": 103480820.0, + "step": 86060 + }, + { + "entropy": 1.9290689766407012, + "epoch": 0.2668094693027289, + "grad_norm": 9.498516082763672, + "learning_rate": 4.897727102865811e-06, + "loss": 0.5325, + "mean_token_accuracy": 0.8411960631608963, + "num_tokens": 103491988.0, + "step": 86070 + }, + { + "entropy": 1.857162345945835, + "epoch": 0.26684046842777864, + "grad_norm": 9.090229988098145, + "learning_rate": 4.897442604361166e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8425956487655639, + "num_tokens": 103504063.0, + "step": 86080 + }, + { + "entropy": 1.8890363931655885, + "epoch": 0.2668714675528283, + "grad_norm": 7.7313947677612305, + "learning_rate": 4.8971581554284956e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.8431884482502937, + "num_tokens": 103515289.0, + "step": 86090 + }, + { + "entropy": 1.7703155070543288, + "epoch": 0.26690246667787804, + "grad_norm": 9.580257415771484, + "learning_rate": 4.896873756053401e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8482003748416901, + "num_tokens": 103528699.0, + "step": 86100 + }, + { + "entropy": 1.8586272314190864, + "epoch": 0.2669334658029277, + "grad_norm": 8.929677963256836, + "learning_rate": 4.8965894062214955e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8518487945199013, + "num_tokens": 103540418.0, + "step": 86110 + }, + { + "entropy": 1.7248114220798016, + "epoch": 0.26696446492797743, + "grad_norm": 9.42658805847168, + "learning_rate": 4.896305105918398e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8690110579133034, + "num_tokens": 103554470.0, + "step": 86120 + }, + { + "entropy": 1.8832508057355881, + "epoch": 0.2669954640530271, + "grad_norm": 8.39008903503418, + "learning_rate": 4.89602085512973e-06, + "loss": 0.5325, + "mean_token_accuracy": 0.8384171515703202, + "num_tokens": 103566005.0, + "step": 86130 + }, + { + "entropy": 1.88730780929327, + "epoch": 0.2670264631780768, + "grad_norm": 7.347865104675293, + "learning_rate": 4.895736653841122e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.8440589159727097, + "num_tokens": 103577436.0, + "step": 86140 + }, + { + "entropy": 1.87120311409235, + "epoch": 0.2670574623031265, + "grad_norm": 10.774239540100098, + "learning_rate": 4.895452502038206e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8394436612725258, + "num_tokens": 103589660.0, + "step": 86150 + }, + { + "entropy": 1.8455160409212112, + "epoch": 0.2670884614281762, + "grad_norm": 4.862356662750244, + "learning_rate": 4.895168399706626e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8532706961035729, + "num_tokens": 103601065.0, + "step": 86160 + }, + { + "entropy": 1.8480889692902565, + "epoch": 0.2671194605532259, + "grad_norm": 7.0500688552856445, + "learning_rate": 4.894884346832027e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8402373760938644, + "num_tokens": 103613385.0, + "step": 86170 + }, + { + "entropy": 1.8862942337989808, + "epoch": 0.26715045967827555, + "grad_norm": 8.611431121826172, + "learning_rate": 4.894600343400061e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8492077127099037, + "num_tokens": 103625382.0, + "step": 86180 + }, + { + "entropy": 1.8930604338645936, + "epoch": 0.2671814588033253, + "grad_norm": 7.554574966430664, + "learning_rate": 4.894316389396388e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8508794024586678, + "num_tokens": 103637191.0, + "step": 86190 + }, + { + "entropy": 1.921440924704075, + "epoch": 0.26721245792837495, + "grad_norm": 8.372522354125977, + "learning_rate": 4.894032484806671e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8466569393873214, + "num_tokens": 103648987.0, + "step": 86200 + }, + { + "entropy": 1.9750257551670074, + "epoch": 0.26724345705342467, + "grad_norm": 6.741246700286865, + "learning_rate": 4.893748629616579e-06, + "loss": 0.5252, + "mean_token_accuracy": 0.83692237585783, + "num_tokens": 103659952.0, + "step": 86210 + }, + { + "entropy": 1.8079271107912063, + "epoch": 0.26727445617847434, + "grad_norm": 4.520543098449707, + "learning_rate": 4.89346482381179e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.851072619855404, + "num_tokens": 103672555.0, + "step": 86220 + }, + { + "entropy": 1.9372185349464417, + "epoch": 0.26730545530352406, + "grad_norm": 8.379465103149414, + "learning_rate": 4.8931810673779826e-06, + "loss": 0.5333, + "mean_token_accuracy": 0.8367386668920517, + "num_tokens": 103683329.0, + "step": 86230 + }, + { + "entropy": 1.8972872629761697, + "epoch": 0.26733645442857373, + "grad_norm": 8.481284141540527, + "learning_rate": 4.8928973603008466e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.836910292506218, + "num_tokens": 103694708.0, + "step": 86240 + }, + { + "entropy": 1.8982973158359528, + "epoch": 0.26736745355362346, + "grad_norm": 9.208887100219727, + "learning_rate": 4.892613702566074e-06, + "loss": 0.528, + "mean_token_accuracy": 0.8313050597906113, + "num_tokens": 103707234.0, + "step": 86250 + }, + { + "entropy": 1.9922932714223862, + "epoch": 0.2673984526786731, + "grad_norm": 8.469325065612793, + "learning_rate": 4.892330094159364e-06, + "loss": 0.5287, + "mean_token_accuracy": 0.8502902209758758, + "num_tokens": 103717727.0, + "step": 86260 + }, + { + "entropy": 1.8680451080203055, + "epoch": 0.26742945180372285, + "grad_norm": 9.319469451904297, + "learning_rate": 4.892046535066422e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.844627107679844, + "num_tokens": 103729854.0, + "step": 86270 + }, + { + "entropy": 1.9178496971726418, + "epoch": 0.2674604509287725, + "grad_norm": 3.9577114582061768, + "learning_rate": 4.891763025272957e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8466630190610885, + "num_tokens": 103742121.0, + "step": 86280 + }, + { + "entropy": 1.9498088628053665, + "epoch": 0.26749145005382224, + "grad_norm": 7.915012359619141, + "learning_rate": 4.891479564764686e-06, + "loss": 0.5296, + "mean_token_accuracy": 0.8389876633882523, + "num_tokens": 103753077.0, + "step": 86290 + }, + { + "entropy": 1.8598709747195243, + "epoch": 0.2675224491788719, + "grad_norm": 5.166788101196289, + "learning_rate": 4.891196153527332e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8542357131838798, + "num_tokens": 103765572.0, + "step": 86300 + }, + { + "entropy": 1.87834425419569, + "epoch": 0.26755344830392164, + "grad_norm": 5.004603862762451, + "learning_rate": 4.890912791546621e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8426024809479713, + "num_tokens": 103777759.0, + "step": 86310 + }, + { + "entropy": 1.9219666391611099, + "epoch": 0.2675844474289713, + "grad_norm": 7.858938694000244, + "learning_rate": 4.8906294788082895e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8506090447306633, + "num_tokens": 103790518.0, + "step": 86320 + }, + { + "entropy": 1.9264658272266388, + "epoch": 0.26761544655402103, + "grad_norm": 8.828166007995605, + "learning_rate": 4.890346215298074e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8476721942424774, + "num_tokens": 103801617.0, + "step": 86330 + }, + { + "entropy": 1.8663738921284676, + "epoch": 0.2676464456790707, + "grad_norm": 8.534181594848633, + "learning_rate": 4.890063001001723e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8441844269633293, + "num_tokens": 103814001.0, + "step": 86340 + }, + { + "entropy": 1.9115709066390991, + "epoch": 0.2676774448041204, + "grad_norm": 3.800837278366089, + "learning_rate": 4.889779835904984e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8440373882651329, + "num_tokens": 103826239.0, + "step": 86350 + }, + { + "entropy": 1.9562127739191055, + "epoch": 0.2677084439291701, + "grad_norm": 9.03986930847168, + "learning_rate": 4.889496719993616e-06, + "loss": 0.5581, + "mean_token_accuracy": 0.8341035321354866, + "num_tokens": 103837216.0, + "step": 86360 + }, + { + "entropy": 1.8933446779847145, + "epoch": 0.2677394430542198, + "grad_norm": 8.497458457946777, + "learning_rate": 4.889213653253382e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8447739273309708, + "num_tokens": 103849512.0, + "step": 86370 + }, + { + "entropy": 1.940016995370388, + "epoch": 0.2677704421792695, + "grad_norm": 9.415840148925781, + "learning_rate": 4.88893063567005e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8498956128954888, + "num_tokens": 103860985.0, + "step": 86380 + }, + { + "entropy": 1.9313148841261865, + "epoch": 0.2678014413043192, + "grad_norm": 8.689431190490723, + "learning_rate": 4.888647667229392e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.8414283052086831, + "num_tokens": 103872780.0, + "step": 86390 + }, + { + "entropy": 1.9360795393586159, + "epoch": 0.2678324404293689, + "grad_norm": 9.888842582702637, + "learning_rate": 4.888364747917191e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8488156452775002, + "num_tokens": 103884056.0, + "step": 86400 + }, + { + "entropy": 1.8884512677788734, + "epoch": 0.2678634395544186, + "grad_norm": 9.365774154663086, + "learning_rate": 4.888081877719231e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8477799639105796, + "num_tokens": 103895788.0, + "step": 86410 + }, + { + "entropy": 1.8175974920392037, + "epoch": 0.26789443867946827, + "grad_norm": 8.061196327209473, + "learning_rate": 4.887799056621303e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8709731310606003, + "num_tokens": 103908055.0, + "step": 86420 + }, + { + "entropy": 1.815473848581314, + "epoch": 0.26792543780451794, + "grad_norm": 8.550416946411133, + "learning_rate": 4.887516284609206e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8482015028595924, + "num_tokens": 103921019.0, + "step": 86430 + }, + { + "entropy": 1.8964615538716316, + "epoch": 0.26795643692956767, + "grad_norm": 7.224480152130127, + "learning_rate": 4.887233561668741e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.8473669067025185, + "num_tokens": 103932376.0, + "step": 86440 + }, + { + "entropy": 1.8842689141631126, + "epoch": 0.26798743605461733, + "grad_norm": 3.8930001258850098, + "learning_rate": 4.886950887785717e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8496074840426445, + "num_tokens": 103944476.0, + "step": 86450 + }, + { + "entropy": 1.946980032324791, + "epoch": 0.26801843517966706, + "grad_norm": 9.502989768981934, + "learning_rate": 4.886668262945951e-06, + "loss": 0.5331, + "mean_token_accuracy": 0.8362909764051437, + "num_tokens": 103955653.0, + "step": 86460 + }, + { + "entropy": 1.9322603926062585, + "epoch": 0.2680494343047167, + "grad_norm": 8.219178199768066, + "learning_rate": 4.886385687135257e-06, + "loss": 0.5317, + "mean_token_accuracy": 0.8409213215112686, + "num_tokens": 103967278.0, + "step": 86470 + }, + { + "entropy": 1.9176155909895898, + "epoch": 0.26808043342976645, + "grad_norm": 6.983169078826904, + "learning_rate": 4.886103160339469e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8338638916611671, + "num_tokens": 103979747.0, + "step": 86480 + }, + { + "entropy": 1.89949888586998, + "epoch": 0.2681114325548161, + "grad_norm": 8.026739120483398, + "learning_rate": 4.885820682544414e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8553324237465858, + "num_tokens": 103991452.0, + "step": 86490 + }, + { + "entropy": 1.9148534148931504, + "epoch": 0.26814243167986584, + "grad_norm": 8.056224822998047, + "learning_rate": 4.885538253735928e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8458040565252304, + "num_tokens": 104003229.0, + "step": 86500 + }, + { + "entropy": 1.9580560460686685, + "epoch": 0.2681734308049155, + "grad_norm": 9.577115058898926, + "learning_rate": 4.885255873899857e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8413164436817169, + "num_tokens": 104014372.0, + "step": 86510 + }, + { + "entropy": 1.8256202667951584, + "epoch": 0.26820442992996524, + "grad_norm": 4.1934733390808105, + "learning_rate": 4.884973543022048e-06, + "loss": 0.5344, + "mean_token_accuracy": 0.8329330936074257, + "num_tokens": 104027131.0, + "step": 86520 + }, + { + "entropy": 1.9278512462973594, + "epoch": 0.2682354290550149, + "grad_norm": 6.770676612854004, + "learning_rate": 4.884691261088359e-06, + "loss": 0.5187, + "mean_token_accuracy": 0.8434366106986999, + "num_tokens": 104038489.0, + "step": 86530 + }, + { + "entropy": 1.7697638273239136, + "epoch": 0.26826642818006463, + "grad_norm": 9.314817428588867, + "learning_rate": 4.884409028084645e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8589093953371048, + "num_tokens": 104051697.0, + "step": 86540 + }, + { + "entropy": 1.8212177708745003, + "epoch": 0.2682974273051143, + "grad_norm": 8.558985710144043, + "learning_rate": 4.8841268439967744e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8659472689032555, + "num_tokens": 104064685.0, + "step": 86550 + }, + { + "entropy": 1.8339843198657035, + "epoch": 0.268328426430164, + "grad_norm": 5.08605432510376, + "learning_rate": 4.883844708810621e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8505037307739258, + "num_tokens": 104077583.0, + "step": 86560 + }, + { + "entropy": 1.9253122627735137, + "epoch": 0.2683594255552137, + "grad_norm": 7.690542697906494, + "learning_rate": 4.883562622512059e-06, + "loss": 0.522, + "mean_token_accuracy": 0.8420386493206025, + "num_tokens": 104088267.0, + "step": 86570 + }, + { + "entropy": 1.8858140379190445, + "epoch": 0.2683904246802634, + "grad_norm": 8.237641334533691, + "learning_rate": 4.883280585086974e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8476707622408867, + "num_tokens": 104099930.0, + "step": 86580 + }, + { + "entropy": 1.858066162467003, + "epoch": 0.2684214238053131, + "grad_norm": 8.798260688781738, + "learning_rate": 4.882998596521253e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8484656035900116, + "num_tokens": 104111901.0, + "step": 86590 + }, + { + "entropy": 1.92180934548378, + "epoch": 0.2684524229303628, + "grad_norm": 9.742270469665527, + "learning_rate": 4.882716656800792e-06, + "loss": 0.5152, + "mean_token_accuracy": 0.8452163085341453, + "num_tokens": 104122977.0, + "step": 86600 + }, + { + "entropy": 1.867686577141285, + "epoch": 0.2684834220554125, + "grad_norm": 10.891302108764648, + "learning_rate": 4.882434765911489e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8573295056819916, + "num_tokens": 104135141.0, + "step": 86610 + }, + { + "entropy": 1.9070853009819984, + "epoch": 0.2685144211804622, + "grad_norm": 6.908443450927734, + "learning_rate": 4.882152923839252e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.8457126155495643, + "num_tokens": 104146361.0, + "step": 86620 + }, + { + "entropy": 1.9306767612695694, + "epoch": 0.2685454203055119, + "grad_norm": 8.809544563293457, + "learning_rate": 4.881871130569993e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.831186157464981, + "num_tokens": 104158055.0, + "step": 86630 + }, + { + "entropy": 1.7939895704388618, + "epoch": 0.2685764194305616, + "grad_norm": 6.0357184410095215, + "learning_rate": 4.8815893860896265e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8565925478935241, + "num_tokens": 104171215.0, + "step": 86640 + }, + { + "entropy": 1.8932900041341783, + "epoch": 0.26860741855561127, + "grad_norm": 9.490089416503906, + "learning_rate": 4.881307690384079e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.851920661330223, + "num_tokens": 104183346.0, + "step": 86650 + }, + { + "entropy": 1.7981626734137535, + "epoch": 0.268638417680661, + "grad_norm": 4.065007209777832, + "learning_rate": 4.881026043439277e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.850606782734394, + "num_tokens": 104197135.0, + "step": 86660 + }, + { + "entropy": 1.9110074192285538, + "epoch": 0.26866941680571066, + "grad_norm": 8.759778022766113, + "learning_rate": 4.880744445241155e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8564579889178277, + "num_tokens": 104207822.0, + "step": 86670 + }, + { + "entropy": 1.980569313466549, + "epoch": 0.26870041593076033, + "grad_norm": 8.87386703491211, + "learning_rate": 4.880462895775654e-06, + "loss": 0.5313, + "mean_token_accuracy": 0.8343950539827347, + "num_tokens": 104219081.0, + "step": 86680 + }, + { + "entropy": 1.8610002383589745, + "epoch": 0.26873141505581005, + "grad_norm": 8.677059173583984, + "learning_rate": 4.880181395028719e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8492666438221932, + "num_tokens": 104231630.0, + "step": 86690 + }, + { + "entropy": 1.899726065993309, + "epoch": 0.2687624141808597, + "grad_norm": 8.745160102844238, + "learning_rate": 4.879899942986303e-06, + "loss": 0.5213, + "mean_token_accuracy": 0.8413066044449806, + "num_tokens": 104242889.0, + "step": 86700 + }, + { + "entropy": 1.8926833271980286, + "epoch": 0.26879341330590945, + "grad_norm": 3.9941442012786865, + "learning_rate": 4.87961853963436e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8299556851387024, + "num_tokens": 104254389.0, + "step": 86710 + }, + { + "entropy": 1.875252665579319, + "epoch": 0.2688244124309591, + "grad_norm": 8.266586303710938, + "learning_rate": 4.879337184958854e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.8451120510697365, + "num_tokens": 104266592.0, + "step": 86720 + }, + { + "entropy": 1.8953094065189362, + "epoch": 0.26885541155600884, + "grad_norm": 8.299905776977539, + "learning_rate": 4.8790558789457545e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8386603578925133, + "num_tokens": 104278272.0, + "step": 86730 + }, + { + "entropy": 1.8657784268260003, + "epoch": 0.2688864106810585, + "grad_norm": 8.039691925048828, + "learning_rate": 4.878774621581035e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8498806357383728, + "num_tokens": 104290052.0, + "step": 86740 + }, + { + "entropy": 1.924768103659153, + "epoch": 0.26891740980610823, + "grad_norm": 8.882854461669922, + "learning_rate": 4.878493412850675e-06, + "loss": 0.5499, + "mean_token_accuracy": 0.8329790845513344, + "num_tokens": 104301303.0, + "step": 86750 + }, + { + "entropy": 1.8597337052226066, + "epoch": 0.2689484089311579, + "grad_norm": 8.194012641906738, + "learning_rate": 4.878212252740661e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8411440491676331, + "num_tokens": 104313725.0, + "step": 86760 + }, + { + "entropy": 1.955818921327591, + "epoch": 0.2689794080562076, + "grad_norm": 8.100213050842285, + "learning_rate": 4.877931141236982e-06, + "loss": 0.5445, + "mean_token_accuracy": 0.8334660112857819, + "num_tokens": 104325362.0, + "step": 86770 + }, + { + "entropy": 1.8957918226718902, + "epoch": 0.2690104071812573, + "grad_norm": 5.112435340881348, + "learning_rate": 4.877650078325635e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8381680518388748, + "num_tokens": 104337583.0, + "step": 86780 + }, + { + "entropy": 1.9326987490057945, + "epoch": 0.269041406306307, + "grad_norm": 8.216394424438477, + "learning_rate": 4.8773690639926246e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8478187531232834, + "num_tokens": 104349160.0, + "step": 86790 + }, + { + "entropy": 1.8107408866286279, + "epoch": 0.2690724054313567, + "grad_norm": 8.589932441711426, + "learning_rate": 4.8770880982239565e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8569507896900177, + "num_tokens": 104362032.0, + "step": 86800 + }, + { + "entropy": 1.9010547578334809, + "epoch": 0.2691034045564064, + "grad_norm": 7.747185707092285, + "learning_rate": 4.876807181005645e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8419987246394157, + "num_tokens": 104374183.0, + "step": 86810 + }, + { + "entropy": 1.9327505372464657, + "epoch": 0.2691344036814561, + "grad_norm": 7.296648025512695, + "learning_rate": 4.87652631232371e-06, + "loss": 0.5457, + "mean_token_accuracy": 0.8373480141162872, + "num_tokens": 104385704.0, + "step": 86820 + }, + { + "entropy": 1.9187294945120812, + "epoch": 0.2691654028065058, + "grad_norm": 9.021369934082031, + "learning_rate": 4.876245492164175e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.8350897148251534, + "num_tokens": 104397570.0, + "step": 86830 + }, + { + "entropy": 1.8674516141414643, + "epoch": 0.2691964019315555, + "grad_norm": 8.63692855834961, + "learning_rate": 4.875964720513072e-06, + "loss": 0.5416, + "mean_token_accuracy": 0.8315334841609001, + "num_tokens": 104409275.0, + "step": 86840 + }, + { + "entropy": 1.8427454948425293, + "epoch": 0.2692274010566052, + "grad_norm": 3.408676862716675, + "learning_rate": 4.875683997356437e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.85171507447958, + "num_tokens": 104421885.0, + "step": 86850 + }, + { + "entropy": 1.9858239054679871, + "epoch": 0.26925840018165487, + "grad_norm": 8.626346588134766, + "learning_rate": 4.87540332268031e-06, + "loss": 0.6016, + "mean_token_accuracy": 0.8261340126395226, + "num_tokens": 104433208.0, + "step": 86860 + }, + { + "entropy": 1.8474789157509803, + "epoch": 0.2692893993067046, + "grad_norm": 3.6940155029296875, + "learning_rate": 4.87512269647074e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8468289896845818, + "num_tokens": 104445950.0, + "step": 86870 + }, + { + "entropy": 1.7919086948037148, + "epoch": 0.26932039843175426, + "grad_norm": 7.589000225067139, + "learning_rate": 4.8748421187137786e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8585388869047165, + "num_tokens": 104459408.0, + "step": 86880 + }, + { + "entropy": 1.817798225581646, + "epoch": 0.269351397556804, + "grad_norm": 2.4566121101379395, + "learning_rate": 4.8745615893954875e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.856153316795826, + "num_tokens": 104472083.0, + "step": 86890 + }, + { + "entropy": 1.829006166756153, + "epoch": 0.26938239668185365, + "grad_norm": 3.47737455368042, + "learning_rate": 4.8742811085019294e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8479893788695335, + "num_tokens": 104485313.0, + "step": 86900 + }, + { + "entropy": 1.824818679690361, + "epoch": 0.2694133958069034, + "grad_norm": 10.421551704406738, + "learning_rate": 4.8740006760191715e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8599545955657959, + "num_tokens": 104498290.0, + "step": 86910 + }, + { + "entropy": 1.8766320884227752, + "epoch": 0.26944439493195305, + "grad_norm": 8.567687034606934, + "learning_rate": 4.873720291933294e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8494315207004547, + "num_tokens": 104510307.0, + "step": 86920 + }, + { + "entropy": 1.8442302539944648, + "epoch": 0.2694753940570027, + "grad_norm": 3.7030580043792725, + "learning_rate": 4.873439956230375e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.848169319331646, + "num_tokens": 104522429.0, + "step": 86930 + }, + { + "entropy": 1.890008282661438, + "epoch": 0.26950639318205244, + "grad_norm": 8.97164249420166, + "learning_rate": 4.873159668896501e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.8344573676586151, + "num_tokens": 104534058.0, + "step": 86940 + }, + { + "entropy": 1.818005445599556, + "epoch": 0.2695373923071021, + "grad_norm": 8.09427547454834, + "learning_rate": 4.8728794299177655e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8522487103939056, + "num_tokens": 104546415.0, + "step": 86950 + }, + { + "entropy": 1.838931292295456, + "epoch": 0.26956839143215183, + "grad_norm": 7.319267272949219, + "learning_rate": 4.8725992392802655e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8448920026421547, + "num_tokens": 104558731.0, + "step": 86960 + }, + { + "entropy": 1.8411687076091767, + "epoch": 0.2695993905572015, + "grad_norm": 3.573763847351074, + "learning_rate": 4.872319096970106e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8461699530482292, + "num_tokens": 104571213.0, + "step": 86970 + }, + { + "entropy": 1.8673091575503349, + "epoch": 0.2696303896822512, + "grad_norm": 4.311919689178467, + "learning_rate": 4.872039002973394e-06, + "loss": 0.5261, + "mean_token_accuracy": 0.8393310904502869, + "num_tokens": 104583049.0, + "step": 86980 + }, + { + "entropy": 1.8796320587396622, + "epoch": 0.2696613888073009, + "grad_norm": 9.332188606262207, + "learning_rate": 4.871758957276246e-06, + "loss": 0.5274, + "mean_token_accuracy": 0.8384488835930825, + "num_tokens": 104595379.0, + "step": 86990 + }, + { + "entropy": 1.8900238052010536, + "epoch": 0.2696923879323506, + "grad_norm": 7.770622253417969, + "learning_rate": 4.871478959864781e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8497345179319382, + "num_tokens": 104606394.0, + "step": 87000 + }, + { + "entropy": 1.839431057870388, + "epoch": 0.2697233870574003, + "grad_norm": 8.380780220031738, + "learning_rate": 4.871199010725126e-06, + "loss": 0.5151, + "mean_token_accuracy": 0.8396735802292824, + "num_tokens": 104619650.0, + "step": 87010 + }, + { + "entropy": 1.827296996116638, + "epoch": 0.26975438618245, + "grad_norm": 8.714934349060059, + "learning_rate": 4.870919109843412e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8432153165340424, + "num_tokens": 104632127.0, + "step": 87020 + }, + { + "entropy": 1.8935890957713126, + "epoch": 0.2697853853074997, + "grad_norm": 9.051390647888184, + "learning_rate": 4.870639257205774e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.8443873509764671, + "num_tokens": 104643478.0, + "step": 87030 + }, + { + "entropy": 1.8204589426517486, + "epoch": 0.2698163844325494, + "grad_norm": 7.4907379150390625, + "learning_rate": 4.870359452798357e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8611741289496422, + "num_tokens": 104655632.0, + "step": 87040 + }, + { + "entropy": 1.8853262856602668, + "epoch": 0.2698473835575991, + "grad_norm": 8.072641372680664, + "learning_rate": 4.8700796966073084e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8454017639160156, + "num_tokens": 104667018.0, + "step": 87050 + }, + { + "entropy": 1.8324878126382829, + "epoch": 0.2698783826826488, + "grad_norm": 8.132426261901855, + "learning_rate": 4.869799988618784e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8415536895394325, + "num_tokens": 104679052.0, + "step": 87060 + }, + { + "entropy": 1.884521934390068, + "epoch": 0.26990938180769847, + "grad_norm": 9.089740753173828, + "learning_rate": 4.869520328818938e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8491548255085946, + "num_tokens": 104690472.0, + "step": 87070 + }, + { + "entropy": 1.7616024523973466, + "epoch": 0.2699403809327482, + "grad_norm": 5.119749546051025, + "learning_rate": 4.86924071719394e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8421557962894439, + "num_tokens": 104703935.0, + "step": 87080 + }, + { + "entropy": 1.8466105580329895, + "epoch": 0.26997138005779786, + "grad_norm": 6.891587734222412, + "learning_rate": 4.868961153729958e-06, + "loss": 0.5664, + "mean_token_accuracy": 0.8311922624707222, + "num_tokens": 104715762.0, + "step": 87090 + }, + { + "entropy": 1.830435362458229, + "epoch": 0.2700023791828476, + "grad_norm": 9.212553024291992, + "learning_rate": 4.86868163841317e-06, + "loss": 0.5407, + "mean_token_accuracy": 0.8380068346858025, + "num_tokens": 104727912.0, + "step": 87100 + }, + { + "entropy": 1.8718600660562514, + "epoch": 0.27003337830789725, + "grad_norm": 8.763904571533203, + "learning_rate": 4.8684021712297545e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8420531466603279, + "num_tokens": 104740144.0, + "step": 87110 + }, + { + "entropy": 1.8686859756708145, + "epoch": 0.270064377432947, + "grad_norm": 4.998740196228027, + "learning_rate": 4.868122752165901e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8439597055315972, + "num_tokens": 104751419.0, + "step": 87120 + }, + { + "entropy": 1.821141104400158, + "epoch": 0.27009537655799665, + "grad_norm": 8.166898727416992, + "learning_rate": 4.867843381207802e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8448257312178612, + "num_tokens": 104764014.0, + "step": 87130 + }, + { + "entropy": 1.8268358632922173, + "epoch": 0.27012637568304637, + "grad_norm": 9.196722984313965, + "learning_rate": 4.867564058341654e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8561849161982537, + "num_tokens": 104776029.0, + "step": 87140 + }, + { + "entropy": 1.8853852570056915, + "epoch": 0.27015737480809604, + "grad_norm": 7.830870151519775, + "learning_rate": 4.867284783553663e-06, + "loss": 0.5312, + "mean_token_accuracy": 0.8442418292164803, + "num_tokens": 104787139.0, + "step": 87150 + }, + { + "entropy": 1.7887479558587074, + "epoch": 0.27018837393314576, + "grad_norm": 8.097960472106934, + "learning_rate": 4.867005556830035e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8651418328285218, + "num_tokens": 104799482.0, + "step": 87160 + }, + { + "entropy": 1.8204817980527879, + "epoch": 0.27021937305819543, + "grad_norm": 8.041654586791992, + "learning_rate": 4.8667263781569875e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8555315598845482, + "num_tokens": 104811350.0, + "step": 87170 + }, + { + "entropy": 1.7814991921186447, + "epoch": 0.2702503721832451, + "grad_norm": 7.963624477386475, + "learning_rate": 4.86644724752074e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8552462846040726, + "num_tokens": 104824216.0, + "step": 87180 + }, + { + "entropy": 1.9131552398204803, + "epoch": 0.2702813713082948, + "grad_norm": 9.447894096374512, + "learning_rate": 4.86616816490752e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.8396668806672096, + "num_tokens": 104835560.0, + "step": 87190 + }, + { + "entropy": 1.824287013709545, + "epoch": 0.2703123704333445, + "grad_norm": 9.329700469970703, + "learning_rate": 4.865889130303556e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8538261905312539, + "num_tokens": 104847449.0, + "step": 87200 + }, + { + "entropy": 1.8483013391494751, + "epoch": 0.2703433695583942, + "grad_norm": 8.532864570617676, + "learning_rate": 4.865610143695086e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8494655951857567, + "num_tokens": 104859220.0, + "step": 87210 + }, + { + "entropy": 1.8401647135615349, + "epoch": 0.2703743686834439, + "grad_norm": 7.677708148956299, + "learning_rate": 4.8653312050683524e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.8437712222337723, + "num_tokens": 104870632.0, + "step": 87220 + }, + { + "entropy": 1.8975766867399215, + "epoch": 0.2704053678084936, + "grad_norm": 7.6379780769348145, + "learning_rate": 4.865052314409605e-06, + "loss": 0.5677, + "mean_token_accuracy": 0.8343770399689674, + "num_tokens": 104881139.0, + "step": 87230 + }, + { + "entropy": 1.842423902451992, + "epoch": 0.2704363669335433, + "grad_norm": 8.821908950805664, + "learning_rate": 4.864773471705094e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8355495795607567, + "num_tokens": 104893189.0, + "step": 87240 + }, + { + "entropy": 1.8596946865320205, + "epoch": 0.270467366058593, + "grad_norm": 8.274313926696777, + "learning_rate": 4.86449467694108e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8518381923437118, + "num_tokens": 104904842.0, + "step": 87250 + }, + { + "entropy": 1.7499984815716743, + "epoch": 0.2704983651836427, + "grad_norm": 8.799256324768066, + "learning_rate": 4.864215930103828e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8650006785988807, + "num_tokens": 104918602.0, + "step": 87260 + }, + { + "entropy": 1.8299191161990165, + "epoch": 0.2705293643086924, + "grad_norm": 8.756028175354004, + "learning_rate": 4.863937231179608e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8430431827902793, + "num_tokens": 104930274.0, + "step": 87270 + }, + { + "entropy": 1.8168650731444358, + "epoch": 0.27056036343374207, + "grad_norm": 8.896665573120117, + "learning_rate": 4.863658580154694e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8473124504089355, + "num_tokens": 104943125.0, + "step": 87280 + }, + { + "entropy": 1.8909821808338165, + "epoch": 0.2705913625587918, + "grad_norm": 11.013216018676758, + "learning_rate": 4.863379977015369e-06, + "loss": 0.5405, + "mean_token_accuracy": 0.8345570892095566, + "num_tokens": 104954178.0, + "step": 87290 + }, + { + "entropy": 1.8781416594982148, + "epoch": 0.27062236168384146, + "grad_norm": 8.503053665161133, + "learning_rate": 4.863101421747918e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.8471224218606949, + "num_tokens": 104965323.0, + "step": 87300 + }, + { + "entropy": 1.7377824038267136, + "epoch": 0.2706533608088912, + "grad_norm": 3.2067818641662598, + "learning_rate": 4.862822914338635e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8619665712118149, + "num_tokens": 104979529.0, + "step": 87310 + }, + { + "entropy": 1.8459437713027, + "epoch": 0.27068435993394085, + "grad_norm": 9.494819641113281, + "learning_rate": 4.862544454773815e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8475456729531288, + "num_tokens": 104991778.0, + "step": 87320 + }, + { + "entropy": 1.8079747796058654, + "epoch": 0.2707153590589906, + "grad_norm": 4.102128505706787, + "learning_rate": 4.86226604303976e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8489979475736618, + "num_tokens": 105004887.0, + "step": 87330 + }, + { + "entropy": 1.8566383227705956, + "epoch": 0.27074635818404025, + "grad_norm": 9.459664344787598, + "learning_rate": 4.8619876791227834e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8472496911883354, + "num_tokens": 105015787.0, + "step": 87340 + }, + { + "entropy": 1.8375443920493126, + "epoch": 0.27077735730908997, + "grad_norm": 8.758172988891602, + "learning_rate": 4.861709363009195e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8570204228162766, + "num_tokens": 105027235.0, + "step": 87350 + }, + { + "entropy": 1.8649962782859801, + "epoch": 0.27080835643413964, + "grad_norm": 9.725914001464844, + "learning_rate": 4.861431094685316e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.8399304628372193, + "num_tokens": 105038434.0, + "step": 87360 + }, + { + "entropy": 1.7663211345672607, + "epoch": 0.27083935555918937, + "grad_norm": 7.783099174499512, + "learning_rate": 4.86115287413747e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8482378587126732, + "num_tokens": 105050998.0, + "step": 87370 + }, + { + "entropy": 1.8368576869368554, + "epoch": 0.27087035468423903, + "grad_norm": 9.434002876281738, + "learning_rate": 4.8608747013519896e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8487260073423386, + "num_tokens": 105062505.0, + "step": 87380 + }, + { + "entropy": 1.7946744754910469, + "epoch": 0.27090135380928876, + "grad_norm": 8.645105361938477, + "learning_rate": 4.860596576315209e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8437353923916817, + "num_tokens": 105075561.0, + "step": 87390 + }, + { + "entropy": 1.8670944333076478, + "epoch": 0.2709323529343384, + "grad_norm": 8.151835441589355, + "learning_rate": 4.860318499013468e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.8455032199621201, + "num_tokens": 105087761.0, + "step": 87400 + }, + { + "entropy": 1.8563807129859924, + "epoch": 0.2709633520593881, + "grad_norm": 8.382278442382812, + "learning_rate": 4.860040469433119e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8437006428837777, + "num_tokens": 105099328.0, + "step": 87410 + }, + { + "entropy": 1.8392608642578125, + "epoch": 0.2709943511844378, + "grad_norm": 6.8433098793029785, + "learning_rate": 4.8597624875605076e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8475878596305847, + "num_tokens": 105111492.0, + "step": 87420 + }, + { + "entropy": 1.896904969215393, + "epoch": 0.2710253503094875, + "grad_norm": 9.722173690795898, + "learning_rate": 4.859484553381996e-06, + "loss": 0.5157, + "mean_token_accuracy": 0.8329481184482574, + "num_tokens": 105123552.0, + "step": 87430 + }, + { + "entropy": 1.8754973500967025, + "epoch": 0.2710563494345372, + "grad_norm": 3.6622695922851562, + "learning_rate": 4.859206666883946e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8413586497306824, + "num_tokens": 105135820.0, + "step": 87440 + }, + { + "entropy": 1.8094146370887756, + "epoch": 0.2710873485595869, + "grad_norm": 9.525456428527832, + "learning_rate": 4.858928828052725e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8486891463398933, + "num_tokens": 105147391.0, + "step": 87450 + }, + { + "entropy": 1.9204721599817276, + "epoch": 0.2711183476846366, + "grad_norm": 8.702652931213379, + "learning_rate": 4.858651036874711e-06, + "loss": 0.5346, + "mean_token_accuracy": 0.837224043905735, + "num_tokens": 105159245.0, + "step": 87460 + }, + { + "entropy": 1.9499018788337708, + "epoch": 0.2711493468096863, + "grad_norm": 9.13373851776123, + "learning_rate": 4.858373293336278e-06, + "loss": 0.5221, + "mean_token_accuracy": 0.8495534911751748, + "num_tokens": 105169906.0, + "step": 87470 + }, + { + "entropy": 1.865763219445944, + "epoch": 0.271180345934736, + "grad_norm": 8.000102043151855, + "learning_rate": 4.858095597423816e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8466974958777428, + "num_tokens": 105182696.0, + "step": 87480 + }, + { + "entropy": 1.8530535832047463, + "epoch": 0.27121134505978567, + "grad_norm": 9.345272064208984, + "learning_rate": 4.8578179491237135e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.8383201539516449, + "num_tokens": 105194697.0, + "step": 87490 + }, + { + "entropy": 1.9272264629602431, + "epoch": 0.2712423441848354, + "grad_norm": 8.332130432128906, + "learning_rate": 4.857540348422365e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8568297758698463, + "num_tokens": 105205400.0, + "step": 87500 + }, + { + "entropy": 1.8942699432373047, + "epoch": 0.27127334330988506, + "grad_norm": 8.65912914276123, + "learning_rate": 4.857262795306176e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8520406141877175, + "num_tokens": 105217304.0, + "step": 87510 + }, + { + "entropy": 1.7813436336815358, + "epoch": 0.2713043424349348, + "grad_norm": 4.422085762023926, + "learning_rate": 4.8569852897615476e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8593872472643852, + "num_tokens": 105230048.0, + "step": 87520 + }, + { + "entropy": 1.9077577859163284, + "epoch": 0.27133534155998446, + "grad_norm": 8.263802528381348, + "learning_rate": 4.856707831774897e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8396870777010917, + "num_tokens": 105241231.0, + "step": 87530 + }, + { + "entropy": 1.8345076471567154, + "epoch": 0.2713663406850342, + "grad_norm": 8.313085556030273, + "learning_rate": 4.856430421332639e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8465406790375709, + "num_tokens": 105253356.0, + "step": 87540 + }, + { + "entropy": 1.8547306582331657, + "epoch": 0.27139733981008385, + "grad_norm": 7.723386287689209, + "learning_rate": 4.856153058421199e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8484765499830246, + "num_tokens": 105265696.0, + "step": 87550 + }, + { + "entropy": 1.8216210514307023, + "epoch": 0.2714283389351336, + "grad_norm": 7.835054874420166, + "learning_rate": 4.855875743027003e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8526194721460343, + "num_tokens": 105277875.0, + "step": 87560 + }, + { + "entropy": 1.8399008169770241, + "epoch": 0.27145933806018324, + "grad_norm": 7.611944198608398, + "learning_rate": 4.855598475136486e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8431843638420105, + "num_tokens": 105290058.0, + "step": 87570 + }, + { + "entropy": 1.8355013683438302, + "epoch": 0.27149033718523297, + "grad_norm": 8.615690231323242, + "learning_rate": 4.855321254736087e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8420334681868553, + "num_tokens": 105302569.0, + "step": 87580 + }, + { + "entropy": 1.8711734786629677, + "epoch": 0.27152133631028263, + "grad_norm": 8.991096496582031, + "learning_rate": 4.855044081812253e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.8353176578879357, + "num_tokens": 105314367.0, + "step": 87590 + }, + { + "entropy": 1.8946691662073136, + "epoch": 0.27155233543533236, + "grad_norm": 7.5710577964782715, + "learning_rate": 4.854766956351432e-06, + "loss": 0.5232, + "mean_token_accuracy": 0.8443698287010193, + "num_tokens": 105325617.0, + "step": 87600 + }, + { + "entropy": 1.8058865994215012, + "epoch": 0.27158333456038203, + "grad_norm": 2.2330517768859863, + "learning_rate": 4.854489878340079e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8525756880640983, + "num_tokens": 105338254.0, + "step": 87610 + }, + { + "entropy": 1.9165008813142776, + "epoch": 0.27161433368543175, + "grad_norm": 8.385638236999512, + "learning_rate": 4.854212847764657e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8384706929326058, + "num_tokens": 105349177.0, + "step": 87620 + }, + { + "entropy": 1.906524208188057, + "epoch": 0.2716453328104814, + "grad_norm": 8.768911361694336, + "learning_rate": 4.853935864611632e-06, + "loss": 0.5187, + "mean_token_accuracy": 0.8458487808704376, + "num_tokens": 105360116.0, + "step": 87630 + }, + { + "entropy": 1.8195328041911125, + "epoch": 0.27167633193553115, + "grad_norm": 4.211945056915283, + "learning_rate": 4.853658928867475e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8428912833333015, + "num_tokens": 105372704.0, + "step": 87640 + }, + { + "entropy": 1.8345939561724662, + "epoch": 0.2717073310605808, + "grad_norm": 10.063714981079102, + "learning_rate": 4.853382040518665e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8555810198187828, + "num_tokens": 105384756.0, + "step": 87650 + }, + { + "entropy": 1.8712797194719315, + "epoch": 0.2717383301856305, + "grad_norm": 7.05558967590332, + "learning_rate": 4.853105199551681e-06, + "loss": 0.5203, + "mean_token_accuracy": 0.8413851290941239, + "num_tokens": 105396711.0, + "step": 87660 + }, + { + "entropy": 1.8855430006980896, + "epoch": 0.2717693293106802, + "grad_norm": 9.500574111938477, + "learning_rate": 4.8528284059530145e-06, + "loss": 0.507, + "mean_token_accuracy": 0.841337351500988, + "num_tokens": 105407862.0, + "step": 87670 + }, + { + "entropy": 1.7173856884241103, + "epoch": 0.2718003284357299, + "grad_norm": 4.654793739318848, + "learning_rate": 4.852551659709158e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8598586186766625, + "num_tokens": 105422391.0, + "step": 87680 + }, + { + "entropy": 1.9247530341148376, + "epoch": 0.2718313275607796, + "grad_norm": 9.287614822387695, + "learning_rate": 4.85227496080661e-06, + "loss": 0.5294, + "mean_token_accuracy": 0.8380204871296882, + "num_tokens": 105433845.0, + "step": 87690 + }, + { + "entropy": 1.9517512962222099, + "epoch": 0.27186232668582927, + "grad_norm": 6.864774227142334, + "learning_rate": 4.851998309231874e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.8360002964735032, + "num_tokens": 105445129.0, + "step": 87700 + }, + { + "entropy": 1.9424969971179962, + "epoch": 0.271893325810879, + "grad_norm": 8.023564338684082, + "learning_rate": 4.8517217049714625e-06, + "loss": 0.5175, + "mean_token_accuracy": 0.8406699001789093, + "num_tokens": 105457210.0, + "step": 87710 + }, + { + "entropy": 1.8285282671451568, + "epoch": 0.27192432493592866, + "grad_norm": 9.891963958740234, + "learning_rate": 4.851445148011887e-06, + "loss": 0.454, + "mean_token_accuracy": 0.853777602314949, + "num_tokens": 105469532.0, + "step": 87720 + }, + { + "entropy": 1.825080545246601, + "epoch": 0.2719553240609784, + "grad_norm": 6.351779460906982, + "learning_rate": 4.8511686383396706e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.858852119743824, + "num_tokens": 105482237.0, + "step": 87730 + }, + { + "entropy": 1.9200027763843537, + "epoch": 0.27198632318602806, + "grad_norm": 8.065184593200684, + "learning_rate": 4.850892175941337e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.8401974439620972, + "num_tokens": 105493906.0, + "step": 87740 + }, + { + "entropy": 1.8792470768094063, + "epoch": 0.2720173223110778, + "grad_norm": 8.423270225524902, + "learning_rate": 4.8506157608034186e-06, + "loss": 0.534, + "mean_token_accuracy": 0.8373197540640831, + "num_tokens": 105505414.0, + "step": 87750 + }, + { + "entropy": 1.900409395992756, + "epoch": 0.27204832143612745, + "grad_norm": 7.148952007293701, + "learning_rate": 4.850339392912451e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8442846789956093, + "num_tokens": 105516469.0, + "step": 87760 + }, + { + "entropy": 1.8958803132176398, + "epoch": 0.2720793205611772, + "grad_norm": 8.72636604309082, + "learning_rate": 4.850063072254976e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8429311379790306, + "num_tokens": 105528375.0, + "step": 87770 + }, + { + "entropy": 1.8233585372567176, + "epoch": 0.27211031968622684, + "grad_norm": 8.03514289855957, + "learning_rate": 4.849786798817542e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8540633916854858, + "num_tokens": 105540482.0, + "step": 87780 + }, + { + "entropy": 1.8922078132629394, + "epoch": 0.27214131881127657, + "grad_norm": 8.679289817810059, + "learning_rate": 4.8495105725867e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8497767493128776, + "num_tokens": 105551735.0, + "step": 87790 + }, + { + "entropy": 1.809168304502964, + "epoch": 0.27217231793632624, + "grad_norm": 9.962617874145508, + "learning_rate": 4.84923439354901e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8472581446170807, + "num_tokens": 105564314.0, + "step": 87800 + }, + { + "entropy": 1.843416164815426, + "epoch": 0.27220331706137596, + "grad_norm": 7.68436861038208, + "learning_rate": 4.848958261691033e-06, + "loss": 0.5533, + "mean_token_accuracy": 0.8343981161713601, + "num_tokens": 105576827.0, + "step": 87810 + }, + { + "entropy": 1.8259743750095367, + "epoch": 0.27223431618642563, + "grad_norm": 7.573136329650879, + "learning_rate": 4.84868217699934e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.852935828268528, + "num_tokens": 105589417.0, + "step": 87820 + }, + { + "entropy": 1.8123556807637216, + "epoch": 0.27226531531147535, + "grad_norm": 3.7434639930725098, + "learning_rate": 4.848406139460503e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8481271311640739, + "num_tokens": 105602368.0, + "step": 87830 + }, + { + "entropy": 1.8485308945178986, + "epoch": 0.272296314436525, + "grad_norm": 7.756927490234375, + "learning_rate": 4.848130149061103e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8554665118455886, + "num_tokens": 105613811.0, + "step": 87840 + }, + { + "entropy": 1.8490839540958404, + "epoch": 0.27232731356157475, + "grad_norm": 6.909817695617676, + "learning_rate": 4.847854205787724e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8522930264472961, + "num_tokens": 105625880.0, + "step": 87850 + }, + { + "entropy": 1.882150113582611, + "epoch": 0.2723583126866244, + "grad_norm": 9.245566368103027, + "learning_rate": 4.847578309626954e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8421803861856461, + "num_tokens": 105638135.0, + "step": 87860 + }, + { + "entropy": 1.8645110860466958, + "epoch": 0.27238931181167414, + "grad_norm": 8.556130409240723, + "learning_rate": 4.847302460565392e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8380250200629235, + "num_tokens": 105649997.0, + "step": 87870 + }, + { + "entropy": 1.9121182456612587, + "epoch": 0.2724203109367238, + "grad_norm": 6.9276862144470215, + "learning_rate": 4.847026658589637e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8434757009148598, + "num_tokens": 105661149.0, + "step": 87880 + }, + { + "entropy": 1.8295169189572333, + "epoch": 0.27245131006177353, + "grad_norm": 9.222490310668945, + "learning_rate": 4.846750903686295e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8513430684804917, + "num_tokens": 105673234.0, + "step": 87890 + }, + { + "entropy": 1.9055490285158156, + "epoch": 0.2724823091868232, + "grad_norm": 8.43136215209961, + "learning_rate": 4.846475195841978e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.849004752933979, + "num_tokens": 105684180.0, + "step": 87900 + }, + { + "entropy": 1.7300630405545234, + "epoch": 0.27251330831187287, + "grad_norm": 4.451493740081787, + "learning_rate": 4.846199535043302e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8522528484463692, + "num_tokens": 105698528.0, + "step": 87910 + }, + { + "entropy": 1.855320343375206, + "epoch": 0.2725443074369226, + "grad_norm": 7.741222381591797, + "learning_rate": 4.845923921276889e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.843564510345459, + "num_tokens": 105710686.0, + "step": 87920 + }, + { + "entropy": 1.8979840464890003, + "epoch": 0.27257530656197226, + "grad_norm": Infinity, + "learning_rate": 4.845648354529367e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8410883545875549, + "num_tokens": 105722636.0, + "step": 87930 + }, + { + "entropy": 1.9001470863819123, + "epoch": 0.272606305687022, + "grad_norm": 7.072505474090576, + "learning_rate": 4.845372834787369e-06, + "loss": 0.5521, + "mean_token_accuracy": 0.8388607785105705, + "num_tokens": 105733839.0, + "step": 87940 + }, + { + "entropy": 1.8749860867857933, + "epoch": 0.27263730481207166, + "grad_norm": 9.139892578125, + "learning_rate": 4.845097362037533e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8396959900856018, + "num_tokens": 105746121.0, + "step": 87950 + }, + { + "entropy": 1.8686931714415551, + "epoch": 0.2726683039371214, + "grad_norm": 9.034399032592773, + "learning_rate": 4.844821936266501e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8411045849323273, + "num_tokens": 105758031.0, + "step": 87960 + }, + { + "entropy": 1.9092384189367295, + "epoch": 0.27269930306217105, + "grad_norm": 4.113952159881592, + "learning_rate": 4.844546557460922e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8454341217875481, + "num_tokens": 105769453.0, + "step": 87970 + }, + { + "entropy": 1.8871904879808425, + "epoch": 0.2727303021872208, + "grad_norm": 7.443089962005615, + "learning_rate": 4.844271225607452e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8477641880512238, + "num_tokens": 105781468.0, + "step": 87980 + }, + { + "entropy": 1.887850184738636, + "epoch": 0.27276130131227044, + "grad_norm": 7.757199287414551, + "learning_rate": 4.843995940692748e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8484135344624519, + "num_tokens": 105793191.0, + "step": 87990 + }, + { + "entropy": 1.9092398703098297, + "epoch": 0.27279230043732017, + "grad_norm": 8.517298698425293, + "learning_rate": 4.843720702703475e-06, + "loss": 0.5326, + "mean_token_accuracy": 0.8369758918881416, + "num_tokens": 105804532.0, + "step": 88000 + }, + { + "entropy": 1.7902554288506507, + "epoch": 0.27282329956236984, + "grad_norm": 4.456912517547607, + "learning_rate": 4.843445511626304e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8474618881940842, + "num_tokens": 105817217.0, + "step": 88010 + }, + { + "entropy": 1.8608660578727723, + "epoch": 0.27285429868741956, + "grad_norm": 8.066084861755371, + "learning_rate": 4.843170367447909e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8490109965205193, + "num_tokens": 105828552.0, + "step": 88020 + }, + { + "entropy": 1.89578920006752, + "epoch": 0.27288529781246923, + "grad_norm": 6.772432327270508, + "learning_rate": 4.842895270154972e-06, + "loss": 0.5289, + "mean_token_accuracy": 0.8335303783416748, + "num_tokens": 105840037.0, + "step": 88030 + }, + { + "entropy": 1.7588407546281815, + "epoch": 0.27291629693751895, + "grad_norm": 1.8999171257019043, + "learning_rate": 4.842620219734178e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8491491839289665, + "num_tokens": 105854237.0, + "step": 88040 + }, + { + "entropy": 1.7782690718770027, + "epoch": 0.2729472960625686, + "grad_norm": 8.12587833404541, + "learning_rate": 4.842345216172217e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8528685718774796, + "num_tokens": 105866817.0, + "step": 88050 + }, + { + "entropy": 1.8298373103141785, + "epoch": 0.27297829518761835, + "grad_norm": 7.805505752563477, + "learning_rate": 4.8420702594557855e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8397832661867142, + "num_tokens": 105878717.0, + "step": 88060 + }, + { + "entropy": 1.8683532044291495, + "epoch": 0.273009294312668, + "grad_norm": 3.9877965450286865, + "learning_rate": 4.841795349571587e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8447675094008446, + "num_tokens": 105890128.0, + "step": 88070 + }, + { + "entropy": 1.8671092882752418, + "epoch": 0.27304029343771774, + "grad_norm": 8.055480003356934, + "learning_rate": 4.841520486506328e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8493238598108291, + "num_tokens": 105901651.0, + "step": 88080 + }, + { + "entropy": 1.9300762385129928, + "epoch": 0.2730712925627674, + "grad_norm": 8.968958854675293, + "learning_rate": 4.841245670246719e-06, + "loss": 0.5404, + "mean_token_accuracy": 0.8325337320566177, + "num_tokens": 105912601.0, + "step": 88090 + }, + { + "entropy": 1.7842349156737327, + "epoch": 0.27310229168781713, + "grad_norm": 2.736703634262085, + "learning_rate": 4.840970900779478e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8453866109251976, + "num_tokens": 105926027.0, + "step": 88100 + }, + { + "entropy": 1.8227722927927972, + "epoch": 0.2731332908128668, + "grad_norm": 7.752776622772217, + "learning_rate": 4.840696178091329e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8525062575936317, + "num_tokens": 105938989.0, + "step": 88110 + }, + { + "entropy": 1.85889712870121, + "epoch": 0.2731642899379165, + "grad_norm": 7.539937973022461, + "learning_rate": 4.840421502169e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8374249458312988, + "num_tokens": 105950886.0, + "step": 88120 + }, + { + "entropy": 1.846862156689167, + "epoch": 0.2731952890629662, + "grad_norm": 8.776389122009277, + "learning_rate": 4.840146872999224e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8551399633288383, + "num_tokens": 105962835.0, + "step": 88130 + }, + { + "entropy": 1.7878411993384362, + "epoch": 0.2732262881880159, + "grad_norm": 7.770561695098877, + "learning_rate": 4.839872290568737e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8593245849013329, + "num_tokens": 105975339.0, + "step": 88140 + }, + { + "entropy": 1.8643562525510788, + "epoch": 0.2732572873130656, + "grad_norm": 12.23235034942627, + "learning_rate": 4.839597754864288e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8453767746686935, + "num_tokens": 105987199.0, + "step": 88150 + }, + { + "entropy": 1.887287637591362, + "epoch": 0.27328828643811526, + "grad_norm": 7.948936462402344, + "learning_rate": 4.839323265872622e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8448623090982437, + "num_tokens": 105998566.0, + "step": 88160 + }, + { + "entropy": 1.831418040394783, + "epoch": 0.273319285563165, + "grad_norm": 9.216192245483398, + "learning_rate": 4.839048823580495e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8426416292786598, + "num_tokens": 106011036.0, + "step": 88170 + }, + { + "entropy": 1.8624485075473785, + "epoch": 0.27335028468821465, + "grad_norm": 8.75737476348877, + "learning_rate": 4.838774427974665e-06, + "loss": 0.5206, + "mean_token_accuracy": 0.837440450489521, + "num_tokens": 106022579.0, + "step": 88180 + }, + { + "entropy": 1.87282153069973, + "epoch": 0.2733812838132644, + "grad_norm": 3.783078670501709, + "learning_rate": 4.8385000790419005e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8478828489780426, + "num_tokens": 106033958.0, + "step": 88190 + }, + { + "entropy": 1.8366525799036026, + "epoch": 0.27341228293831404, + "grad_norm": 4.001052379608154, + "learning_rate": 4.8382257767689696e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8528647780418396, + "num_tokens": 106046043.0, + "step": 88200 + }, + { + "entropy": 1.9092376694083213, + "epoch": 0.27344328206336377, + "grad_norm": 8.00796127319336, + "learning_rate": 4.837951521142646e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.8405633881688118, + "num_tokens": 106057385.0, + "step": 88210 + }, + { + "entropy": 1.8548787474632262, + "epoch": 0.27347428118841344, + "grad_norm": 6.67507791519165, + "learning_rate": 4.837677312149712e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8439633920788765, + "num_tokens": 106069738.0, + "step": 88220 + }, + { + "entropy": 1.7936237141489983, + "epoch": 0.27350528031346316, + "grad_norm": 10.650125503540039, + "learning_rate": 4.837403149776953e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.8396651700139046, + "num_tokens": 106083549.0, + "step": 88230 + }, + { + "entropy": 1.8829040303826332, + "epoch": 0.27353627943851283, + "grad_norm": 8.387964248657227, + "learning_rate": 4.837129034011162e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8502602845430374, + "num_tokens": 106094686.0, + "step": 88240 + }, + { + "entropy": 1.8475438334047793, + "epoch": 0.27356727856356255, + "grad_norm": 4.180426597595215, + "learning_rate": 4.836854964839133e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8456549108028412, + "num_tokens": 106108422.0, + "step": 88250 + }, + { + "entropy": 1.9390867114067079, + "epoch": 0.2735982776886122, + "grad_norm": 7.525297164916992, + "learning_rate": 4.836580942247668e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8467183902859687, + "num_tokens": 106119386.0, + "step": 88260 + }, + { + "entropy": 1.8454492062330246, + "epoch": 0.27362927681366195, + "grad_norm": 7.91515588760376, + "learning_rate": 4.836306966223574e-06, + "loss": 0.5361, + "mean_token_accuracy": 0.84375009983778, + "num_tokens": 106131751.0, + "step": 88270 + }, + { + "entropy": 1.8392127811908723, + "epoch": 0.2736602759387116, + "grad_norm": 4.0122270584106445, + "learning_rate": 4.8360330367536644e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8450425997376442, + "num_tokens": 106144785.0, + "step": 88280 + }, + { + "entropy": 1.902709110081196, + "epoch": 0.27369127506376134, + "grad_norm": 9.007641792297363, + "learning_rate": 4.835759153824755e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8378954946994781, + "num_tokens": 106156151.0, + "step": 88290 + }, + { + "entropy": 1.8917675152420999, + "epoch": 0.273722274188811, + "grad_norm": 8.450139045715332, + "learning_rate": 4.835485317423669e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.8372221887111664, + "num_tokens": 106167862.0, + "step": 88300 + }, + { + "entropy": 1.7971489533782006, + "epoch": 0.27375327331386073, + "grad_norm": 8.281867980957031, + "learning_rate": 4.835211527537234e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8558110296726227, + "num_tokens": 106181012.0, + "step": 88310 + }, + { + "entropy": 1.9332615464925766, + "epoch": 0.2737842724389104, + "grad_norm": 7.937662124633789, + "learning_rate": 4.834937784152283e-06, + "loss": 0.5597, + "mean_token_accuracy": 0.8360219776630402, + "num_tokens": 106192536.0, + "step": 88320 + }, + { + "entropy": 1.8722572714090346, + "epoch": 0.2738152715639601, + "grad_norm": 9.418374061584473, + "learning_rate": 4.834664087255653e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8445107057690621, + "num_tokens": 106204886.0, + "step": 88330 + }, + { + "entropy": 1.8897190272808075, + "epoch": 0.2738462706890098, + "grad_norm": 7.282384872436523, + "learning_rate": 4.83439043683419e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8582799851894378, + "num_tokens": 106216856.0, + "step": 88340 + }, + { + "entropy": 1.8999308928847314, + "epoch": 0.2738772698140595, + "grad_norm": 9.030301094055176, + "learning_rate": 4.8341168328747395e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8379582807421684, + "num_tokens": 106228905.0, + "step": 88350 + }, + { + "entropy": 1.8722223863005638, + "epoch": 0.2739082689391092, + "grad_norm": 7.225384712219238, + "learning_rate": 4.833843275364157e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8510954797267913, + "num_tokens": 106240136.0, + "step": 88360 + }, + { + "entropy": 1.7975994154810906, + "epoch": 0.2739392680641589, + "grad_norm": 7.694894790649414, + "learning_rate": 4.833569764289303e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8697395831346512, + "num_tokens": 106253669.0, + "step": 88370 + }, + { + "entropy": 1.7122723177075385, + "epoch": 0.2739702671892086, + "grad_norm": 4.0345540046691895, + "learning_rate": 4.833296299637038e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8615933701395988, + "num_tokens": 106268120.0, + "step": 88380 + }, + { + "entropy": 1.7913119062781333, + "epoch": 0.2740012663142583, + "grad_norm": 3.695681095123291, + "learning_rate": 4.833022881394236e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8548486337065697, + "num_tokens": 106281871.0, + "step": 88390 + }, + { + "entropy": 1.8856089636683464, + "epoch": 0.274032265439308, + "grad_norm": 7.232779026031494, + "learning_rate": 4.832749509547768e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8429864302277565, + "num_tokens": 106293484.0, + "step": 88400 + }, + { + "entropy": 1.9002991870045662, + "epoch": 0.27406326456435764, + "grad_norm": 4.058182239532471, + "learning_rate": 4.832476184084515e-06, + "loss": 0.5214, + "mean_token_accuracy": 0.8347552672028542, + "num_tokens": 106305347.0, + "step": 88410 + }, + { + "entropy": 1.8636735692620277, + "epoch": 0.27409426368940737, + "grad_norm": 7.725419998168945, + "learning_rate": 4.832202904991362e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8470685452222824, + "num_tokens": 106317379.0, + "step": 88420 + }, + { + "entropy": 1.8224295750260353, + "epoch": 0.27412526281445704, + "grad_norm": 7.4432148933410645, + "learning_rate": 4.8319296722552e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8520729199051857, + "num_tokens": 106329618.0, + "step": 88430 + }, + { + "entropy": 1.9166969522833823, + "epoch": 0.27415626193950676, + "grad_norm": 8.651660919189453, + "learning_rate": 4.8316564858629236e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.843331104516983, + "num_tokens": 106341322.0, + "step": 88440 + }, + { + "entropy": 1.900239697098732, + "epoch": 0.27418726106455643, + "grad_norm": 9.139606475830078, + "learning_rate": 4.831383345801432e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8463894948363304, + "num_tokens": 106352704.0, + "step": 88450 + }, + { + "entropy": 1.8953612327575684, + "epoch": 0.27421826018960616, + "grad_norm": 7.59618616104126, + "learning_rate": 4.831110252057634e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8437128469347954, + "num_tokens": 106364147.0, + "step": 88460 + }, + { + "entropy": 1.8952149584889413, + "epoch": 0.2742492593146558, + "grad_norm": 7.970698356628418, + "learning_rate": 4.830837204618439e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.856081509590149, + "num_tokens": 106375733.0, + "step": 88470 + }, + { + "entropy": 1.8306088283658029, + "epoch": 0.27428025843970555, + "grad_norm": 8.874154090881348, + "learning_rate": 4.830564203470762e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8551815196871757, + "num_tokens": 106388345.0, + "step": 88480 + }, + { + "entropy": 1.8102877527475356, + "epoch": 0.2743112575647552, + "grad_norm": 5.486161231994629, + "learning_rate": 4.830291248601526e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8472759440541268, + "num_tokens": 106401837.0, + "step": 88490 + }, + { + "entropy": 1.9017505258321763, + "epoch": 0.27434225668980494, + "grad_norm": 4.231834888458252, + "learning_rate": 4.830018339997658e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8477098569273949, + "num_tokens": 106414196.0, + "step": 88500 + }, + { + "entropy": 1.763375386595726, + "epoch": 0.2743732558148546, + "grad_norm": 3.8429412841796875, + "learning_rate": 4.829745477646087e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8600920364260674, + "num_tokens": 106427418.0, + "step": 88510 + }, + { + "entropy": 1.8455632477998734, + "epoch": 0.27440425493990434, + "grad_norm": 9.410290718078613, + "learning_rate": 4.829472661533753e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8498677432537078, + "num_tokens": 106439679.0, + "step": 88520 + }, + { + "entropy": 1.9347635477781295, + "epoch": 0.274435254064954, + "grad_norm": 8.736660957336426, + "learning_rate": 4.829199891647595e-06, + "loss": 0.5427, + "mean_token_accuracy": 0.834366361796856, + "num_tokens": 106450622.0, + "step": 88530 + }, + { + "entropy": 1.7944782301783562, + "epoch": 0.27446625319000373, + "grad_norm": 3.962170362472534, + "learning_rate": 4.828927167974562e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.84948940128088, + "num_tokens": 106464014.0, + "step": 88540 + }, + { + "entropy": 1.911988915503025, + "epoch": 0.2744972523150534, + "grad_norm": 8.544610977172852, + "learning_rate": 4.828654490501605e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8315621390938759, + "num_tokens": 106475443.0, + "step": 88550 + }, + { + "entropy": 1.8577186211943626, + "epoch": 0.2745282514401031, + "grad_norm": 9.083486557006836, + "learning_rate": 4.828381859215683e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8416240692138672, + "num_tokens": 106487802.0, + "step": 88560 + }, + { + "entropy": 1.8315035477280617, + "epoch": 0.2745592505651528, + "grad_norm": 8.581526756286621, + "learning_rate": 4.828109274103759e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8431025877594948, + "num_tokens": 106501150.0, + "step": 88570 + }, + { + "entropy": 1.8865200936794282, + "epoch": 0.2745902496902025, + "grad_norm": 11.377765655517578, + "learning_rate": 4.8278367351527985e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8460037097334862, + "num_tokens": 106513566.0, + "step": 88580 + }, + { + "entropy": 1.7892251804471015, + "epoch": 0.2746212488152522, + "grad_norm": 6.965874671936035, + "learning_rate": 4.8275642423497745e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8588668003678321, + "num_tokens": 106526831.0, + "step": 88590 + }, + { + "entropy": 1.8974862158298493, + "epoch": 0.2746522479403019, + "grad_norm": 7.9539265632629395, + "learning_rate": 4.827291795681668e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8509656980633735, + "num_tokens": 106538001.0, + "step": 88600 + }, + { + "entropy": 1.8913881599903106, + "epoch": 0.2746832470653516, + "grad_norm": 9.2141752243042, + "learning_rate": 4.827019395135459e-06, + "loss": 0.5634, + "mean_token_accuracy": 0.8275657877326011, + "num_tokens": 106549809.0, + "step": 88610 + }, + { + "entropy": 1.7867820590734482, + "epoch": 0.2747142461904013, + "grad_norm": 7.718073844909668, + "learning_rate": 4.8267470406981375e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8527751803398133, + "num_tokens": 106563083.0, + "step": 88620 + }, + { + "entropy": 1.838895745575428, + "epoch": 0.27474524531545097, + "grad_norm": 8.962431907653809, + "learning_rate": 4.826474732356697e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8443746566772461, + "num_tokens": 106575815.0, + "step": 88630 + }, + { + "entropy": 1.9002109482884406, + "epoch": 0.2747762444405007, + "grad_norm": 7.583240509033203, + "learning_rate": 4.826202470098135e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8451993703842163, + "num_tokens": 106587141.0, + "step": 88640 + }, + { + "entropy": 1.7970835909247398, + "epoch": 0.27480724356555036, + "grad_norm": 3.407457113265991, + "learning_rate": 4.825930253909458e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8588675916194916, + "num_tokens": 106600173.0, + "step": 88650 + }, + { + "entropy": 1.8548505648970603, + "epoch": 0.27483824269060003, + "grad_norm": 7.602267742156982, + "learning_rate": 4.825658083777671e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.8400174841284752, + "num_tokens": 106611441.0, + "step": 88660 + }, + { + "entropy": 1.8951510965824128, + "epoch": 0.27486924181564976, + "grad_norm": 9.224246978759766, + "learning_rate": 4.82538595968979e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.8430482760071755, + "num_tokens": 106622973.0, + "step": 88670 + }, + { + "entropy": 1.8475442111492157, + "epoch": 0.2749002409406994, + "grad_norm": 7.481077671051025, + "learning_rate": 4.825113881632835e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8386109799146653, + "num_tokens": 106635332.0, + "step": 88680 + }, + { + "entropy": 1.8375971369445323, + "epoch": 0.27493124006574915, + "grad_norm": 6.685962200164795, + "learning_rate": 4.824841849593828e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.85401571393013, + "num_tokens": 106647486.0, + "step": 88690 + }, + { + "entropy": 1.8508135929703713, + "epoch": 0.2749622391907988, + "grad_norm": 9.494312286376953, + "learning_rate": 4.824569863559801e-06, + "loss": 0.5245, + "mean_token_accuracy": 0.8352826073765754, + "num_tokens": 106659227.0, + "step": 88700 + }, + { + "entropy": 1.8618362814188003, + "epoch": 0.27499323831584854, + "grad_norm": 8.583227157592773, + "learning_rate": 4.824297923517787e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8454991012811661, + "num_tokens": 106671337.0, + "step": 88710 + }, + { + "entropy": 1.8632221952080728, + "epoch": 0.2750242374408982, + "grad_norm": 4.273068428039551, + "learning_rate": 4.824026029454825e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8438972979784012, + "num_tokens": 106683266.0, + "step": 88720 + }, + { + "entropy": 1.8759027153253556, + "epoch": 0.27505523656594794, + "grad_norm": 6.9859538078308105, + "learning_rate": 4.823754181357961e-06, + "loss": 0.5663, + "mean_token_accuracy": 0.8387471958994865, + "num_tokens": 106695075.0, + "step": 88730 + }, + { + "entropy": 1.9251735389232636, + "epoch": 0.2750862356909976, + "grad_norm": 8.672323226928711, + "learning_rate": 4.823482379214244e-06, + "loss": 0.5499, + "mean_token_accuracy": 0.8304734826087952, + "num_tokens": 106706762.0, + "step": 88740 + }, + { + "entropy": 1.8503682538866997, + "epoch": 0.27511723481604733, + "grad_norm": 9.310576438903809, + "learning_rate": 4.8232106230107285e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8472747296094895, + "num_tokens": 106718287.0, + "step": 88750 + }, + { + "entropy": 1.9084057167172432, + "epoch": 0.275148233941097, + "grad_norm": 8.345101356506348, + "learning_rate": 4.822938912734476e-06, + "loss": 0.5446, + "mean_token_accuracy": 0.8372975274920463, + "num_tokens": 106729907.0, + "step": 88760 + }, + { + "entropy": 1.8601343676447868, + "epoch": 0.2751792330661467, + "grad_norm": 3.5940239429473877, + "learning_rate": 4.822667248372551e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8547569274902344, + "num_tokens": 106742034.0, + "step": 88770 + }, + { + "entropy": 1.9169348627328873, + "epoch": 0.2752102321911964, + "grad_norm": 8.337597846984863, + "learning_rate": 4.822395629912025e-06, + "loss": 0.5174, + "mean_token_accuracy": 0.8388430058956147, + "num_tokens": 106753068.0, + "step": 88780 + }, + { + "entropy": 1.872619953751564, + "epoch": 0.2752412313162461, + "grad_norm": 10.032551765441895, + "learning_rate": 4.8221240573399705e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8413626179099083, + "num_tokens": 106764665.0, + "step": 88790 + }, + { + "entropy": 1.8718515574932098, + "epoch": 0.2752722304412958, + "grad_norm": 4.433212757110596, + "learning_rate": 4.82185253064347e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8516853898763657, + "num_tokens": 106776221.0, + "step": 88800 + }, + { + "entropy": 1.8270509555935859, + "epoch": 0.2753032295663455, + "grad_norm": 8.920626640319824, + "learning_rate": 4.821581049809608e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8488035008311272, + "num_tokens": 106789268.0, + "step": 88810 + }, + { + "entropy": 1.8341209158301353, + "epoch": 0.2753342286913952, + "grad_norm": 9.364352226257324, + "learning_rate": 4.821309614825477e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8486865177750588, + "num_tokens": 106800990.0, + "step": 88820 + }, + { + "entropy": 1.7747258245944977, + "epoch": 0.2753652278164449, + "grad_norm": 3.747847557067871, + "learning_rate": 4.82103822567817e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8474699661135674, + "num_tokens": 106814661.0, + "step": 88830 + }, + { + "entropy": 1.8344976738095284, + "epoch": 0.27539622694149457, + "grad_norm": 3.948411226272583, + "learning_rate": 4.8207668823547895e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8421635374426841, + "num_tokens": 106827568.0, + "step": 88840 + }, + { + "entropy": 1.873189702630043, + "epoch": 0.2754272260665443, + "grad_norm": 7.882256507873535, + "learning_rate": 4.82049558484244e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8454801499843597, + "num_tokens": 106839436.0, + "step": 88850 + }, + { + "entropy": 1.8929407209157945, + "epoch": 0.27545822519159396, + "grad_norm": 6.915173530578613, + "learning_rate": 4.820224333128236e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8434187933802605, + "num_tokens": 106850474.0, + "step": 88860 + }, + { + "entropy": 1.8293378427624702, + "epoch": 0.2754892243166437, + "grad_norm": 4.267940521240234, + "learning_rate": 4.819953127199289e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8467979103326797, + "num_tokens": 106862337.0, + "step": 88870 + }, + { + "entropy": 1.8841822102665902, + "epoch": 0.27552022344169336, + "grad_norm": 7.245059013366699, + "learning_rate": 4.819681967042724e-06, + "loss": 0.513, + "mean_token_accuracy": 0.8383611172437668, + "num_tokens": 106874237.0, + "step": 88880 + }, + { + "entropy": 1.8861842527985573, + "epoch": 0.275551222566743, + "grad_norm": 7.553530693054199, + "learning_rate": 4.819410852645663e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8369044825434685, + "num_tokens": 106886243.0, + "step": 88890 + }, + { + "entropy": 1.8612857922911643, + "epoch": 0.27558222169179275, + "grad_norm": 5.01759147644043, + "learning_rate": 4.81913978399524e-06, + "loss": 0.523, + "mean_token_accuracy": 0.8347546473145485, + "num_tokens": 106898922.0, + "step": 88900 + }, + { + "entropy": 1.789707398414612, + "epoch": 0.2756132208168424, + "grad_norm": 8.906004905700684, + "learning_rate": 4.818868761078591e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8516422912478447, + "num_tokens": 106911823.0, + "step": 88910 + }, + { + "entropy": 1.8937145173549652, + "epoch": 0.27564421994189214, + "grad_norm": 7.709479331970215, + "learning_rate": 4.818597783882858e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8450556769967079, + "num_tokens": 106923627.0, + "step": 88920 + }, + { + "entropy": 1.8715748369693757, + "epoch": 0.2756752190669418, + "grad_norm": 8.210858345031738, + "learning_rate": 4.818326852395186e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.843447645008564, + "num_tokens": 106935011.0, + "step": 88930 + }, + { + "entropy": 1.8829000025987626, + "epoch": 0.27570621819199154, + "grad_norm": 9.785741806030273, + "learning_rate": 4.818055966602728e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8352604553103447, + "num_tokens": 106947124.0, + "step": 88940 + }, + { + "entropy": 1.9164473339915276, + "epoch": 0.2757372173170412, + "grad_norm": 8.271210670471191, + "learning_rate": 4.817785126492638e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.8394878327846527, + "num_tokens": 106957935.0, + "step": 88950 + }, + { + "entropy": 1.890079266577959, + "epoch": 0.27576821644209093, + "grad_norm": 8.78956127166748, + "learning_rate": 4.817514332052081e-06, + "loss": 0.5374, + "mean_token_accuracy": 0.8357309713959694, + "num_tokens": 106970004.0, + "step": 88960 + }, + { + "entropy": 1.9482278615236281, + "epoch": 0.2757992155671406, + "grad_norm": 8.85912799835205, + "learning_rate": 4.817243583268221e-06, + "loss": 0.5326, + "mean_token_accuracy": 0.8404009014368057, + "num_tokens": 106980930.0, + "step": 88970 + }, + { + "entropy": 1.8766920641064644, + "epoch": 0.2758302146921903, + "grad_norm": 7.513118743896484, + "learning_rate": 4.8169728801282294e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8502911329269409, + "num_tokens": 106992422.0, + "step": 88980 + }, + { + "entropy": 1.8944596245884895, + "epoch": 0.27586121381724, + "grad_norm": 7.3327460289001465, + "learning_rate": 4.816702222619286e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8509733945131301, + "num_tokens": 107004273.0, + "step": 88990 + }, + { + "entropy": 1.847128589451313, + "epoch": 0.2758922129422897, + "grad_norm": 4.272797584533691, + "learning_rate": 4.816431610728571e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8441042378544807, + "num_tokens": 107016497.0, + "step": 89000 + }, + { + "entropy": 1.855496746301651, + "epoch": 0.2759232120673394, + "grad_norm": 4.222333908081055, + "learning_rate": 4.816161044443269e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8440208032727241, + "num_tokens": 107029107.0, + "step": 89010 + }, + { + "entropy": 1.9156458109617234, + "epoch": 0.2759542111923891, + "grad_norm": 8.503098487854004, + "learning_rate": 4.815890523750575e-06, + "loss": 0.525, + "mean_token_accuracy": 0.8486156970262527, + "num_tokens": 107039998.0, + "step": 89020 + }, + { + "entropy": 1.8888219490647316, + "epoch": 0.2759852103174388, + "grad_norm": 2.975604772567749, + "learning_rate": 4.8156200486376845e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.8388027891516685, + "num_tokens": 107052621.0, + "step": 89030 + }, + { + "entropy": 1.9181819319725038, + "epoch": 0.2760162094424885, + "grad_norm": 3.860818386077881, + "learning_rate": 4.8153496190918e-06, + "loss": 0.5216, + "mean_token_accuracy": 0.8331426337361336, + "num_tokens": 107063730.0, + "step": 89040 + }, + { + "entropy": 1.832422287762165, + "epoch": 0.27604720856753817, + "grad_norm": 7.345708847045898, + "learning_rate": 4.815079235100127e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.854414065182209, + "num_tokens": 107076862.0, + "step": 89050 + }, + { + "entropy": 1.9153432667255401, + "epoch": 0.2760782076925879, + "grad_norm": 9.088801383972168, + "learning_rate": 4.814808896649879e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.853677150607109, + "num_tokens": 107088395.0, + "step": 89060 + }, + { + "entropy": 1.8690629690885543, + "epoch": 0.27610920681763756, + "grad_norm": 7.586027145385742, + "learning_rate": 4.814538603728274e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8534372985363007, + "num_tokens": 107100834.0, + "step": 89070 + }, + { + "entropy": 1.8856920048594474, + "epoch": 0.2761402059426873, + "grad_norm": 4.5250043869018555, + "learning_rate": 4.814268356322531e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8607599556446075, + "num_tokens": 107112389.0, + "step": 89080 + }, + { + "entropy": 1.9317937284708022, + "epoch": 0.27617120506773696, + "grad_norm": 8.948545455932617, + "learning_rate": 4.813998154419879e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.8472296670079231, + "num_tokens": 107122894.0, + "step": 89090 + }, + { + "entropy": 1.8248413413763047, + "epoch": 0.2762022041927867, + "grad_norm": 11.74347972869873, + "learning_rate": 4.813727998007552e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8507599249482155, + "num_tokens": 107134891.0, + "step": 89100 + }, + { + "entropy": 1.8651644140481949, + "epoch": 0.27623320331783635, + "grad_norm": 8.039405822753906, + "learning_rate": 4.813457887072781e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8547470927238464, + "num_tokens": 107146700.0, + "step": 89110 + }, + { + "entropy": 1.8344761043787003, + "epoch": 0.2762642024428861, + "grad_norm": 8.795267105102539, + "learning_rate": 4.813187821602815e-06, + "loss": 0.5152, + "mean_token_accuracy": 0.8472502484917641, + "num_tokens": 107159722.0, + "step": 89120 + }, + { + "entropy": 1.909415753185749, + "epoch": 0.27629520156793574, + "grad_norm": 9.062956809997559, + "learning_rate": 4.812917801584898e-06, + "loss": 0.48, + "mean_token_accuracy": 0.850268816947937, + "num_tokens": 107171135.0, + "step": 89130 + }, + { + "entropy": 1.886960855126381, + "epoch": 0.2763262006929854, + "grad_norm": 8.55114459991455, + "learning_rate": 4.812647827006282e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8514637067914009, + "num_tokens": 107182863.0, + "step": 89140 + }, + { + "entropy": 1.862102809548378, + "epoch": 0.27635719981803514, + "grad_norm": 9.735430717468262, + "learning_rate": 4.812377897854223e-06, + "loss": 0.5274, + "mean_token_accuracy": 0.8368513882160187, + "num_tokens": 107195661.0, + "step": 89150 + }, + { + "entropy": 1.832503044605255, + "epoch": 0.2763881989430848, + "grad_norm": 7.541407108306885, + "learning_rate": 4.812108014115985e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8562299177050591, + "num_tokens": 107208363.0, + "step": 89160 + }, + { + "entropy": 1.884467676281929, + "epoch": 0.27641919806813453, + "grad_norm": 9.188913345336914, + "learning_rate": 4.811838175778836e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8432751014828682, + "num_tokens": 107219939.0, + "step": 89170 + }, + { + "entropy": 1.8959971502423287, + "epoch": 0.2764501971931842, + "grad_norm": 4.789566993713379, + "learning_rate": 4.8115683828300445e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8498433887958526, + "num_tokens": 107231619.0, + "step": 89180 + }, + { + "entropy": 1.8323383823037147, + "epoch": 0.2764811963182339, + "grad_norm": 2.805758237838745, + "learning_rate": 4.811298635256891e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8528842240571975, + "num_tokens": 107244259.0, + "step": 89190 + }, + { + "entropy": 1.7991958245635034, + "epoch": 0.2765121954432836, + "grad_norm": 6.88820219039917, + "learning_rate": 4.811028933046656e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.872551740705967, + "num_tokens": 107257190.0, + "step": 89200 + }, + { + "entropy": 1.8357098802924157, + "epoch": 0.2765431945683333, + "grad_norm": 4.887908935546875, + "learning_rate": 4.810759276186628e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8493363171815872, + "num_tokens": 107269231.0, + "step": 89210 + }, + { + "entropy": 1.8650715343654156, + "epoch": 0.276574193693383, + "grad_norm": 8.015203475952148, + "learning_rate": 4.810489664664098e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8378633111715317, + "num_tokens": 107282063.0, + "step": 89220 + }, + { + "entropy": 1.8511658303439618, + "epoch": 0.2766051928184327, + "grad_norm": 8.239190101623535, + "learning_rate": 4.810220098466364e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8476231142878532, + "num_tokens": 107294849.0, + "step": 89230 + }, + { + "entropy": 1.8353752836585044, + "epoch": 0.2766361919434824, + "grad_norm": 11.316750526428223, + "learning_rate": 4.809950577580724e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8560484200716019, + "num_tokens": 107306875.0, + "step": 89240 + }, + { + "entropy": 1.8877992361783982, + "epoch": 0.2766671910685321, + "grad_norm": 10.842957496643066, + "learning_rate": 4.809681101994492e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8417460203170777, + "num_tokens": 107318449.0, + "step": 89250 + }, + { + "entropy": 1.798448894917965, + "epoch": 0.2766981901935818, + "grad_norm": 7.134692192077637, + "learning_rate": 4.809411671694974e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8509971871972084, + "num_tokens": 107331888.0, + "step": 89260 + }, + { + "entropy": 1.8195422321558, + "epoch": 0.2767291893186315, + "grad_norm": 10.8891019821167, + "learning_rate": 4.809142286669492e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8366403177380561, + "num_tokens": 107344418.0, + "step": 89270 + }, + { + "entropy": 1.857719998061657, + "epoch": 0.27676018844368117, + "grad_norm": 7.889041423797607, + "learning_rate": 4.808872946905363e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8492889553308487, + "num_tokens": 107356113.0, + "step": 89280 + }, + { + "entropy": 1.7993003293871879, + "epoch": 0.2767911875687309, + "grad_norm": 7.292881011962891, + "learning_rate": 4.808603652389917e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8571058794856071, + "num_tokens": 107369163.0, + "step": 89290 + }, + { + "entropy": 1.8662144735455513, + "epoch": 0.27682218669378056, + "grad_norm": 10.856431007385254, + "learning_rate": 4.808334403110485e-06, + "loss": 0.5662, + "mean_token_accuracy": 0.8313690677285195, + "num_tokens": 107381585.0, + "step": 89300 + }, + { + "entropy": 1.8663518592715262, + "epoch": 0.2768531858188303, + "grad_norm": 7.93809700012207, + "learning_rate": 4.808065199054404e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8461432874202728, + "num_tokens": 107393815.0, + "step": 89310 + }, + { + "entropy": 1.9202445238828658, + "epoch": 0.27688418494387995, + "grad_norm": 8.869173049926758, + "learning_rate": 4.8077960402090155e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.8392524033784866, + "num_tokens": 107404393.0, + "step": 89320 + }, + { + "entropy": 1.9343400806188584, + "epoch": 0.2769151840689297, + "grad_norm": 8.072264671325684, + "learning_rate": 4.807526926561667e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8465714007616043, + "num_tokens": 107414917.0, + "step": 89330 + }, + { + "entropy": 1.9184037640690803, + "epoch": 0.27694618319397935, + "grad_norm": 7.905839443206787, + "learning_rate": 4.80725785809971e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8361783891916275, + "num_tokens": 107425889.0, + "step": 89340 + }, + { + "entropy": 1.9122958168387414, + "epoch": 0.27697718231902907, + "grad_norm": 3.9599266052246094, + "learning_rate": 4.806988834810501e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8507964372634887, + "num_tokens": 107437118.0, + "step": 89350 + }, + { + "entropy": 1.7936722189188004, + "epoch": 0.27700818144407874, + "grad_norm": 9.577056884765625, + "learning_rate": 4.806719856681402e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8579995214939118, + "num_tokens": 107450413.0, + "step": 89360 + }, + { + "entropy": 1.8381581105291844, + "epoch": 0.27703918056912846, + "grad_norm": 7.809806823730469, + "learning_rate": 4.806450923699778e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8484640643000603, + "num_tokens": 107462904.0, + "step": 89370 + }, + { + "entropy": 1.878132027387619, + "epoch": 0.27707017969417813, + "grad_norm": 8.047992706298828, + "learning_rate": 4.806182035853004e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8498531639575958, + "num_tokens": 107474624.0, + "step": 89380 + }, + { + "entropy": 1.9003929272294044, + "epoch": 0.2771011788192278, + "grad_norm": 7.774439334869385, + "learning_rate": 4.805913193128452e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.864824341237545, + "num_tokens": 107486478.0, + "step": 89390 + }, + { + "entropy": 1.885640236735344, + "epoch": 0.2771321779442775, + "grad_norm": 8.628144264221191, + "learning_rate": 4.805644395513508e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.8425728008151054, + "num_tokens": 107498133.0, + "step": 89400 + }, + { + "entropy": 1.9598473072052003, + "epoch": 0.2771631770693272, + "grad_norm": 9.253849983215332, + "learning_rate": 4.805375642995554e-06, + "loss": 0.5666, + "mean_token_accuracy": 0.8246390670537949, + "num_tokens": 107509154.0, + "step": 89410 + }, + { + "entropy": 1.8804704681038857, + "epoch": 0.2771941761943769, + "grad_norm": 3.7883799076080322, + "learning_rate": 4.8051069355619846e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8492748379707337, + "num_tokens": 107521568.0, + "step": 89420 + }, + { + "entropy": 1.9423058658838273, + "epoch": 0.2772251753194266, + "grad_norm": 7.831295490264893, + "learning_rate": 4.804838273200196e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.8353771314024925, + "num_tokens": 107532573.0, + "step": 89430 + }, + { + "entropy": 1.9429254934191704, + "epoch": 0.2772561744444763, + "grad_norm": 7.812403202056885, + "learning_rate": 4.804569655897587e-06, + "loss": 0.514, + "mean_token_accuracy": 0.8479711428284645, + "num_tokens": 107544075.0, + "step": 89440 + }, + { + "entropy": 1.8462449744343759, + "epoch": 0.277287173569526, + "grad_norm": 7.212341785430908, + "learning_rate": 4.8043010836415645e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8527239561080933, + "num_tokens": 107556555.0, + "step": 89450 + }, + { + "entropy": 1.8698409616947174, + "epoch": 0.2773181726945757, + "grad_norm": 7.4296698570251465, + "learning_rate": 4.804032556419541e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8424155786633492, + "num_tokens": 107569186.0, + "step": 89460 + }, + { + "entropy": 1.8821595564484597, + "epoch": 0.2773491718196254, + "grad_norm": 7.037583351135254, + "learning_rate": 4.803764074218931e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8456917703151703, + "num_tokens": 107580496.0, + "step": 89470 + }, + { + "entropy": 1.8518904522061348, + "epoch": 0.2773801709446751, + "grad_norm": 4.142475128173828, + "learning_rate": 4.803495637027156e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8462300166487694, + "num_tokens": 107592528.0, + "step": 89480 + }, + { + "entropy": 1.9444121688604354, + "epoch": 0.27741117006972477, + "grad_norm": 8.443082809448242, + "learning_rate": 4.803227244831642e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8497670993208886, + "num_tokens": 107603347.0, + "step": 89490 + }, + { + "entropy": 1.9082912877202034, + "epoch": 0.2774421691947745, + "grad_norm": 8.889103889465332, + "learning_rate": 4.80295889761982e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8479152485728264, + "num_tokens": 107614464.0, + "step": 89500 + }, + { + "entropy": 1.9622692078351975, + "epoch": 0.27747316831982416, + "grad_norm": 9.245258331298828, + "learning_rate": 4.802690595379124e-06, + "loss": 0.5951, + "mean_token_accuracy": 0.8277513101696968, + "num_tokens": 107625283.0, + "step": 89510 + }, + { + "entropy": 1.8577041417360305, + "epoch": 0.2775041674448739, + "grad_norm": 8.274160385131836, + "learning_rate": 4.802422338096995e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8520587861537934, + "num_tokens": 107637211.0, + "step": 89520 + }, + { + "entropy": 1.8055953189730645, + "epoch": 0.27753516656992355, + "grad_norm": 9.126919746398926, + "learning_rate": 4.80215412576088e-06, + "loss": 0.5178, + "mean_token_accuracy": 0.8396184176206589, + "num_tokens": 107650513.0, + "step": 89530 + }, + { + "entropy": 1.9063657209277154, + "epoch": 0.2775661656949733, + "grad_norm": 7.300408840179443, + "learning_rate": 4.801885958358229e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.853947177529335, + "num_tokens": 107662885.0, + "step": 89540 + }, + { + "entropy": 2.0144054174423216, + "epoch": 0.27759716482002295, + "grad_norm": 8.333145141601562, + "learning_rate": 4.801617835876496e-06, + "loss": 0.5785, + "mean_token_accuracy": 0.8276149451732635, + "num_tokens": 107673365.0, + "step": 89550 + }, + { + "entropy": 1.8120394110679627, + "epoch": 0.27762816394507267, + "grad_norm": 5.053866863250732, + "learning_rate": 4.801349758303142e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8445941239595414, + "num_tokens": 107686729.0, + "step": 89560 + }, + { + "entropy": 1.8656949549913406, + "epoch": 0.27765916307012234, + "grad_norm": 10.746817588806152, + "learning_rate": 4.801081725625631e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8389295890927315, + "num_tokens": 107699283.0, + "step": 89570 + }, + { + "entropy": 1.8939693599939347, + "epoch": 0.27769016219517206, + "grad_norm": 7.9915971755981445, + "learning_rate": 4.800813737831435e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.836301201581955, + "num_tokens": 107710971.0, + "step": 89580 + }, + { + "entropy": 1.8993390202522278, + "epoch": 0.27772116132022173, + "grad_norm": 3.7865583896636963, + "learning_rate": 4.800545794908028e-06, + "loss": 0.5149, + "mean_token_accuracy": 0.8455944269895553, + "num_tokens": 107721957.0, + "step": 89590 + }, + { + "entropy": 1.9103139862418175, + "epoch": 0.27775216044527146, + "grad_norm": 7.072427749633789, + "learning_rate": 4.800277896842888e-06, + "loss": 0.5299, + "mean_token_accuracy": 0.8366509795188903, + "num_tokens": 107733436.0, + "step": 89600 + }, + { + "entropy": 1.916232281923294, + "epoch": 0.2777831595703211, + "grad_norm": 7.880809307098389, + "learning_rate": 4.8000100436235025e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.846810282766819, + "num_tokens": 107744724.0, + "step": 89610 + }, + { + "entropy": 1.8565625533461572, + "epoch": 0.27781415869537085, + "grad_norm": 9.63576889038086, + "learning_rate": 4.799742235237359e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8563435986638069, + "num_tokens": 107756737.0, + "step": 89620 + }, + { + "entropy": 1.8952525824308395, + "epoch": 0.2778451578204205, + "grad_norm": 7.304356575012207, + "learning_rate": 4.799474471671954e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.855880931019783, + "num_tokens": 107767777.0, + "step": 89630 + }, + { + "entropy": 1.8737752437591553, + "epoch": 0.2778761569454702, + "grad_norm": 8.607854843139648, + "learning_rate": 4.799206752914784e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8448594495654106, + "num_tokens": 107780521.0, + "step": 89640 + }, + { + "entropy": 1.8830969855189323, + "epoch": 0.2779071560705199, + "grad_norm": 9.414170265197754, + "learning_rate": 4.798939078953355e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8453325614333153, + "num_tokens": 107792404.0, + "step": 89650 + }, + { + "entropy": 1.8424068495631218, + "epoch": 0.2779381551955696, + "grad_norm": 3.329559087753296, + "learning_rate": 4.798671449775176e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.851707661151886, + "num_tokens": 107804701.0, + "step": 89660 + }, + { + "entropy": 1.8513332203030586, + "epoch": 0.2779691543206193, + "grad_norm": 9.445657730102539, + "learning_rate": 4.798403865367761e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8407017186284065, + "num_tokens": 107817031.0, + "step": 89670 + }, + { + "entropy": 1.861585983633995, + "epoch": 0.278000153445669, + "grad_norm": 8.74657154083252, + "learning_rate": 4.798136325718627e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8404981315135955, + "num_tokens": 107829450.0, + "step": 89680 + }, + { + "entropy": 1.8812636777758598, + "epoch": 0.2780311525707187, + "grad_norm": 8.068127632141113, + "learning_rate": 4.797868830815301e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.8442062169313431, + "num_tokens": 107842211.0, + "step": 89690 + }, + { + "entropy": 1.8865378215909003, + "epoch": 0.27806215169576837, + "grad_norm": 8.662693977355957, + "learning_rate": 4.797601380645308e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8458723932504654, + "num_tokens": 107854429.0, + "step": 89700 + }, + { + "entropy": 1.9377282798290252, + "epoch": 0.2780931508208181, + "grad_norm": 7.293258190155029, + "learning_rate": 4.797333975196185e-06, + "loss": 0.5247, + "mean_token_accuracy": 0.845015498995781, + "num_tokens": 107866326.0, + "step": 89710 + }, + { + "entropy": 1.8403261929750443, + "epoch": 0.27812414994586776, + "grad_norm": 4.629557132720947, + "learning_rate": 4.797066614455466e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8479184404015541, + "num_tokens": 107879453.0, + "step": 89720 + }, + { + "entropy": 1.8934699580073358, + "epoch": 0.2781551490709175, + "grad_norm": 3.9676856994628906, + "learning_rate": 4.796799298410698e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8484136417508126, + "num_tokens": 107891153.0, + "step": 89730 + }, + { + "entropy": 1.954687887430191, + "epoch": 0.27818614819596715, + "grad_norm": 7.991013526916504, + "learning_rate": 4.796532027049428e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.85145123898983, + "num_tokens": 107902323.0, + "step": 89740 + }, + { + "entropy": 1.9269502833485603, + "epoch": 0.2782171473210169, + "grad_norm": 10.174466133117676, + "learning_rate": 4.796264800359207e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8483559593558312, + "num_tokens": 107913752.0, + "step": 89750 + }, + { + "entropy": 1.9451028525829315, + "epoch": 0.27824814644606655, + "grad_norm": 8.776559829711914, + "learning_rate": 4.795997618327595e-06, + "loss": 0.5428, + "mean_token_accuracy": 0.8392796277999878, + "num_tokens": 107925468.0, + "step": 89760 + }, + { + "entropy": 1.9637413799762726, + "epoch": 0.27827914557111627, + "grad_norm": 8.385177612304688, + "learning_rate": 4.795730480942153e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.8446138560771942, + "num_tokens": 107936686.0, + "step": 89770 + }, + { + "entropy": 1.9453602582216263, + "epoch": 0.27831014469616594, + "grad_norm": 9.379023551940918, + "learning_rate": 4.795463388190449e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8378261864185333, + "num_tokens": 107947735.0, + "step": 89780 + }, + { + "entropy": 1.8662060409784318, + "epoch": 0.27834114382121566, + "grad_norm": 7.810781478881836, + "learning_rate": 4.7951963400600565e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8564840778708458, + "num_tokens": 107958863.0, + "step": 89790 + }, + { + "entropy": 1.8019395358860493, + "epoch": 0.27837214294626533, + "grad_norm": 8.415993690490723, + "learning_rate": 4.7949293365385505e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8503817662596702, + "num_tokens": 107972692.0, + "step": 89800 + }, + { + "entropy": 1.7698960989713668, + "epoch": 0.27840314207131506, + "grad_norm": 8.861882209777832, + "learning_rate": 4.794662377613515e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8597157940268516, + "num_tokens": 107985556.0, + "step": 89810 + }, + { + "entropy": 1.7953774243593217, + "epoch": 0.2784341411963647, + "grad_norm": 7.687840461730957, + "learning_rate": 4.794395463272534e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.854545010626316, + "num_tokens": 107998951.0, + "step": 89820 + }, + { + "entropy": 1.9417035043239594, + "epoch": 0.27846514032141445, + "grad_norm": 9.26181697845459, + "learning_rate": 4.794128593503201e-06, + "loss": 0.5209, + "mean_token_accuracy": 0.8420970395207406, + "num_tokens": 108010519.0, + "step": 89830 + }, + { + "entropy": 1.8701779007911683, + "epoch": 0.2784961394464641, + "grad_norm": 8.857370376586914, + "learning_rate": 4.793861768293114e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8434204265475274, + "num_tokens": 108022433.0, + "step": 89840 + }, + { + "entropy": 1.9472114413976669, + "epoch": 0.27852713857151384, + "grad_norm": 8.817475318908691, + "learning_rate": 4.793594987629871e-06, + "loss": 0.5206, + "mean_token_accuracy": 0.8399725124239922, + "num_tokens": 108033484.0, + "step": 89850 + }, + { + "entropy": 1.9352742165327073, + "epoch": 0.2785581376965635, + "grad_norm": 7.381696701049805, + "learning_rate": 4.7933282515010806e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.8374552443623543, + "num_tokens": 108045222.0, + "step": 89860 + }, + { + "entropy": 1.8853474691510201, + "epoch": 0.27858913682161324, + "grad_norm": 8.79345417022705, + "learning_rate": 4.793061559894352e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8477419853210449, + "num_tokens": 108057330.0, + "step": 89870 + }, + { + "entropy": 1.8697724029421807, + "epoch": 0.2786201359466629, + "grad_norm": 8.972064018249512, + "learning_rate": 4.792794912797302e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8470319598913193, + "num_tokens": 108069573.0, + "step": 89880 + }, + { + "entropy": 1.8288535490632056, + "epoch": 0.2786511350717126, + "grad_norm": 3.9921863079071045, + "learning_rate": 4.792528310197551e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8418003395199776, + "num_tokens": 108083028.0, + "step": 89890 + }, + { + "entropy": 1.9048464432358743, + "epoch": 0.2786821341967623, + "grad_norm": 8.971819877624512, + "learning_rate": 4.792261752082724e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8390678346157074, + "num_tokens": 108094239.0, + "step": 89900 + }, + { + "entropy": 1.893374653160572, + "epoch": 0.27871313332181197, + "grad_norm": 8.717069625854492, + "learning_rate": 4.791995238440452e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8402533918619156, + "num_tokens": 108106085.0, + "step": 89910 + }, + { + "entropy": 1.9398944050073623, + "epoch": 0.2787441324468617, + "grad_norm": 9.565119743347168, + "learning_rate": 4.79172876925837e-06, + "loss": 0.5399, + "mean_token_accuracy": 0.8393180221319199, + "num_tokens": 108116682.0, + "step": 89920 + }, + { + "entropy": 1.8258964017033577, + "epoch": 0.27877513157191136, + "grad_norm": 2.663538694381714, + "learning_rate": 4.791462344524116e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8499773174524308, + "num_tokens": 108128639.0, + "step": 89930 + }, + { + "entropy": 1.833346499502659, + "epoch": 0.2788061306969611, + "grad_norm": 8.475390434265137, + "learning_rate": 4.791195964225338e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8522562757134438, + "num_tokens": 108141077.0, + "step": 89940 + }, + { + "entropy": 1.9464040279388428, + "epoch": 0.27883712982201075, + "grad_norm": 7.769806861877441, + "learning_rate": 4.790929628349683e-06, + "loss": 0.5211, + "mean_token_accuracy": 0.838797104358673, + "num_tokens": 108152453.0, + "step": 89950 + }, + { + "entropy": 1.8917251348495483, + "epoch": 0.2788681289470605, + "grad_norm": 7.752605438232422, + "learning_rate": 4.790663336884804e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8592374518513679, + "num_tokens": 108164272.0, + "step": 89960 + }, + { + "entropy": 1.8999314427375793, + "epoch": 0.27889912807211015, + "grad_norm": 7.312995910644531, + "learning_rate": 4.790397089818365e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8518079608678818, + "num_tokens": 108175602.0, + "step": 89970 + }, + { + "entropy": 1.8947380736470223, + "epoch": 0.27893012719715987, + "grad_norm": 8.715499877929688, + "learning_rate": 4.790130887138025e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8417610317468643, + "num_tokens": 108187295.0, + "step": 89980 + }, + { + "entropy": 1.8655477941036225, + "epoch": 0.27896112632220954, + "grad_norm": 8.097354888916016, + "learning_rate": 4.789864728831455e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8457545340061188, + "num_tokens": 108199656.0, + "step": 89990 + }, + { + "entropy": 1.7254057943820953, + "epoch": 0.27899212544725926, + "grad_norm": 4.202314853668213, + "learning_rate": 4.789598614886327e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8670785754919053, + "num_tokens": 108213424.0, + "step": 90000 + }, + { + "entropy": 1.935379645228386, + "epoch": 0.27902312457230893, + "grad_norm": 4.619598865509033, + "learning_rate": 4.789332545290321e-06, + "loss": 0.5594, + "mean_token_accuracy": 0.8315211609005928, + "num_tokens": 108224748.0, + "step": 90010 + }, + { + "entropy": 1.9281780689954757, + "epoch": 0.27905412369735866, + "grad_norm": 8.387505531311035, + "learning_rate": 4.789066520031119e-06, + "loss": 0.5284, + "mean_token_accuracy": 0.8352978438138962, + "num_tokens": 108236160.0, + "step": 90020 + }, + { + "entropy": 1.9002047255635262, + "epoch": 0.2790851228224083, + "grad_norm": 8.676763534545898, + "learning_rate": 4.7888005390964094e-06, + "loss": 0.5236, + "mean_token_accuracy": 0.834555535018444, + "num_tokens": 108248149.0, + "step": 90030 + }, + { + "entropy": 1.8989999890327454, + "epoch": 0.27911612194745805, + "grad_norm": 9.32200813293457, + "learning_rate": 4.788534602473885e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8412574380636215, + "num_tokens": 108259859.0, + "step": 90040 + }, + { + "entropy": 1.8991701990365981, + "epoch": 0.2791471210725077, + "grad_norm": 7.9609694480896, + "learning_rate": 4.788268710151243e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.857192724943161, + "num_tokens": 108271236.0, + "step": 90050 + }, + { + "entropy": 1.7937731251120568, + "epoch": 0.27917812019755744, + "grad_norm": 7.316855430603027, + "learning_rate": 4.788002862116185e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8526657864451408, + "num_tokens": 108284781.0, + "step": 90060 + }, + { + "entropy": 1.9361607402563095, + "epoch": 0.2792091193226071, + "grad_norm": 7.928420066833496, + "learning_rate": 4.787737058356419e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8461529463529587, + "num_tokens": 108295737.0, + "step": 90070 + }, + { + "entropy": 1.9357156157493591, + "epoch": 0.27924011844765684, + "grad_norm": 9.841268539428711, + "learning_rate": 4.787471298859655e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8431805863976478, + "num_tokens": 108307828.0, + "step": 90080 + }, + { + "entropy": 1.8373299419879914, + "epoch": 0.2792711175727065, + "grad_norm": 8.356464385986328, + "learning_rate": 4.78720558361361e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8455582305788993, + "num_tokens": 108320724.0, + "step": 90090 + }, + { + "entropy": 1.9154756784439086, + "epoch": 0.27930211669775623, + "grad_norm": 3.6140196323394775, + "learning_rate": 4.786939912606008e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8417788296937943, + "num_tokens": 108332507.0, + "step": 90100 + }, + { + "entropy": 1.8971551463007927, + "epoch": 0.2793331158228059, + "grad_norm": 8.835823059082031, + "learning_rate": 4.786674285824571e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8474727541208267, + "num_tokens": 108343885.0, + "step": 90110 + }, + { + "entropy": 1.7721475332975387, + "epoch": 0.2793641149478556, + "grad_norm": 8.436137199401855, + "learning_rate": 4.786408703257034e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8569958195090294, + "num_tokens": 108356767.0, + "step": 90120 + }, + { + "entropy": 1.8786687865853309, + "epoch": 0.2793951140729053, + "grad_norm": 7.322561740875244, + "learning_rate": 4.78614316489113e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8418064162135124, + "num_tokens": 108369550.0, + "step": 90130 + }, + { + "entropy": 1.8602668032050134, + "epoch": 0.27942611319795496, + "grad_norm": 4.303462505340576, + "learning_rate": 4.785877670714598e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8566737055778504, + "num_tokens": 108381931.0, + "step": 90140 + }, + { + "entropy": 1.869055911898613, + "epoch": 0.2794571123230047, + "grad_norm": 8.936867713928223, + "learning_rate": 4.7856122207151874e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8521290734410286, + "num_tokens": 108393434.0, + "step": 90150 + }, + { + "entropy": 1.9146001234650611, + "epoch": 0.27948811144805435, + "grad_norm": 10.076147079467773, + "learning_rate": 4.7853468148806436e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8482694402337074, + "num_tokens": 108404678.0, + "step": 90160 + }, + { + "entropy": 1.8209770336747169, + "epoch": 0.2795191105731041, + "grad_norm": 9.587752342224121, + "learning_rate": 4.785081453198724e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8516710668802261, + "num_tokens": 108416663.0, + "step": 90170 + }, + { + "entropy": 1.844451193511486, + "epoch": 0.27955010969815375, + "grad_norm": 10.955329895019531, + "learning_rate": 4.784816135657187e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8469073712825775, + "num_tokens": 108429075.0, + "step": 90180 + }, + { + "entropy": 1.8783362239599228, + "epoch": 0.2795811088232035, + "grad_norm": 8.352615356445312, + "learning_rate": 4.784550862243798e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8481233865022659, + "num_tokens": 108441399.0, + "step": 90190 + }, + { + "entropy": 1.9300066709518433, + "epoch": 0.27961210794825314, + "grad_norm": 7.3091206550598145, + "learning_rate": 4.784285632946324e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8453634783625603, + "num_tokens": 108452967.0, + "step": 90200 + }, + { + "entropy": 1.8706892415881158, + "epoch": 0.27964310707330287, + "grad_norm": 7.95927095413208, + "learning_rate": 4.784020447752539e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8540063112974167, + "num_tokens": 108464604.0, + "step": 90210 + }, + { + "entropy": 1.8064016848802567, + "epoch": 0.27967410619835253, + "grad_norm": 12.208187103271484, + "learning_rate": 4.783755306650223e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8536765992641449, + "num_tokens": 108477370.0, + "step": 90220 + }, + { + "entropy": 1.891729509830475, + "epoch": 0.27970510532340226, + "grad_norm": 7.544349193572998, + "learning_rate": 4.783490209627159e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8449078306555748, + "num_tokens": 108488578.0, + "step": 90230 + }, + { + "entropy": 1.839923305809498, + "epoch": 0.2797361044484519, + "grad_norm": 7.284799098968506, + "learning_rate": 4.783225156671132e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8685827806591988, + "num_tokens": 108500874.0, + "step": 90240 + }, + { + "entropy": 1.9023682996630669, + "epoch": 0.27976710357350165, + "grad_norm": 7.970053195953369, + "learning_rate": 4.782960147769936e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8523323282599449, + "num_tokens": 108512936.0, + "step": 90250 + }, + { + "entropy": 1.8668055430054664, + "epoch": 0.2797981026985513, + "grad_norm": 7.000694751739502, + "learning_rate": 4.78269518291137e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8535307124257088, + "num_tokens": 108524895.0, + "step": 90260 + }, + { + "entropy": 1.8698514148592948, + "epoch": 0.27982910182360105, + "grad_norm": 7.908920764923096, + "learning_rate": 4.782430262083234e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8330338954925537, + "num_tokens": 108537045.0, + "step": 90270 + }, + { + "entropy": 1.9352901756763459, + "epoch": 0.2798601009486507, + "grad_norm": 9.200328826904297, + "learning_rate": 4.7821653852733365e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8495378881692887, + "num_tokens": 108547879.0, + "step": 90280 + }, + { + "entropy": 1.8811449840664864, + "epoch": 0.27989110007370044, + "grad_norm": 8.855972290039062, + "learning_rate": 4.781900552469487e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8424475163221359, + "num_tokens": 108560224.0, + "step": 90290 + }, + { + "entropy": 1.7924820497632026, + "epoch": 0.2799220991987501, + "grad_norm": 3.9103078842163086, + "learning_rate": 4.7816357636595036e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.856936690211296, + "num_tokens": 108573946.0, + "step": 90300 + }, + { + "entropy": 1.9183641135692597, + "epoch": 0.27995309832379983, + "grad_norm": 4.683595180511475, + "learning_rate": 4.781371018831206e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8507445871829986, + "num_tokens": 108585889.0, + "step": 90310 + }, + { + "entropy": 1.882660059630871, + "epoch": 0.2799840974488495, + "grad_norm": 7.647256374359131, + "learning_rate": 4.781106317972421e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8541607797145844, + "num_tokens": 108597763.0, + "step": 90320 + }, + { + "entropy": 1.899367219209671, + "epoch": 0.2800150965738992, + "grad_norm": 7.681816577911377, + "learning_rate": 4.780841661070978e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8502333298325538, + "num_tokens": 108609237.0, + "step": 90330 + }, + { + "entropy": 1.9607916057109833, + "epoch": 0.2800460956989489, + "grad_norm": 8.942265510559082, + "learning_rate": 4.780577048114713e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8450431019067765, + "num_tokens": 108619807.0, + "step": 90340 + }, + { + "entropy": 1.925535424053669, + "epoch": 0.2800770948239986, + "grad_norm": 9.137941360473633, + "learning_rate": 4.780312479091465e-06, + "loss": 0.5373, + "mean_token_accuracy": 0.8320911303162575, + "num_tokens": 108630896.0, + "step": 90350 + }, + { + "entropy": 1.9415040105581283, + "epoch": 0.2801080939490483, + "grad_norm": 7.471439361572266, + "learning_rate": 4.780047953989079e-06, + "loss": 0.476, + "mean_token_accuracy": 0.848981736600399, + "num_tokens": 108642366.0, + "step": 90360 + }, + { + "entropy": 1.9614702731370925, + "epoch": 0.280139093074098, + "grad_norm": 9.385625839233398, + "learning_rate": 4.779783472795404e-06, + "loss": 0.5708, + "mean_token_accuracy": 0.8339747205376625, + "num_tokens": 108653416.0, + "step": 90370 + }, + { + "entropy": 1.7262531116604805, + "epoch": 0.2801700921991477, + "grad_norm": 8.864653587341309, + "learning_rate": 4.779519035498294e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8626728132367134, + "num_tokens": 108667678.0, + "step": 90380 + }, + { + "entropy": 1.9948803067207337, + "epoch": 0.28020109132419735, + "grad_norm": 8.669200897216797, + "learning_rate": 4.779254642085608e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.8431035861372947, + "num_tokens": 108678385.0, + "step": 90390 + }, + { + "entropy": 1.9449063792824746, + "epoch": 0.2802320904492471, + "grad_norm": 9.006505966186523, + "learning_rate": 4.778990292545207e-06, + "loss": 0.5286, + "mean_token_accuracy": 0.829691307246685, + "num_tokens": 108689327.0, + "step": 90400 + }, + { + "entropy": 1.916630421578884, + "epoch": 0.28026308957429674, + "grad_norm": 8.61928653717041, + "learning_rate": 4.7787259868649635e-06, + "loss": 0.535, + "mean_token_accuracy": 0.8399298146367074, + "num_tokens": 108700905.0, + "step": 90410 + }, + { + "entropy": 1.9122631967067718, + "epoch": 0.28029408869934647, + "grad_norm": 7.261351108551025, + "learning_rate": 4.778461725032747e-06, + "loss": 0.5228, + "mean_token_accuracy": 0.8412989899516106, + "num_tokens": 108713415.0, + "step": 90420 + }, + { + "entropy": 1.9562030732631683, + "epoch": 0.28032508782439614, + "grad_norm": 7.638638496398926, + "learning_rate": 4.7781975070364375e-06, + "loss": 0.5237, + "mean_token_accuracy": 0.8463377922773361, + "num_tokens": 108723813.0, + "step": 90430 + }, + { + "entropy": 1.7865042075514794, + "epoch": 0.28035608694944586, + "grad_norm": 2.5158307552337646, + "learning_rate": 4.7779333328639124e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8637947380542755, + "num_tokens": 108737608.0, + "step": 90440 + }, + { + "entropy": 1.6872775062918663, + "epoch": 0.28038708607449553, + "grad_norm": 2.7983152866363525, + "learning_rate": 4.777669202503063e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.872826486825943, + "num_tokens": 108752944.0, + "step": 90450 + }, + { + "entropy": 1.9655809059739113, + "epoch": 0.28041808519954525, + "grad_norm": 9.844002723693848, + "learning_rate": 4.77740511594178e-06, + "loss": 0.5496, + "mean_token_accuracy": 0.8349737733602524, + "num_tokens": 108764826.0, + "step": 90460 + }, + { + "entropy": 1.8685831755399704, + "epoch": 0.2804490843245949, + "grad_norm": 4.095294952392578, + "learning_rate": 4.777141073167958e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8481494188308716, + "num_tokens": 108777223.0, + "step": 90470 + }, + { + "entropy": 1.9378087937831878, + "epoch": 0.28048008344964465, + "grad_norm": 3.9913816452026367, + "learning_rate": 4.7768770741694985e-06, + "loss": 0.521, + "mean_token_accuracy": 0.8388440117239953, + "num_tokens": 108788749.0, + "step": 90480 + }, + { + "entropy": 1.9008531831204891, + "epoch": 0.2805110825746943, + "grad_norm": 7.451270580291748, + "learning_rate": 4.7766131189343075e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8565253868699074, + "num_tokens": 108801607.0, + "step": 90490 + }, + { + "entropy": 1.960809737443924, + "epoch": 0.28054208169974404, + "grad_norm": 7.793551921844482, + "learning_rate": 4.776349207450297e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8530623838305473, + "num_tokens": 108812575.0, + "step": 90500 + }, + { + "entropy": 1.9836690306663514, + "epoch": 0.2805730808247937, + "grad_norm": 9.390997886657715, + "learning_rate": 4.776085339705378e-06, + "loss": 0.527, + "mean_token_accuracy": 0.8447514802217484, + "num_tokens": 108823707.0, + "step": 90510 + }, + { + "entropy": 1.8805051818490028, + "epoch": 0.28060407994984343, + "grad_norm": 6.977400302886963, + "learning_rate": 4.775821515687472e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8450083822011948, + "num_tokens": 108835768.0, + "step": 90520 + }, + { + "entropy": 1.8927241086959838, + "epoch": 0.2806350790748931, + "grad_norm": 8.844595909118652, + "learning_rate": 4.775557735384503e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8455369025468826, + "num_tokens": 108847447.0, + "step": 90530 + }, + { + "entropy": 1.9194593280553818, + "epoch": 0.2806660781999428, + "grad_norm": 9.10633659362793, + "learning_rate": 4.775293998784402e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8458576589822769, + "num_tokens": 108859245.0, + "step": 90540 + }, + { + "entropy": 1.9746450453996658, + "epoch": 0.2806970773249925, + "grad_norm": 8.2658052444458, + "learning_rate": 4.775030305875099e-06, + "loss": 0.5594, + "mean_token_accuracy": 0.8276672974228859, + "num_tokens": 108869967.0, + "step": 90550 + }, + { + "entropy": 1.8919481202960013, + "epoch": 0.2807280764500422, + "grad_norm": 6.916478157043457, + "learning_rate": 4.774766656644536e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8547657161951066, + "num_tokens": 108881653.0, + "step": 90560 + }, + { + "entropy": 1.8619554951786994, + "epoch": 0.2807590755750919, + "grad_norm": 7.139795780181885, + "learning_rate": 4.774503051080653e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.845686687529087, + "num_tokens": 108893797.0, + "step": 90570 + }, + { + "entropy": 1.8943834751844406, + "epoch": 0.2807900747001416, + "grad_norm": 7.6240339279174805, + "learning_rate": 4.7742394891713975e-06, + "loss": 0.515, + "mean_token_accuracy": 0.8450383573770524, + "num_tokens": 108905569.0, + "step": 90580 + }, + { + "entropy": 1.8711055159568786, + "epoch": 0.2808210738251913, + "grad_norm": 7.671294689178467, + "learning_rate": 4.773975970904725e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8534323275089264, + "num_tokens": 108918072.0, + "step": 90590 + }, + { + "entropy": 1.9521177858114243, + "epoch": 0.280852072950241, + "grad_norm": 9.559345245361328, + "learning_rate": 4.773712496268588e-06, + "loss": 0.5296, + "mean_token_accuracy": 0.8375743925571442, + "num_tokens": 108929589.0, + "step": 90600 + }, + { + "entropy": 1.8415945529937745, + "epoch": 0.2808830720752907, + "grad_norm": 4.66655969619751, + "learning_rate": 4.773449065250952e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8464484736323357, + "num_tokens": 108942319.0, + "step": 90610 + }, + { + "entropy": 1.8111029013991355, + "epoch": 0.28091407120034034, + "grad_norm": 7.22979211807251, + "learning_rate": 4.77318567783978e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8576018720865249, + "num_tokens": 108955141.0, + "step": 90620 + }, + { + "entropy": 1.910294608771801, + "epoch": 0.28094507032539007, + "grad_norm": 8.751194953918457, + "learning_rate": 4.772922334023044e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8544377252459526, + "num_tokens": 108966874.0, + "step": 90630 + }, + { + "entropy": 1.8671040296554566, + "epoch": 0.28097606945043974, + "grad_norm": 3.474283218383789, + "learning_rate": 4.7726590337887215e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8581783235073089, + "num_tokens": 108978553.0, + "step": 90640 + }, + { + "entropy": 1.8678077682852745, + "epoch": 0.28100706857548946, + "grad_norm": 10.035735130310059, + "learning_rate": 4.772395777124789e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8406292900443078, + "num_tokens": 108990475.0, + "step": 90650 + }, + { + "entropy": 1.9179120391607285, + "epoch": 0.28103806770053913, + "grad_norm": 8.567628860473633, + "learning_rate": 4.772132564019233e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8443030267953873, + "num_tokens": 109002069.0, + "step": 90660 + }, + { + "entropy": 1.9361174017190934, + "epoch": 0.28106906682558885, + "grad_norm": 8.406739234924316, + "learning_rate": 4.7718693944600445e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8500792890787124, + "num_tokens": 109013697.0, + "step": 90670 + }, + { + "entropy": 1.8635503351688385, + "epoch": 0.2811000659506385, + "grad_norm": 9.684234619140625, + "learning_rate": 4.771606268435215e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8503406763076782, + "num_tokens": 109025070.0, + "step": 90680 + }, + { + "entropy": 1.773262333869934, + "epoch": 0.28113106507568825, + "grad_norm": 4.093050003051758, + "learning_rate": 4.771343185932744e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8514026969671249, + "num_tokens": 109038769.0, + "step": 90690 + }, + { + "entropy": 1.8920626237988472, + "epoch": 0.2811620642007379, + "grad_norm": 8.63742446899414, + "learning_rate": 4.771080146940636e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8509817168116569, + "num_tokens": 109050600.0, + "step": 90700 + }, + { + "entropy": 1.9327398404479026, + "epoch": 0.28119306332578764, + "grad_norm": 10.020679473876953, + "learning_rate": 4.7708171514468965e-06, + "loss": 0.528, + "mean_token_accuracy": 0.842743456363678, + "num_tokens": 109061795.0, + "step": 90710 + }, + { + "entropy": 1.8716965742409228, + "epoch": 0.2812240624508373, + "grad_norm": 9.19456958770752, + "learning_rate": 4.770554199439541e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8475604027509689, + "num_tokens": 109073841.0, + "step": 90720 + }, + { + "entropy": 1.9363275811076164, + "epoch": 0.28125506157588703, + "grad_norm": 9.567742347717285, + "learning_rate": 4.770291290906584e-06, + "loss": 0.5366, + "mean_token_accuracy": 0.8288316577672958, + "num_tokens": 109084988.0, + "step": 90730 + }, + { + "entropy": 1.8694113805890082, + "epoch": 0.2812860607009367, + "grad_norm": 10.170321464538574, + "learning_rate": 4.770028425836049e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.854419095814228, + "num_tokens": 109097065.0, + "step": 90740 + }, + { + "entropy": 1.893789705634117, + "epoch": 0.2813170598259864, + "grad_norm": 8.507575035095215, + "learning_rate": 4.769765604215961e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.846349011361599, + "num_tokens": 109108931.0, + "step": 90750 + }, + { + "entropy": 1.9919794470071792, + "epoch": 0.2813480589510361, + "grad_norm": 8.848701477050781, + "learning_rate": 4.769502826034352e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.8403985217213631, + "num_tokens": 109119893.0, + "step": 90760 + }, + { + "entropy": 1.826089233160019, + "epoch": 0.2813790580760858, + "grad_norm": 8.253718376159668, + "learning_rate": 4.769240091279257e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8538560375571251, + "num_tokens": 109132512.0, + "step": 90770 + }, + { + "entropy": 1.8694834753870964, + "epoch": 0.2814100572011355, + "grad_norm": 3.41013240814209, + "learning_rate": 4.768977399938718e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8431166216731072, + "num_tokens": 109145787.0, + "step": 90780 + }, + { + "entropy": 1.7938583612442016, + "epoch": 0.2814410563261852, + "grad_norm": 3.963634729385376, + "learning_rate": 4.768714752000778e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8582982420921326, + "num_tokens": 109158903.0, + "step": 90790 + }, + { + "entropy": 1.9309082627296448, + "epoch": 0.2814720554512349, + "grad_norm": 9.012663841247559, + "learning_rate": 4.768452147453487e-06, + "loss": 0.5303, + "mean_token_accuracy": 0.8420499324798584, + "num_tokens": 109169777.0, + "step": 90800 + }, + { + "entropy": 1.8647458493709563, + "epoch": 0.2815030545762846, + "grad_norm": 8.505784034729004, + "learning_rate": 4.7681895862849e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8408839210867882, + "num_tokens": 109182314.0, + "step": 90810 + }, + { + "entropy": 1.8462204396724702, + "epoch": 0.2815340537013343, + "grad_norm": 8.286667823791504, + "learning_rate": 4.767927068483076e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8553971752524376, + "num_tokens": 109195725.0, + "step": 90820 + }, + { + "entropy": 1.8974308609962462, + "epoch": 0.281565052826384, + "grad_norm": 9.34967041015625, + "learning_rate": 4.767664594036074e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8490856289863586, + "num_tokens": 109207842.0, + "step": 90830 + }, + { + "entropy": 1.8527883812785149, + "epoch": 0.28159605195143367, + "grad_norm": 3.996321439743042, + "learning_rate": 4.767402162931967e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8528962567448616, + "num_tokens": 109220691.0, + "step": 90840 + }, + { + "entropy": 1.9433230310678482, + "epoch": 0.2816270510764834, + "grad_norm": 7.619417667388916, + "learning_rate": 4.767139775158826e-06, + "loss": 0.5239, + "mean_token_accuracy": 0.843589824438095, + "num_tokens": 109231579.0, + "step": 90850 + }, + { + "entropy": 1.8327447712421416, + "epoch": 0.28165805020153306, + "grad_norm": 8.732206344604492, + "learning_rate": 4.766877430704727e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8554098963737488, + "num_tokens": 109244327.0, + "step": 90860 + }, + { + "entropy": 1.9131013810634614, + "epoch": 0.28168904932658273, + "grad_norm": 8.691461563110352, + "learning_rate": 4.766615129557752e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8473145887255669, + "num_tokens": 109255888.0, + "step": 90870 + }, + { + "entropy": 1.9245838135480882, + "epoch": 0.28172004845163245, + "grad_norm": 7.54715633392334, + "learning_rate": 4.766352871705987e-06, + "loss": 0.5234, + "mean_token_accuracy": 0.8493126124143601, + "num_tokens": 109266869.0, + "step": 90880 + }, + { + "entropy": 1.917286041378975, + "epoch": 0.2817510475766821, + "grad_norm": 4.127012252807617, + "learning_rate": 4.7660906571375246e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8439315423369408, + "num_tokens": 109278743.0, + "step": 90890 + }, + { + "entropy": 1.897385112941265, + "epoch": 0.28178204670173185, + "grad_norm": 8.57779312133789, + "learning_rate": 4.76582848584046e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8471187829971314, + "num_tokens": 109290444.0, + "step": 90900 + }, + { + "entropy": 1.9781205475330352, + "epoch": 0.2818130458267815, + "grad_norm": 9.53032398223877, + "learning_rate": 4.765566357802891e-06, + "loss": 0.5853, + "mean_token_accuracy": 0.8293624818325043, + "num_tokens": 109301356.0, + "step": 90910 + }, + { + "entropy": 1.9003041684627533, + "epoch": 0.28184404495183124, + "grad_norm": 8.813244819641113, + "learning_rate": 4.765304273012924e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8380188256502151, + "num_tokens": 109313531.0, + "step": 90920 + }, + { + "entropy": 1.8687111303210258, + "epoch": 0.2818750440768809, + "grad_norm": 8.043350219726562, + "learning_rate": 4.765042231458668e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8497692689299583, + "num_tokens": 109326158.0, + "step": 90930 + }, + { + "entropy": 1.9804033279418944, + "epoch": 0.28190604320193063, + "grad_norm": 8.524834632873535, + "learning_rate": 4.764780233128236e-06, + "loss": 0.5462, + "mean_token_accuracy": 0.8371674284338951, + "num_tokens": 109337187.0, + "step": 90940 + }, + { + "entropy": 1.813350136578083, + "epoch": 0.2819370423269803, + "grad_norm": 7.912545204162598, + "learning_rate": 4.764518278009748e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8572453141212464, + "num_tokens": 109350343.0, + "step": 90950 + }, + { + "entropy": 1.9478640288114548, + "epoch": 0.28196804145203, + "grad_norm": 7.623589515686035, + "learning_rate": 4.764256366091324e-06, + "loss": 0.5357, + "mean_token_accuracy": 0.8418220117688179, + "num_tokens": 109361694.0, + "step": 90960 + }, + { + "entropy": 1.8554902136325837, + "epoch": 0.2819990405770797, + "grad_norm": 7.590473651885986, + "learning_rate": 4.763994497361095e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8496796250343323, + "num_tokens": 109374068.0, + "step": 90970 + }, + { + "entropy": 1.9092994675040245, + "epoch": 0.2820300397021294, + "grad_norm": 7.852284908294678, + "learning_rate": 4.7637326718071905e-06, + "loss": 0.5231, + "mean_token_accuracy": 0.8353525906801224, + "num_tokens": 109385364.0, + "step": 90980 + }, + { + "entropy": 1.8934466361999511, + "epoch": 0.2820610388271791, + "grad_norm": 9.18950080871582, + "learning_rate": 4.763470889417748e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8570297732949257, + "num_tokens": 109397364.0, + "step": 90990 + }, + { + "entropy": 1.771867723762989, + "epoch": 0.2820920379522288, + "grad_norm": 9.84312629699707, + "learning_rate": 4.763209150180908e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8599537044763566, + "num_tokens": 109410971.0, + "step": 91000 + }, + { + "entropy": 1.9751540750265122, + "epoch": 0.2821230370772785, + "grad_norm": 10.806888580322266, + "learning_rate": 4.762947454084818e-06, + "loss": 0.5376, + "mean_token_accuracy": 0.8323762461543083, + "num_tokens": 109421894.0, + "step": 91010 + }, + { + "entropy": 1.863322387635708, + "epoch": 0.2821540362023282, + "grad_norm": 9.505192756652832, + "learning_rate": 4.7626858011176256e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8461381018161773, + "num_tokens": 109433667.0, + "step": 91020 + }, + { + "entropy": 1.9372095853090285, + "epoch": 0.2821850353273779, + "grad_norm": 9.51073169708252, + "learning_rate": 4.7624241912674885e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8425448566675187, + "num_tokens": 109445506.0, + "step": 91030 + }, + { + "entropy": 1.8736591801047324, + "epoch": 0.2822160344524276, + "grad_norm": 9.874513626098633, + "learning_rate": 4.762162624522564e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8544930562376976, + "num_tokens": 109456672.0, + "step": 91040 + }, + { + "entropy": 1.9017417326569557, + "epoch": 0.28224703357747727, + "grad_norm": 8.883088111877441, + "learning_rate": 4.761901100871018e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8446088701486587, + "num_tokens": 109468440.0, + "step": 91050 + }, + { + "entropy": 1.8893210887908936, + "epoch": 0.282278032702527, + "grad_norm": 10.458946228027344, + "learning_rate": 4.7616396203010165e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8509502872824669, + "num_tokens": 109480575.0, + "step": 91060 + }, + { + "entropy": 1.9682177215814591, + "epoch": 0.28230903182757666, + "grad_norm": 7.0361151695251465, + "learning_rate": 4.761378182800733e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8456061512231827, + "num_tokens": 109491254.0, + "step": 91070 + }, + { + "entropy": 1.9273748084902764, + "epoch": 0.2823400309526264, + "grad_norm": 7.263545989990234, + "learning_rate": 4.761116788358349e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8323819488286972, + "num_tokens": 109503011.0, + "step": 91080 + }, + { + "entropy": 1.9739017739892006, + "epoch": 0.28237103007767606, + "grad_norm": 6.576552391052246, + "learning_rate": 4.760855436962041e-06, + "loss": 0.5487, + "mean_token_accuracy": 0.8396146357059479, + "num_tokens": 109514103.0, + "step": 91090 + }, + { + "entropy": 1.9086976170539856, + "epoch": 0.2824020292027258, + "grad_norm": 8.7454833984375, + "learning_rate": 4.760594128599999e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.848183062672615, + "num_tokens": 109526563.0, + "step": 91100 + }, + { + "entropy": 1.9339544415473937, + "epoch": 0.28243302832777545, + "grad_norm": 9.544751167297363, + "learning_rate": 4.760332863260414e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.8470303192734718, + "num_tokens": 109538093.0, + "step": 91110 + }, + { + "entropy": 1.916434782743454, + "epoch": 0.2824640274528251, + "grad_norm": 4.14872407913208, + "learning_rate": 4.7600716409314804e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8473877355456352, + "num_tokens": 109549378.0, + "step": 91120 + }, + { + "entropy": 1.766272282600403, + "epoch": 0.28249502657787484, + "grad_norm": 8.772634506225586, + "learning_rate": 4.7598104616014005e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8664262443780899, + "num_tokens": 109563346.0, + "step": 91130 + }, + { + "entropy": 1.8620783016085625, + "epoch": 0.2825260257029245, + "grad_norm": 5.1654887199401855, + "learning_rate": 4.759549325258377e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8528469145298004, + "num_tokens": 109575000.0, + "step": 91140 + }, + { + "entropy": 1.827536989748478, + "epoch": 0.28255702482797423, + "grad_norm": 4.431619644165039, + "learning_rate": 4.759288231890621e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.849473150074482, + "num_tokens": 109587804.0, + "step": 91150 + }, + { + "entropy": 1.865426352620125, + "epoch": 0.2825880239530239, + "grad_norm": 7.909317493438721, + "learning_rate": 4.759027181486346e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8499869540333748, + "num_tokens": 109600294.0, + "step": 91160 + }, + { + "entropy": 1.9297534614801406, + "epoch": 0.28261902307807363, + "grad_norm": 9.111772537231445, + "learning_rate": 4.758766174033769e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.8378744632005691, + "num_tokens": 109611503.0, + "step": 91170 + }, + { + "entropy": 1.8704712957143783, + "epoch": 0.2826500222031233, + "grad_norm": 4.173186779022217, + "learning_rate": 4.758505209521114e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.855632272362709, + "num_tokens": 109624455.0, + "step": 91180 + }, + { + "entropy": 1.9280048042535782, + "epoch": 0.282681021328173, + "grad_norm": 8.439362525939941, + "learning_rate": 4.758244287936609e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.8422107562422753, + "num_tokens": 109635564.0, + "step": 91190 + }, + { + "entropy": 1.8654653757810593, + "epoch": 0.2827120204532227, + "grad_norm": 4.882840156555176, + "learning_rate": 4.757983409268485e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8396533340215683, + "num_tokens": 109648550.0, + "step": 91200 + }, + { + "entropy": 1.8784069836139679, + "epoch": 0.2827430195782724, + "grad_norm": 7.505212783813477, + "learning_rate": 4.757722573504979e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8495945394039154, + "num_tokens": 109660589.0, + "step": 91210 + }, + { + "entropy": 1.9778222680091857, + "epoch": 0.2827740187033221, + "grad_norm": 10.172992706298828, + "learning_rate": 4.757461780634332e-06, + "loss": 0.5444, + "mean_token_accuracy": 0.8417175248265266, + "num_tokens": 109671973.0, + "step": 91220 + }, + { + "entropy": 1.8196046486496926, + "epoch": 0.2828050178283718, + "grad_norm": 8.086063385009766, + "learning_rate": 4.757201030644789e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8687186688184738, + "num_tokens": 109684750.0, + "step": 91230 + }, + { + "entropy": 1.8343065902590752, + "epoch": 0.2828360169534215, + "grad_norm": 9.45622730255127, + "learning_rate": 4.7569403235246005e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8519006326794625, + "num_tokens": 109697003.0, + "step": 91240 + }, + { + "entropy": 1.838625229895115, + "epoch": 0.2828670160784712, + "grad_norm": 3.578289270401001, + "learning_rate": 4.756679659262021e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8600620925426483, + "num_tokens": 109709959.0, + "step": 91250 + }, + { + "entropy": 1.9180777609348296, + "epoch": 0.28289801520352087, + "grad_norm": 9.219250679016113, + "learning_rate": 4.756419037845309e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8345475569367409, + "num_tokens": 109721437.0, + "step": 91260 + }, + { + "entropy": 1.914302496612072, + "epoch": 0.2829290143285706, + "grad_norm": 3.8321692943573, + "learning_rate": 4.756158459262729e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8463889390230179, + "num_tokens": 109733406.0, + "step": 91270 + }, + { + "entropy": 1.841091763973236, + "epoch": 0.28296001345362026, + "grad_norm": 8.805869102478027, + "learning_rate": 4.755897923502547e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8465237930417061, + "num_tokens": 109745414.0, + "step": 91280 + }, + { + "entropy": 1.9549955561757089, + "epoch": 0.28299101257867, + "grad_norm": 4.771459102630615, + "learning_rate": 4.755637430553038e-06, + "loss": 0.5449, + "mean_token_accuracy": 0.8281950682401658, + "num_tokens": 109756473.0, + "step": 91290 + }, + { + "entropy": 1.9814363867044449, + "epoch": 0.28302201170371966, + "grad_norm": 9.486200332641602, + "learning_rate": 4.755376980402479e-06, + "loss": 0.5201, + "mean_token_accuracy": 0.8435119092464447, + "num_tokens": 109767033.0, + "step": 91300 + }, + { + "entropy": 1.8801933750510216, + "epoch": 0.2830530108287694, + "grad_norm": 8.066265106201172, + "learning_rate": 4.755116573039149e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8474684238433838, + "num_tokens": 109779009.0, + "step": 91310 + }, + { + "entropy": 1.8336575701832771, + "epoch": 0.28308400995381905, + "grad_norm": 9.574748039245605, + "learning_rate": 4.754856208451337e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8556732803583145, + "num_tokens": 109791334.0, + "step": 91320 + }, + { + "entropy": 1.9349465638399124, + "epoch": 0.2831150090788688, + "grad_norm": 8.522025108337402, + "learning_rate": 4.75459588662733e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.851339441537857, + "num_tokens": 109802925.0, + "step": 91330 + }, + { + "entropy": 1.8921233609318733, + "epoch": 0.28314600820391844, + "grad_norm": 8.379258155822754, + "learning_rate": 4.754335607555427e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8478038221597671, + "num_tokens": 109814130.0, + "step": 91340 + }, + { + "entropy": 1.8570956602692603, + "epoch": 0.28317700732896817, + "grad_norm": 9.117257118225098, + "learning_rate": 4.754075371223925e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8518408805131912, + "num_tokens": 109826200.0, + "step": 91350 + }, + { + "entropy": 1.91699261367321, + "epoch": 0.28320800645401784, + "grad_norm": 8.286026000976562, + "learning_rate": 4.753815177621128e-06, + "loss": 0.539, + "mean_token_accuracy": 0.8423243030905724, + "num_tokens": 109837290.0, + "step": 91360 + }, + { + "entropy": 1.940002153813839, + "epoch": 0.2832390055790675, + "grad_norm": 9.196231842041016, + "learning_rate": 4.753555026735344e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.8430562511086463, + "num_tokens": 109848408.0, + "step": 91370 + }, + { + "entropy": 1.9086173102259636, + "epoch": 0.28327000470411723, + "grad_norm": 7.4503560066223145, + "learning_rate": 4.753294918554887e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8556687757372856, + "num_tokens": 109859863.0, + "step": 91380 + }, + { + "entropy": 1.8716721430420875, + "epoch": 0.2833010038291669, + "grad_norm": 7.7145795822143555, + "learning_rate": 4.753034853068076e-06, + "loss": 0.5203, + "mean_token_accuracy": 0.8355464920401573, + "num_tokens": 109872862.0, + "step": 91390 + }, + { + "entropy": 1.959967464208603, + "epoch": 0.2833320029542166, + "grad_norm": 8.618791580200195, + "learning_rate": 4.752774830263229e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8505872398614883, + "num_tokens": 109884502.0, + "step": 91400 + }, + { + "entropy": 1.8670057207345963, + "epoch": 0.2833630020792663, + "grad_norm": 2.768303155899048, + "learning_rate": 4.7525148501286754e-06, + "loss": 0.514, + "mean_token_accuracy": 0.841192239522934, + "num_tokens": 109896997.0, + "step": 91410 + }, + { + "entropy": 1.942854182422161, + "epoch": 0.283394001204316, + "grad_norm": 9.517045974731445, + "learning_rate": 4.752254912652746e-06, + "loss": 0.5407, + "mean_token_accuracy": 0.8302263349294663, + "num_tokens": 109908513.0, + "step": 91420 + }, + { + "entropy": 1.8470983251929283, + "epoch": 0.2834250003293657, + "grad_norm": 3.9292147159576416, + "learning_rate": 4.751995017823772e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.847145140171051, + "num_tokens": 109921038.0, + "step": 91430 + }, + { + "entropy": 1.889025342464447, + "epoch": 0.2834559994544154, + "grad_norm": 7.8180952072143555, + "learning_rate": 4.751735165630099e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8388582825660705, + "num_tokens": 109933509.0, + "step": 91440 + }, + { + "entropy": 1.926556906104088, + "epoch": 0.2834869985794651, + "grad_norm": 9.13294506072998, + "learning_rate": 4.751475356060067e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8347544342279434, + "num_tokens": 109945589.0, + "step": 91450 + }, + { + "entropy": 1.8450331330299377, + "epoch": 0.2835179977045148, + "grad_norm": 4.0597100257873535, + "learning_rate": 4.751215589102026e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8560441970825196, + "num_tokens": 109958238.0, + "step": 91460 + }, + { + "entropy": 1.8735743075609208, + "epoch": 0.28354899682956447, + "grad_norm": 9.06523323059082, + "learning_rate": 4.75095586474433e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8512053444981575, + "num_tokens": 109970897.0, + "step": 91470 + }, + { + "entropy": 1.9218522995710372, + "epoch": 0.2835799959546142, + "grad_norm": 8.556892395019531, + "learning_rate": 4.750696182975335e-06, + "loss": 0.5451, + "mean_token_accuracy": 0.8374419540166855, + "num_tokens": 109982852.0, + "step": 91480 + }, + { + "entropy": 1.8760737299919128, + "epoch": 0.28361099507966386, + "grad_norm": 7.687764644622803, + "learning_rate": 4.750436543783403e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8507323205471039, + "num_tokens": 109993887.0, + "step": 91490 + }, + { + "entropy": 1.8525379657745362, + "epoch": 0.2836419942047136, + "grad_norm": 8.57304573059082, + "learning_rate": 4.750176947156903e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8570681139826775, + "num_tokens": 110005619.0, + "step": 91500 + }, + { + "entropy": 1.9060514703392983, + "epoch": 0.28367299332976326, + "grad_norm": 6.858503818511963, + "learning_rate": 4.749917393084203e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8408376634120941, + "num_tokens": 110018405.0, + "step": 91510 + }, + { + "entropy": 1.8777115240693092, + "epoch": 0.283703992454813, + "grad_norm": 7.576192855834961, + "learning_rate": 4.74965788155368e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8432316944003105, + "num_tokens": 110030683.0, + "step": 91520 + }, + { + "entropy": 1.6769248962402343, + "epoch": 0.28373499157986265, + "grad_norm": 8.123210906982422, + "learning_rate": 4.749398412553713e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8673816755414009, + "num_tokens": 110045730.0, + "step": 91530 + }, + { + "entropy": 1.8561891838908195, + "epoch": 0.2837659907049124, + "grad_norm": 4.347743511199951, + "learning_rate": 4.749138986072685e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.849002268910408, + "num_tokens": 110058293.0, + "step": 91540 + }, + { + "entropy": 1.87572433501482, + "epoch": 0.28379698982996204, + "grad_norm": 8.649298667907715, + "learning_rate": 4.748879602098988e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8493537470698357, + "num_tokens": 110070569.0, + "step": 91550 + }, + { + "entropy": 1.8268972262740135, + "epoch": 0.28382798895501177, + "grad_norm": 3.4365265369415283, + "learning_rate": 4.748620260621013e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8539086386561394, + "num_tokens": 110083590.0, + "step": 91560 + }, + { + "entropy": 1.8347914576530457, + "epoch": 0.28385898808006144, + "grad_norm": 7.83189582824707, + "learning_rate": 4.748360961627159e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8629049167037011, + "num_tokens": 110096116.0, + "step": 91570 + }, + { + "entropy": 1.9295135840773583, + "epoch": 0.28388998720511116, + "grad_norm": 10.846009254455566, + "learning_rate": 4.748101705105827e-06, + "loss": 0.537, + "mean_token_accuracy": 0.8269555777311325, + "num_tokens": 110107595.0, + "step": 91580 + }, + { + "entropy": 1.9408522367477417, + "epoch": 0.28392098633016083, + "grad_norm": 7.8314056396484375, + "learning_rate": 4.747842491045421e-06, + "loss": 0.5202, + "mean_token_accuracy": 0.8457581490278244, + "num_tokens": 110118966.0, + "step": 91590 + }, + { + "entropy": 1.8445065826177598, + "epoch": 0.28395198545521055, + "grad_norm": 9.18549633026123, + "learning_rate": 4.747583319434357e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8425332620739937, + "num_tokens": 110131790.0, + "step": 91600 + }, + { + "entropy": 1.8247267931699753, + "epoch": 0.2839829845802602, + "grad_norm": 9.114663124084473, + "learning_rate": 4.747324190261046e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8539474830031395, + "num_tokens": 110144807.0, + "step": 91610 + }, + { + "entropy": 1.859029544889927, + "epoch": 0.2840139837053099, + "grad_norm": 9.042014122009277, + "learning_rate": 4.74706510351391e-06, + "loss": 0.5174, + "mean_token_accuracy": 0.8466860115528106, + "num_tokens": 110157391.0, + "step": 91620 + }, + { + "entropy": 1.8583563596010209, + "epoch": 0.2840449828303596, + "grad_norm": 8.334424018859863, + "learning_rate": 4.746806059181373e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8355572551488877, + "num_tokens": 110170333.0, + "step": 91630 + }, + { + "entropy": 1.8378588289022446, + "epoch": 0.2840759819554093, + "grad_norm": 8.690698623657227, + "learning_rate": 4.746547057251862e-06, + "loss": 0.5242, + "mean_token_accuracy": 0.8404997318983078, + "num_tokens": 110183049.0, + "step": 91640 + }, + { + "entropy": 1.8678978830575943, + "epoch": 0.284106981080459, + "grad_norm": 4.2940473556518555, + "learning_rate": 4.7462880977138126e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8339786469936371, + "num_tokens": 110196329.0, + "step": 91650 + }, + { + "entropy": 1.7718224942684173, + "epoch": 0.2841379802055087, + "grad_norm": 8.046103477478027, + "learning_rate": 4.74602918055566e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8618276312947273, + "num_tokens": 110209525.0, + "step": 91660 + }, + { + "entropy": 1.9796247810125351, + "epoch": 0.2841689793305584, + "grad_norm": 6.6610870361328125, + "learning_rate": 4.745770305765847e-06, + "loss": 0.5604, + "mean_token_accuracy": 0.8357976496219635, + "num_tokens": 110220480.0, + "step": 91670 + }, + { + "entropy": 1.9403215855360032, + "epoch": 0.28419997845560807, + "grad_norm": 8.975007057189941, + "learning_rate": 4.745511473332818e-06, + "loss": 0.5478, + "mean_token_accuracy": 0.8363938242197037, + "num_tokens": 110231525.0, + "step": 91680 + }, + { + "entropy": 1.891119834780693, + "epoch": 0.2842309775806578, + "grad_norm": 9.538240432739258, + "learning_rate": 4.745252683245027e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8425269886851311, + "num_tokens": 110243198.0, + "step": 91690 + }, + { + "entropy": 1.8871520176529883, + "epoch": 0.28426197670570746, + "grad_norm": 9.343374252319336, + "learning_rate": 4.744993935490928e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8517936706542969, + "num_tokens": 110255593.0, + "step": 91700 + }, + { + "entropy": 1.8104426577687263, + "epoch": 0.2842929758307572, + "grad_norm": 8.839599609375, + "learning_rate": 4.744735230058977e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8551136195659638, + "num_tokens": 110268721.0, + "step": 91710 + }, + { + "entropy": 1.9426888212561608, + "epoch": 0.28432397495580686, + "grad_norm": 8.519753456115723, + "learning_rate": 4.744476566937642e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8359502226114273, + "num_tokens": 110280087.0, + "step": 91720 + }, + { + "entropy": 1.9278660222887993, + "epoch": 0.2843549740808566, + "grad_norm": 8.69306468963623, + "learning_rate": 4.74421794611539e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8463233426213265, + "num_tokens": 110291541.0, + "step": 91730 + }, + { + "entropy": 1.9058746635913848, + "epoch": 0.28438597320590625, + "grad_norm": 4.05926513671875, + "learning_rate": 4.743959367580693e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8437731295824051, + "num_tokens": 110303659.0, + "step": 91740 + }, + { + "entropy": 1.9244377925992011, + "epoch": 0.284416972330956, + "grad_norm": 8.101446151733398, + "learning_rate": 4.743700831322029e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8459056586027145, + "num_tokens": 110315031.0, + "step": 91750 + }, + { + "entropy": 1.9023540601134301, + "epoch": 0.28444797145600564, + "grad_norm": 6.504104137420654, + "learning_rate": 4.74344233732788e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8424684196710587, + "num_tokens": 110327386.0, + "step": 91760 + }, + { + "entropy": 1.9569046169519424, + "epoch": 0.28447897058105537, + "grad_norm": 9.210738182067871, + "learning_rate": 4.743183885586729e-06, + "loss": 0.537, + "mean_token_accuracy": 0.8266186848282814, + "num_tokens": 110338648.0, + "step": 91770 + }, + { + "entropy": 1.8510024085640908, + "epoch": 0.28450996970610504, + "grad_norm": 8.181108474731445, + "learning_rate": 4.74292547608707e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8562486320734024, + "num_tokens": 110350736.0, + "step": 91780 + }, + { + "entropy": 1.9186692774295806, + "epoch": 0.28454096883115476, + "grad_norm": 9.964049339294434, + "learning_rate": 4.7426671088173945e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8494500458240509, + "num_tokens": 110362275.0, + "step": 91790 + }, + { + "entropy": 1.7676223665475845, + "epoch": 0.28457196795620443, + "grad_norm": 2.426920175552368, + "learning_rate": 4.742408783766203e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.863847254216671, + "num_tokens": 110375617.0, + "step": 91800 + }, + { + "entropy": 1.8544678494334221, + "epoch": 0.28460296708125415, + "grad_norm": 9.724370956420898, + "learning_rate": 4.742150500922e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8445950224995613, + "num_tokens": 110387605.0, + "step": 91810 + }, + { + "entropy": 1.8677785605192185, + "epoch": 0.2846339662063038, + "grad_norm": 8.32010269165039, + "learning_rate": 4.741892260273291e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.845884545147419, + "num_tokens": 110399589.0, + "step": 91820 + }, + { + "entropy": 1.8281821206212043, + "epoch": 0.28466496533135355, + "grad_norm": 8.002693176269531, + "learning_rate": 4.741634061808588e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8603383213281631, + "num_tokens": 110412247.0, + "step": 91830 + }, + { + "entropy": 1.8158651649951936, + "epoch": 0.2846959644564032, + "grad_norm": 8.582942008972168, + "learning_rate": 4.741375905516411e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.846869707107544, + "num_tokens": 110424355.0, + "step": 91840 + }, + { + "entropy": 1.7574502289295197, + "epoch": 0.28472696358145294, + "grad_norm": 4.905912399291992, + "learning_rate": 4.741117791385276e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8498795077204704, + "num_tokens": 110437595.0, + "step": 91850 + }, + { + "entropy": 1.898454374074936, + "epoch": 0.2847579627065026, + "grad_norm": 7.610432147979736, + "learning_rate": 4.740859719403713e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8487632632255554, + "num_tokens": 110448902.0, + "step": 91860 + }, + { + "entropy": 1.7625425659120082, + "epoch": 0.2847889618315523, + "grad_norm": 4.429518699645996, + "learning_rate": 4.740601689560249e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8575651466846466, + "num_tokens": 110461975.0, + "step": 91870 + }, + { + "entropy": 1.8556936159729958, + "epoch": 0.284819960956602, + "grad_norm": 9.066593170166016, + "learning_rate": 4.74034370184342e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.847280016541481, + "num_tokens": 110473152.0, + "step": 91880 + }, + { + "entropy": 1.880302868783474, + "epoch": 0.28485096008165167, + "grad_norm": 9.606684684753418, + "learning_rate": 4.740085756241761e-06, + "loss": 0.5205, + "mean_token_accuracy": 0.831180626153946, + "num_tokens": 110484515.0, + "step": 91890 + }, + { + "entropy": 1.768574671447277, + "epoch": 0.2848819592067014, + "grad_norm": 3.897165298461914, + "learning_rate": 4.7398278527438175e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8571337282657623, + "num_tokens": 110496957.0, + "step": 91900 + }, + { + "entropy": 1.791210974752903, + "epoch": 0.28491295833175106, + "grad_norm": 9.069717407226562, + "learning_rate": 4.739569991338137e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.851265873014927, + "num_tokens": 110509726.0, + "step": 91910 + }, + { + "entropy": 1.807637719810009, + "epoch": 0.2849439574568008, + "grad_norm": 7.900302886962891, + "learning_rate": 4.739312172013269e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8464860022068024, + "num_tokens": 110522419.0, + "step": 91920 + }, + { + "entropy": 1.8903899610042572, + "epoch": 0.28497495658185046, + "grad_norm": 6.663649559020996, + "learning_rate": 4.7390543947577705e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8373034983873368, + "num_tokens": 110533897.0, + "step": 91930 + }, + { + "entropy": 1.8422768160700798, + "epoch": 0.2850059557069002, + "grad_norm": 7.8179826736450195, + "learning_rate": 4.7387966595602014e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8479554176330566, + "num_tokens": 110546623.0, + "step": 91940 + }, + { + "entropy": 1.7804616317152977, + "epoch": 0.28503695483194985, + "grad_norm": 8.71053409576416, + "learning_rate": 4.738538966409126e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8510273039340973, + "num_tokens": 110559478.0, + "step": 91950 + }, + { + "entropy": 1.859451201558113, + "epoch": 0.2850679539569996, + "grad_norm": 4.298933029174805, + "learning_rate": 4.738281315293114e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8425625443458558, + "num_tokens": 110571704.0, + "step": 91960 + }, + { + "entropy": 1.850043423473835, + "epoch": 0.28509895308204924, + "grad_norm": 7.2485480308532715, + "learning_rate": 4.738023706200738e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.851256474852562, + "num_tokens": 110583945.0, + "step": 91970 + }, + { + "entropy": 1.9050755083560944, + "epoch": 0.28512995220709897, + "grad_norm": 9.636146545410156, + "learning_rate": 4.737766139120575e-06, + "loss": 0.5271, + "mean_token_accuracy": 0.832217988371849, + "num_tokens": 110595560.0, + "step": 91980 + }, + { + "entropy": 1.9206005334854126, + "epoch": 0.28516095133214864, + "grad_norm": 8.515999794006348, + "learning_rate": 4.73750861404121e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8435544535517693, + "num_tokens": 110606577.0, + "step": 91990 + }, + { + "entropy": 1.869492068886757, + "epoch": 0.28519195045719836, + "grad_norm": 8.025008201599121, + "learning_rate": 4.737251130951226e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8449269160628319, + "num_tokens": 110618391.0, + "step": 92000 + }, + { + "entropy": 1.845357683300972, + "epoch": 0.28522294958224803, + "grad_norm": 8.134385108947754, + "learning_rate": 4.736993689839216e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8439294084906578, + "num_tokens": 110631021.0, + "step": 92010 + }, + { + "entropy": 1.8097378730773925, + "epoch": 0.28525394870729776, + "grad_norm": 10.888206481933594, + "learning_rate": 4.736736290693772e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8568091303110122, + "num_tokens": 110643427.0, + "step": 92020 + }, + { + "entropy": 1.9278713300824166, + "epoch": 0.2852849478323474, + "grad_norm": 8.461037635803223, + "learning_rate": 4.736478933503496e-06, + "loss": 0.524, + "mean_token_accuracy": 0.8417813435196877, + "num_tokens": 110654675.0, + "step": 92030 + }, + { + "entropy": 1.8513173662126063, + "epoch": 0.28531594695739715, + "grad_norm": 8.097729682922363, + "learning_rate": 4.7362216182569906e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8500329554080963, + "num_tokens": 110667292.0, + "step": 92040 + }, + { + "entropy": 1.9195297732949257, + "epoch": 0.2853469460824468, + "grad_norm": 8.111653327941895, + "learning_rate": 4.735964344942864e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8443229928612709, + "num_tokens": 110678569.0, + "step": 92050 + }, + { + "entropy": 1.8122631691396236, + "epoch": 0.28537794520749654, + "grad_norm": 8.886815071105957, + "learning_rate": 4.735707113549729e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8556604027748108, + "num_tokens": 110691490.0, + "step": 92060 + }, + { + "entropy": 1.884895347058773, + "epoch": 0.2854089443325462, + "grad_norm": 10.424093246459961, + "learning_rate": 4.735449924066201e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8439397796988487, + "num_tokens": 110703884.0, + "step": 92070 + }, + { + "entropy": 1.9233110576868058, + "epoch": 0.28543994345759593, + "grad_norm": 9.017784118652344, + "learning_rate": 4.735192776480902e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8422451555728913, + "num_tokens": 110716070.0, + "step": 92080 + }, + { + "entropy": 1.8380617439746856, + "epoch": 0.2854709425826456, + "grad_norm": 8.3659086227417, + "learning_rate": 4.734935670782457e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8482602387666702, + "num_tokens": 110728016.0, + "step": 92090 + }, + { + "entropy": 1.9306314051151277, + "epoch": 0.2855019417076953, + "grad_norm": 9.452094078063965, + "learning_rate": 4.7346786069594955e-06, + "loss": 0.5179, + "mean_token_accuracy": 0.8452840596437454, + "num_tokens": 110739218.0, + "step": 92100 + }, + { + "entropy": 1.7319670930504798, + "epoch": 0.285532940832745, + "grad_norm": 7.807188034057617, + "learning_rate": 4.734421585000652e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8550496265292168, + "num_tokens": 110752633.0, + "step": 92110 + }, + { + "entropy": 1.9415203884243966, + "epoch": 0.28556393995779467, + "grad_norm": 8.066431999206543, + "learning_rate": 4.7341646048945645e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.8382659062743187, + "num_tokens": 110764376.0, + "step": 92120 + }, + { + "entropy": 1.801632682979107, + "epoch": 0.2855949390828444, + "grad_norm": 6.095812797546387, + "learning_rate": 4.733907666629874e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8537136375904083, + "num_tokens": 110776672.0, + "step": 92130 + }, + { + "entropy": 1.8623081862926483, + "epoch": 0.28562593820789406, + "grad_norm": 8.639974594116211, + "learning_rate": 4.733650770195231e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.848575672507286, + "num_tokens": 110788872.0, + "step": 92140 + }, + { + "entropy": 1.7804166600108147, + "epoch": 0.2856569373329438, + "grad_norm": 3.511837959289551, + "learning_rate": 4.733393915579283e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8584547877311707, + "num_tokens": 110801677.0, + "step": 92150 + }, + { + "entropy": 1.8777981102466583, + "epoch": 0.28568793645799345, + "grad_norm": 8.046910285949707, + "learning_rate": 4.733137102770687e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8518513917922974, + "num_tokens": 110813470.0, + "step": 92160 + }, + { + "entropy": 1.9628359898924828, + "epoch": 0.2857189355830432, + "grad_norm": 9.675116539001465, + "learning_rate": 4.732880331758104e-06, + "loss": 0.5357, + "mean_token_accuracy": 0.8325561985373497, + "num_tokens": 110824903.0, + "step": 92170 + }, + { + "entropy": 1.9065320461988449, + "epoch": 0.28574993470809285, + "grad_norm": 9.257568359375, + "learning_rate": 4.732623602530196e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8543298453092575, + "num_tokens": 110836341.0, + "step": 92180 + }, + { + "entropy": 1.8834924966096878, + "epoch": 0.28578093383314257, + "grad_norm": 3.791853189468384, + "learning_rate": 4.732366915075634e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.8327832892537117, + "num_tokens": 110848011.0, + "step": 92190 + }, + { + "entropy": 1.9523876518011094, + "epoch": 0.28581193295819224, + "grad_norm": 8.7767972946167, + "learning_rate": 4.732110269383088e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.8447451606392861, + "num_tokens": 110858754.0, + "step": 92200 + }, + { + "entropy": 1.9425658136606216, + "epoch": 0.28584293208324196, + "grad_norm": 6.863429069519043, + "learning_rate": 4.731853665441238e-06, + "loss": 0.5456, + "mean_token_accuracy": 0.835406644642353, + "num_tokens": 110869747.0, + "step": 92210 + }, + { + "entropy": 1.8428900748491288, + "epoch": 0.28587393120829163, + "grad_norm": 8.834907531738281, + "learning_rate": 4.731597103238762e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.844575323164463, + "num_tokens": 110881672.0, + "step": 92220 + }, + { + "entropy": 1.9306576699018478, + "epoch": 0.28590493033334136, + "grad_norm": 9.883462905883789, + "learning_rate": 4.731340582764347e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8402498573064804, + "num_tokens": 110893015.0, + "step": 92230 + }, + { + "entropy": 1.9201725766062736, + "epoch": 0.285935929458391, + "grad_norm": 6.858789443969727, + "learning_rate": 4.731084104006684e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.841082276403904, + "num_tokens": 110905179.0, + "step": 92240 + }, + { + "entropy": 1.8505833253264428, + "epoch": 0.28596692858344075, + "grad_norm": 7.575535297393799, + "learning_rate": 4.730827666954467e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8532150328159332, + "num_tokens": 110918892.0, + "step": 92250 + }, + { + "entropy": 1.9996703028678895, + "epoch": 0.2859979277084904, + "grad_norm": 9.380273818969727, + "learning_rate": 4.730571271596393e-06, + "loss": 0.547, + "mean_token_accuracy": 0.8315587803721428, + "num_tokens": 110929571.0, + "step": 92260 + }, + { + "entropy": 1.724966013431549, + "epoch": 0.28602892683354014, + "grad_norm": 8.106704711914062, + "learning_rate": 4.730314917921165e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8658282339572907, + "num_tokens": 110942972.0, + "step": 92270 + }, + { + "entropy": 1.912028570473194, + "epoch": 0.2860599259585898, + "grad_norm": 3.8293659687042236, + "learning_rate": 4.730058605917492e-06, + "loss": 0.489, + "mean_token_accuracy": 0.847729179263115, + "num_tokens": 110954086.0, + "step": 92280 + }, + { + "entropy": 1.9026774257421493, + "epoch": 0.28609092508363954, + "grad_norm": 3.5246219635009766, + "learning_rate": 4.729802335574084e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8511073097586632, + "num_tokens": 110965849.0, + "step": 92290 + }, + { + "entropy": 1.839532507956028, + "epoch": 0.2861219242086892, + "grad_norm": 7.714004039764404, + "learning_rate": 4.729546106879656e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8525758549571038, + "num_tokens": 110978775.0, + "step": 92300 + }, + { + "entropy": 1.8683712184429169, + "epoch": 0.28615292333373893, + "grad_norm": 4.254873752593994, + "learning_rate": 4.729289919822929e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8314826056361199, + "num_tokens": 110990837.0, + "step": 92310 + }, + { + "entropy": 1.9552190572023391, + "epoch": 0.2861839224587886, + "grad_norm": 8.179940223693848, + "learning_rate": 4.729033774392628e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8429278552532196, + "num_tokens": 111002090.0, + "step": 92320 + }, + { + "entropy": 1.8049072623252869, + "epoch": 0.2862149215838383, + "grad_norm": 3.7070064544677734, + "learning_rate": 4.728777670577479e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8596838563680649, + "num_tokens": 111014690.0, + "step": 92330 + }, + { + "entropy": 1.865850919485092, + "epoch": 0.286245920708888, + "grad_norm": 9.017657279968262, + "learning_rate": 4.7285216083662165e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.84141965508461, + "num_tokens": 111026626.0, + "step": 92340 + }, + { + "entropy": 1.802014322578907, + "epoch": 0.28627691983393766, + "grad_norm": 4.565001010894775, + "learning_rate": 4.728265587747578e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8524199083447457, + "num_tokens": 111039897.0, + "step": 92350 + }, + { + "entropy": 1.8838436871767044, + "epoch": 0.2863079189589874, + "grad_norm": 10.405354499816895, + "learning_rate": 4.728009608710304e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8382696464657784, + "num_tokens": 111052245.0, + "step": 92360 + }, + { + "entropy": 1.8829987928271295, + "epoch": 0.28633891808403705, + "grad_norm": 8.90793514251709, + "learning_rate": 4.727753671243139e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8499641686677932, + "num_tokens": 111063937.0, + "step": 92370 + }, + { + "entropy": 1.8485404312610627, + "epoch": 0.2863699172090868, + "grad_norm": 7.828176975250244, + "learning_rate": 4.727497775334834e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8384749412536621, + "num_tokens": 111076807.0, + "step": 92380 + }, + { + "entropy": 1.9129365399479865, + "epoch": 0.28640091633413645, + "grad_norm": 10.41478443145752, + "learning_rate": 4.727241920974142e-06, + "loss": 0.5466, + "mean_token_accuracy": 0.8334106773138046, + "num_tokens": 111089253.0, + "step": 92390 + }, + { + "entropy": 1.9112588971853257, + "epoch": 0.28643191545918617, + "grad_norm": 9.177438735961914, + "learning_rate": 4.726986108149824e-06, + "loss": 0.5223, + "mean_token_accuracy": 0.8404966652393341, + "num_tokens": 111100456.0, + "step": 92400 + }, + { + "entropy": 1.9098269432783126, + "epoch": 0.28646291458423584, + "grad_norm": 7.330765724182129, + "learning_rate": 4.7267303368506395e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8523991569876671, + "num_tokens": 111111456.0, + "step": 92410 + }, + { + "entropy": 1.9250774174928664, + "epoch": 0.28649391370928556, + "grad_norm": 8.76259708404541, + "learning_rate": 4.726474607065357e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8517454132437706, + "num_tokens": 111122408.0, + "step": 92420 + }, + { + "entropy": 1.8456337764859199, + "epoch": 0.28652491283433523, + "grad_norm": 8.982091903686523, + "learning_rate": 4.726218918782747e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8434979751706123, + "num_tokens": 111135014.0, + "step": 92430 + }, + { + "entropy": 1.8489980950951577, + "epoch": 0.28655591195938496, + "grad_norm": 8.749741554260254, + "learning_rate": 4.725963271991586e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8554965123534203, + "num_tokens": 111147294.0, + "step": 92440 + }, + { + "entropy": 1.9318351566791534, + "epoch": 0.2865869110844346, + "grad_norm": 8.458632469177246, + "learning_rate": 4.725707666680653e-06, + "loss": 0.5415, + "mean_token_accuracy": 0.8387140676379203, + "num_tokens": 111158516.0, + "step": 92450 + }, + { + "entropy": 1.8340228885412215, + "epoch": 0.28661791020948435, + "grad_norm": 3.4830410480499268, + "learning_rate": 4.72545210283873e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8590243220329284, + "num_tokens": 111171043.0, + "step": 92460 + }, + { + "entropy": 1.8585921421647071, + "epoch": 0.286648909334534, + "grad_norm": 10.120370864868164, + "learning_rate": 4.725196580454608e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8484177976846695, + "num_tokens": 111183642.0, + "step": 92470 + }, + { + "entropy": 1.924575427174568, + "epoch": 0.28667990845958374, + "grad_norm": 9.668998718261719, + "learning_rate": 4.724941099517078e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8406669244170188, + "num_tokens": 111195086.0, + "step": 92480 + }, + { + "entropy": 1.8797391682863236, + "epoch": 0.2867109075846334, + "grad_norm": 7.982089996337891, + "learning_rate": 4.724685660014936e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8439215019345283, + "num_tokens": 111206895.0, + "step": 92490 + }, + { + "entropy": 1.906739890575409, + "epoch": 0.28674190670968314, + "grad_norm": 8.40061092376709, + "learning_rate": 4.724430261936984e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8415932491421699, + "num_tokens": 111218419.0, + "step": 92500 + }, + { + "entropy": 1.7755264952778815, + "epoch": 0.2867729058347328, + "grad_norm": 8.411188125610352, + "learning_rate": 4.724174905272025e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8556288599967956, + "num_tokens": 111232066.0, + "step": 92510 + }, + { + "entropy": 1.8620268389582635, + "epoch": 0.28680390495978253, + "grad_norm": 3.8714070320129395, + "learning_rate": 4.72391959000887e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8555803820490837, + "num_tokens": 111244082.0, + "step": 92520 + }, + { + "entropy": 1.925992988049984, + "epoch": 0.2868349040848322, + "grad_norm": 8.029500007629395, + "learning_rate": 4.723664316136334e-06, + "loss": 0.5205, + "mean_token_accuracy": 0.8394767045974731, + "num_tokens": 111255877.0, + "step": 92530 + }, + { + "entropy": 1.972803682088852, + "epoch": 0.2868659032098819, + "grad_norm": 9.798595428466797, + "learning_rate": 4.723409083643231e-06, + "loss": 0.5374, + "mean_token_accuracy": 0.831108058989048, + "num_tokens": 111266990.0, + "step": 92540 + }, + { + "entropy": 1.9300013601779937, + "epoch": 0.2868969023349316, + "grad_norm": 8.934462547302246, + "learning_rate": 4.7231538925183875e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.8401208460330963, + "num_tokens": 111277932.0, + "step": 92550 + }, + { + "entropy": 1.8719871819019318, + "epoch": 0.2869279014599813, + "grad_norm": 8.938404083251953, + "learning_rate": 4.722898742750625e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8495993599295616, + "num_tokens": 111290308.0, + "step": 92560 + }, + { + "entropy": 1.9398197084665298, + "epoch": 0.286958900585031, + "grad_norm": 9.505714416503906, + "learning_rate": 4.7226436343287775e-06, + "loss": 0.5289, + "mean_token_accuracy": 0.8359267324209213, + "num_tokens": 111301797.0, + "step": 92570 + }, + { + "entropy": 1.8432279601693153, + "epoch": 0.2869898997100807, + "grad_norm": 3.82196307182312, + "learning_rate": 4.7223885672416784e-06, + "loss": 0.469, + "mean_token_accuracy": 0.850963968038559, + "num_tokens": 111314214.0, + "step": 92580 + }, + { + "entropy": 1.844231453537941, + "epoch": 0.2870208988351304, + "grad_norm": 8.75102710723877, + "learning_rate": 4.722133541478166e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8547771289944649, + "num_tokens": 111326679.0, + "step": 92590 + }, + { + "entropy": 1.9384073466062546, + "epoch": 0.28705189796018005, + "grad_norm": 7.945824146270752, + "learning_rate": 4.721878557027084e-06, + "loss": 0.524, + "mean_token_accuracy": 0.835817402601242, + "num_tokens": 111338404.0, + "step": 92600 + }, + { + "entropy": 1.9293118342757225, + "epoch": 0.28708289708522977, + "grad_norm": 8.206449508666992, + "learning_rate": 4.7216236138772795e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8453877314925193, + "num_tokens": 111350168.0, + "step": 92610 + }, + { + "entropy": 1.8235325343906879, + "epoch": 0.28711389621027944, + "grad_norm": 3.6279773712158203, + "learning_rate": 4.721368712017605e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8457371458411217, + "num_tokens": 111363248.0, + "step": 92620 + }, + { + "entropy": 1.9850926041603087, + "epoch": 0.28714489533532916, + "grad_norm": 9.338789939880371, + "learning_rate": 4.721113851436916e-06, + "loss": 0.5428, + "mean_token_accuracy": 0.8441228061914444, + "num_tokens": 111374112.0, + "step": 92630 + }, + { + "entropy": 1.8502737820148467, + "epoch": 0.28717589446037883, + "grad_norm": 8.026445388793945, + "learning_rate": 4.72085903212407e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8572929382324219, + "num_tokens": 111386542.0, + "step": 92640 + }, + { + "entropy": 1.8364259883761407, + "epoch": 0.28720689358542856, + "grad_norm": 7.70156717300415, + "learning_rate": 4.7206042540679335e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8466950342059135, + "num_tokens": 111399115.0, + "step": 92650 + }, + { + "entropy": 1.8671451389789582, + "epoch": 0.2872378927104782, + "grad_norm": 9.978074073791504, + "learning_rate": 4.720349517257375e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8522609934210778, + "num_tokens": 111411335.0, + "step": 92660 + }, + { + "entropy": 1.8687150657176972, + "epoch": 0.28726889183552795, + "grad_norm": 4.219553470611572, + "learning_rate": 4.720094821681266e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8571690008044243, + "num_tokens": 111423197.0, + "step": 92670 + }, + { + "entropy": 1.839407466351986, + "epoch": 0.2872998909605776, + "grad_norm": 7.242602348327637, + "learning_rate": 4.719840167328485e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8519873261451721, + "num_tokens": 111436358.0, + "step": 92680 + }, + { + "entropy": 1.8508578151464463, + "epoch": 0.28733089008562734, + "grad_norm": 4.504565238952637, + "learning_rate": 4.719585554187911e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8483609542250633, + "num_tokens": 111448749.0, + "step": 92690 + }, + { + "entropy": 1.8885752364993096, + "epoch": 0.287361889210677, + "grad_norm": 8.104568481445312, + "learning_rate": 4.7193309822484295e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8618633911013603, + "num_tokens": 111460055.0, + "step": 92700 + }, + { + "entropy": 1.8163581773638726, + "epoch": 0.28739288833572674, + "grad_norm": 7.430144786834717, + "learning_rate": 4.719076451498931e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8476424887776375, + "num_tokens": 111473204.0, + "step": 92710 + }, + { + "entropy": 1.9194232299923897, + "epoch": 0.2874238874607764, + "grad_norm": 8.560006141662598, + "learning_rate": 4.718821961928308e-06, + "loss": 0.551, + "mean_token_accuracy": 0.825740373134613, + "num_tokens": 111484969.0, + "step": 92720 + }, + { + "entropy": 1.8660444170236588, + "epoch": 0.28745488658582613, + "grad_norm": 8.867291450500488, + "learning_rate": 4.71856751352546e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8471433222293854, + "num_tokens": 111497653.0, + "step": 92730 + }, + { + "entropy": 1.8844849050045014, + "epoch": 0.2874858857108758, + "grad_norm": 8.284875869750977, + "learning_rate": 4.7183131062792855e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8477038741111755, + "num_tokens": 111510030.0, + "step": 92740 + }, + { + "entropy": 1.945139628648758, + "epoch": 0.2875168848359255, + "grad_norm": 10.998967170715332, + "learning_rate": 4.718058740178694e-06, + "loss": 0.5958, + "mean_token_accuracy": 0.8271935939788818, + "num_tokens": 111521480.0, + "step": 92750 + }, + { + "entropy": 1.9299291223287582, + "epoch": 0.2875478839609752, + "grad_norm": 8.494784355163574, + "learning_rate": 4.717804415212594e-06, + "loss": 0.533, + "mean_token_accuracy": 0.8403868332505227, + "num_tokens": 111532281.0, + "step": 92760 + }, + { + "entropy": 1.8992675706744193, + "epoch": 0.2875788830860249, + "grad_norm": 3.8736727237701416, + "learning_rate": 4.717550131369901e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.843301497399807, + "num_tokens": 111544377.0, + "step": 92770 + }, + { + "entropy": 1.9680778548121451, + "epoch": 0.2876098822110746, + "grad_norm": 7.67625093460083, + "learning_rate": 4.717295888639533e-06, + "loss": 0.5604, + "mean_token_accuracy": 0.8245472773909569, + "num_tokens": 111555728.0, + "step": 92780 + }, + { + "entropy": 1.8295420020818711, + "epoch": 0.2876408813361243, + "grad_norm": 6.913825988769531, + "learning_rate": 4.717041687010413e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8417760416865349, + "num_tokens": 111569369.0, + "step": 92790 + }, + { + "entropy": 1.8156803011894227, + "epoch": 0.287671880461174, + "grad_norm": 8.341958045959473, + "learning_rate": 4.716787526471468e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.846878944337368, + "num_tokens": 111582611.0, + "step": 92800 + }, + { + "entropy": 1.9245857551693917, + "epoch": 0.2877028795862237, + "grad_norm": 8.205559730529785, + "learning_rate": 4.716533407011631e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8327091008424758, + "num_tokens": 111594361.0, + "step": 92810 + }, + { + "entropy": 1.818833366036415, + "epoch": 0.28773387871127337, + "grad_norm": 3.7494218349456787, + "learning_rate": 4.716279328619835e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8558287933468819, + "num_tokens": 111607793.0, + "step": 92820 + }, + { + "entropy": 1.8438904002308845, + "epoch": 0.2877648778363231, + "grad_norm": 8.092823028564453, + "learning_rate": 4.716025291285019e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8389999002218247, + "num_tokens": 111620382.0, + "step": 92830 + }, + { + "entropy": 1.8870263323187828, + "epoch": 0.28779587696137277, + "grad_norm": 2.7017006874084473, + "learning_rate": 4.715771294996129e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8476079568266869, + "num_tokens": 111632680.0, + "step": 92840 + }, + { + "entropy": 1.841055366396904, + "epoch": 0.28782687608642243, + "grad_norm": 7.183150768280029, + "learning_rate": 4.715517339742112e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8533366426825524, + "num_tokens": 111644869.0, + "step": 92850 + }, + { + "entropy": 1.763474926352501, + "epoch": 0.28785787521147216, + "grad_norm": 3.5942344665527344, + "learning_rate": 4.7152634255119215e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8623644202947617, + "num_tokens": 111658019.0, + "step": 92860 + }, + { + "entropy": 1.9290682673454285, + "epoch": 0.2878888743365218, + "grad_norm": 11.06643295288086, + "learning_rate": 4.71500955229451e-06, + "loss": 0.5342, + "mean_token_accuracy": 0.8362339481711387, + "num_tokens": 111669612.0, + "step": 92870 + }, + { + "entropy": 1.9505492269992828, + "epoch": 0.28791987346157155, + "grad_norm": 7.899135112762451, + "learning_rate": 4.7147557200788414e-06, + "loss": 0.5255, + "mean_token_accuracy": 0.8352016389369965, + "num_tokens": 111680635.0, + "step": 92880 + }, + { + "entropy": 1.8193707883358001, + "epoch": 0.2879508725866212, + "grad_norm": 5.0817975997924805, + "learning_rate": 4.714501928853879e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.849019393324852, + "num_tokens": 111693588.0, + "step": 92890 + }, + { + "entropy": 1.8536582559347152, + "epoch": 0.28798187171167094, + "grad_norm": 3.8260529041290283, + "learning_rate": 4.714248178608591e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8521668612957001, + "num_tokens": 111705649.0, + "step": 92900 + }, + { + "entropy": 1.9044562801718712, + "epoch": 0.2880128708367206, + "grad_norm": 4.923748016357422, + "learning_rate": 4.713994469331952e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8511299833655357, + "num_tokens": 111717116.0, + "step": 92910 + }, + { + "entropy": 1.8554534062743187, + "epoch": 0.28804386996177034, + "grad_norm": 3.183706521987915, + "learning_rate": 4.713740801012937e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8666115805506707, + "num_tokens": 111729411.0, + "step": 92920 + }, + { + "entropy": 1.8407445654273034, + "epoch": 0.28807486908682, + "grad_norm": 9.04386043548584, + "learning_rate": 4.713487173640529e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.848953865468502, + "num_tokens": 111742147.0, + "step": 92930 + }, + { + "entropy": 1.8394930571317674, + "epoch": 0.28810586821186973, + "grad_norm": 3.4830808639526367, + "learning_rate": 4.7132335872037114e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8509487017989159, + "num_tokens": 111754776.0, + "step": 92940 + }, + { + "entropy": 1.8994437158107758, + "epoch": 0.2881368673369194, + "grad_norm": 9.240650177001953, + "learning_rate": 4.712980041691476e-06, + "loss": 0.5243, + "mean_token_accuracy": 0.843148159980774, + "num_tokens": 111765930.0, + "step": 92950 + }, + { + "entropy": 1.8866792187094688, + "epoch": 0.2881678664619691, + "grad_norm": 7.509947299957275, + "learning_rate": 4.7127265370928134e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8502578973770142, + "num_tokens": 111777482.0, + "step": 92960 + }, + { + "entropy": 1.8861197009682655, + "epoch": 0.2881988655870188, + "grad_norm": 7.608978271484375, + "learning_rate": 4.712473073396724e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8533033058047295, + "num_tokens": 111789308.0, + "step": 92970 + }, + { + "entropy": 1.8014571815729141, + "epoch": 0.2882298647120685, + "grad_norm": 5.268444061279297, + "learning_rate": 4.7122196505922085e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8505001902580261, + "num_tokens": 111803076.0, + "step": 92980 + }, + { + "entropy": 1.9110738933086395, + "epoch": 0.2882608638371182, + "grad_norm": 7.389989376068115, + "learning_rate": 4.711966268668274e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.8408216923475266, + "num_tokens": 111814444.0, + "step": 92990 + }, + { + "entropy": 1.8892789036035538, + "epoch": 0.2882918629621679, + "grad_norm": 8.411653518676758, + "learning_rate": 4.711712927613929e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8434664875268936, + "num_tokens": 111826241.0, + "step": 93000 + }, + { + "entropy": 1.929236751794815, + "epoch": 0.2883228620872176, + "grad_norm": 9.464624404907227, + "learning_rate": 4.711459627418189e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8482060879468918, + "num_tokens": 111838666.0, + "step": 93010 + }, + { + "entropy": 1.8663489505648614, + "epoch": 0.2883538612122673, + "grad_norm": 7.735058784484863, + "learning_rate": 4.711206368070072e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8607504948973655, + "num_tokens": 111851236.0, + "step": 93020 + }, + { + "entropy": 1.8760851591825485, + "epoch": 0.288384860337317, + "grad_norm": 7.6005754470825195, + "learning_rate": 4.710953149558602e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8493797600269317, + "num_tokens": 111863060.0, + "step": 93030 + }, + { + "entropy": 1.8153146281838417, + "epoch": 0.2884158594623667, + "grad_norm": 8.263833999633789, + "learning_rate": 4.710699971872803e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.851483790576458, + "num_tokens": 111876458.0, + "step": 93040 + }, + { + "entropy": 1.8655553236603737, + "epoch": 0.28844685858741637, + "grad_norm": 5.136788368225098, + "learning_rate": 4.710446835001707e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8393679440021515, + "num_tokens": 111889068.0, + "step": 93050 + }, + { + "entropy": 1.8693054020404816, + "epoch": 0.2884778577124661, + "grad_norm": 6.886144638061523, + "learning_rate": 4.71019373893435e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8552120968699455, + "num_tokens": 111901836.0, + "step": 93060 + }, + { + "entropy": 1.8734416976571082, + "epoch": 0.28850885683751576, + "grad_norm": 10.7568941116333, + "learning_rate": 4.709940683659771e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8552261263132095, + "num_tokens": 111913647.0, + "step": 93070 + }, + { + "entropy": 1.868347629904747, + "epoch": 0.2885398559625655, + "grad_norm": 4.4855852127075195, + "learning_rate": 4.709687669167011e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8452133953571319, + "num_tokens": 111926603.0, + "step": 93080 + }, + { + "entropy": 1.9437709406018258, + "epoch": 0.28857085508761515, + "grad_norm": 9.099634170532227, + "learning_rate": 4.7094346954451196e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.8310608744621277, + "num_tokens": 111937928.0, + "step": 93090 + }, + { + "entropy": 1.8436076626181603, + "epoch": 0.2886018542126648, + "grad_norm": 4.898632526397705, + "learning_rate": 4.709181762483149e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8579155907034874, + "num_tokens": 111950393.0, + "step": 93100 + }, + { + "entropy": 1.8545200631022454, + "epoch": 0.28863285333771455, + "grad_norm": 6.226985931396484, + "learning_rate": 4.708928870270152e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8614425033330917, + "num_tokens": 111962194.0, + "step": 93110 + }, + { + "entropy": 1.9305305495858192, + "epoch": 0.2886638524627642, + "grad_norm": 8.529313087463379, + "learning_rate": 4.70867601879519e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8401212841272354, + "num_tokens": 111973375.0, + "step": 93120 + }, + { + "entropy": 1.8925403907895089, + "epoch": 0.28869485158781394, + "grad_norm": 5.547452449798584, + "learning_rate": 4.7084232080473254e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.8335786327719689, + "num_tokens": 111985430.0, + "step": 93130 + }, + { + "entropy": 1.9314084231853486, + "epoch": 0.2887258507128636, + "grad_norm": 8.339520454406738, + "learning_rate": 4.7081704380156275e-06, + "loss": 0.5239, + "mean_token_accuracy": 0.8429270505905151, + "num_tokens": 111996483.0, + "step": 93140 + }, + { + "entropy": 1.8602096036076545, + "epoch": 0.28875684983791333, + "grad_norm": 7.6653523445129395, + "learning_rate": 4.7079177086891694e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8460811406373978, + "num_tokens": 112008286.0, + "step": 93150 + }, + { + "entropy": 1.8911155819892884, + "epoch": 0.288787848962963, + "grad_norm": 8.817441940307617, + "learning_rate": 4.7076650200570235e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8515992224216461, + "num_tokens": 112019993.0, + "step": 93160 + }, + { + "entropy": 1.9293202444911004, + "epoch": 0.2888188480880127, + "grad_norm": 7.830452919006348, + "learning_rate": 4.707412372108274e-06, + "loss": 0.5404, + "mean_token_accuracy": 0.8322738707065582, + "num_tokens": 112030697.0, + "step": 93170 + }, + { + "entropy": 1.9798065185546876, + "epoch": 0.2888498472130624, + "grad_norm": 8.459895133972168, + "learning_rate": 4.707159764832003e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.8400536015629768, + "num_tokens": 112041497.0, + "step": 93180 + }, + { + "entropy": 1.879485437273979, + "epoch": 0.2888808463381121, + "grad_norm": 6.744016647338867, + "learning_rate": 4.7069071982172985e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8491860061883927, + "num_tokens": 112054220.0, + "step": 93190 + }, + { + "entropy": 1.9080306202173234, + "epoch": 0.2889118454631618, + "grad_norm": 7.247501373291016, + "learning_rate": 4.706654672253255e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8472570210695267, + "num_tokens": 112066105.0, + "step": 93200 + }, + { + "entropy": 1.8720626473426818, + "epoch": 0.2889428445882115, + "grad_norm": 7.93571138381958, + "learning_rate": 4.706402186928967e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8489278420805931, + "num_tokens": 112078575.0, + "step": 93210 + }, + { + "entropy": 1.8863236621022224, + "epoch": 0.2889738437132612, + "grad_norm": 8.153715133666992, + "learning_rate": 4.706149742233537e-06, + "loss": 0.5219, + "mean_token_accuracy": 0.8377825424075127, + "num_tokens": 112090038.0, + "step": 93220 + }, + { + "entropy": 1.8599166497588158, + "epoch": 0.2890048428383109, + "grad_norm": 8.849479675292969, + "learning_rate": 4.705897338156069e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8481295198202133, + "num_tokens": 112101547.0, + "step": 93230 + }, + { + "entropy": 1.7532717436552048, + "epoch": 0.2890358419633606, + "grad_norm": 8.861629486083984, + "learning_rate": 4.705644974685672e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8671473681926727, + "num_tokens": 112114998.0, + "step": 93240 + }, + { + "entropy": 1.8831738129258155, + "epoch": 0.2890668410884103, + "grad_norm": 7.922811031341553, + "learning_rate": 4.705392651811459e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8377567693591118, + "num_tokens": 112126554.0, + "step": 93250 + }, + { + "entropy": 1.8818488538265228, + "epoch": 0.28909784021345997, + "grad_norm": 8.488920211791992, + "learning_rate": 4.705140369522546e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8422826811671257, + "num_tokens": 112138514.0, + "step": 93260 + }, + { + "entropy": 1.9045387908816338, + "epoch": 0.2891288393385097, + "grad_norm": 7.4267425537109375, + "learning_rate": 4.704888127808055e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8422675371170044, + "num_tokens": 112149688.0, + "step": 93270 + }, + { + "entropy": 1.9745795860886575, + "epoch": 0.28915983846355936, + "grad_norm": 7.358890056610107, + "learning_rate": 4.704635926657112e-06, + "loss": 0.5901, + "mean_token_accuracy": 0.8308152720332146, + "num_tokens": 112161678.0, + "step": 93280 + }, + { + "entropy": 1.8890698000788688, + "epoch": 0.2891908375886091, + "grad_norm": 7.141902446746826, + "learning_rate": 4.704383766058845e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8586556881666183, + "num_tokens": 112173914.0, + "step": 93290 + }, + { + "entropy": 1.9566715627908706, + "epoch": 0.28922183671365875, + "grad_norm": 8.638976097106934, + "learning_rate": 4.70413164600239e-06, + "loss": 0.5518, + "mean_token_accuracy": 0.8295210391283036, + "num_tokens": 112185080.0, + "step": 93300 + }, + { + "entropy": 1.8873969689011574, + "epoch": 0.2892528358387085, + "grad_norm": 3.9027388095855713, + "learning_rate": 4.70387956647688e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8503093495965004, + "num_tokens": 112197377.0, + "step": 93310 + }, + { + "entropy": 1.8474943891167641, + "epoch": 0.28928383496375815, + "grad_norm": 4.076306343078613, + "learning_rate": 4.703627527471461e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8605047658085823, + "num_tokens": 112209297.0, + "step": 93320 + }, + { + "entropy": 1.9313654646277427, + "epoch": 0.28931483408880787, + "grad_norm": 11.894732475280762, + "learning_rate": 4.703375528975276e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8404849201440812, + "num_tokens": 112220394.0, + "step": 93330 + }, + { + "entropy": 1.9135547280311584, + "epoch": 0.28934583321385754, + "grad_norm": 7.225008487701416, + "learning_rate": 4.703123570977474e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8464812904596328, + "num_tokens": 112231383.0, + "step": 93340 + }, + { + "entropy": 1.9131829939782619, + "epoch": 0.2893768323389072, + "grad_norm": 10.179394721984863, + "learning_rate": 4.702871653467211e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8461228340864182, + "num_tokens": 112243480.0, + "step": 93350 + }, + { + "entropy": 1.9151297047734261, + "epoch": 0.28940783146395693, + "grad_norm": 7.998013019561768, + "learning_rate": 4.702619776433645e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8424739927053452, + "num_tokens": 112254846.0, + "step": 93360 + }, + { + "entropy": 1.8562763080000877, + "epoch": 0.2894388305890066, + "grad_norm": 3.337523937225342, + "learning_rate": 4.702367939865935e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8496193781495094, + "num_tokens": 112266394.0, + "step": 93370 + }, + { + "entropy": 1.9171319857239724, + "epoch": 0.2894698297140563, + "grad_norm": 8.044168472290039, + "learning_rate": 4.70211614375325e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8436752587556839, + "num_tokens": 112277740.0, + "step": 93380 + }, + { + "entropy": 1.8200967207551002, + "epoch": 0.289500828839106, + "grad_norm": 7.529294013977051, + "learning_rate": 4.701864388084757e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8523975953459739, + "num_tokens": 112290792.0, + "step": 93390 + }, + { + "entropy": 1.8970361724495888, + "epoch": 0.2895318279641557, + "grad_norm": 10.518641471862793, + "learning_rate": 4.701612672849634e-06, + "loss": 0.5585, + "mean_token_accuracy": 0.8354102879762649, + "num_tokens": 112302522.0, + "step": 93400 + }, + { + "entropy": 1.827714842557907, + "epoch": 0.2895628270892054, + "grad_norm": 8.352113723754883, + "learning_rate": 4.701360998037056e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.849641677737236, + "num_tokens": 112315336.0, + "step": 93410 + }, + { + "entropy": 1.9102312728762627, + "epoch": 0.2895938262142551, + "grad_norm": 7.807504653930664, + "learning_rate": 4.701109363636205e-06, + "loss": 0.5247, + "mean_token_accuracy": 0.8295806169509887, + "num_tokens": 112326917.0, + "step": 93420 + }, + { + "entropy": 1.850624245405197, + "epoch": 0.2896248253393048, + "grad_norm": 4.632590293884277, + "learning_rate": 4.70085776963627e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8438147455453873, + "num_tokens": 112339561.0, + "step": 93430 + }, + { + "entropy": 1.909179501235485, + "epoch": 0.2896558244643545, + "grad_norm": 8.325363159179688, + "learning_rate": 4.700606216026438e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8498807772994041, + "num_tokens": 112350456.0, + "step": 93440 + }, + { + "entropy": 1.8641536325216292, + "epoch": 0.2896868235894042, + "grad_norm": 10.412588119506836, + "learning_rate": 4.700354702795905e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8456828027963639, + "num_tokens": 112363019.0, + "step": 93450 + }, + { + "entropy": 1.92139028608799, + "epoch": 0.2897178227144539, + "grad_norm": 10.26781940460205, + "learning_rate": 4.700103229933871e-06, + "loss": 0.5355, + "mean_token_accuracy": 0.8238134995102883, + "num_tokens": 112374578.0, + "step": 93460 + }, + { + "entropy": 1.9097017407417298, + "epoch": 0.28974882183950357, + "grad_norm": 6.589956283569336, + "learning_rate": 4.699851797429535e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8451741203665734, + "num_tokens": 112385717.0, + "step": 93470 + }, + { + "entropy": 1.9064988225698472, + "epoch": 0.2897798209645533, + "grad_norm": 7.078739643096924, + "learning_rate": 4.6996004052721055e-06, + "loss": 0.5205, + "mean_token_accuracy": 0.832171306014061, + "num_tokens": 112397934.0, + "step": 93480 + }, + { + "entropy": 1.8057002767920494, + "epoch": 0.28981082008960296, + "grad_norm": 8.572505950927734, + "learning_rate": 4.699349053450793e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.858871404826641, + "num_tokens": 112411952.0, + "step": 93490 + }, + { + "entropy": 1.9524729505181313, + "epoch": 0.2898418192146527, + "grad_norm": 8.818181037902832, + "learning_rate": 4.699097741954811e-06, + "loss": 0.5257, + "mean_token_accuracy": 0.8438670292496682, + "num_tokens": 112423702.0, + "step": 93500 + }, + { + "entropy": 1.9189194455742835, + "epoch": 0.28987281833970235, + "grad_norm": 3.635896682739258, + "learning_rate": 4.698846470773379e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8519726365804672, + "num_tokens": 112435673.0, + "step": 93510 + }, + { + "entropy": 1.9499738737940788, + "epoch": 0.2899038174647521, + "grad_norm": 8.249286651611328, + "learning_rate": 4.698595239895718e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.844916258752346, + "num_tokens": 112446773.0, + "step": 93520 + }, + { + "entropy": 1.948981523513794, + "epoch": 0.28993481658980175, + "grad_norm": 8.473828315734863, + "learning_rate": 4.698344049311058e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.8404761984944343, + "num_tokens": 112457606.0, + "step": 93530 + }, + { + "entropy": 1.8598048985004425, + "epoch": 0.28996581571485147, + "grad_norm": 7.43870735168457, + "learning_rate": 4.698092899008628e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8440364077687263, + "num_tokens": 112470335.0, + "step": 93540 + }, + { + "entropy": 1.871220625936985, + "epoch": 0.28999681483990114, + "grad_norm": 7.426569938659668, + "learning_rate": 4.69784178897766e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8520672276616097, + "num_tokens": 112482248.0, + "step": 93550 + }, + { + "entropy": 1.86535192579031, + "epoch": 0.29002781396495086, + "grad_norm": 4.510183811187744, + "learning_rate": 4.697590719207397e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8448464289307595, + "num_tokens": 112494530.0, + "step": 93560 + }, + { + "entropy": 1.8161170035600662, + "epoch": 0.29005881309000053, + "grad_norm": 8.694151878356934, + "learning_rate": 4.69733968968708e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8455665096640587, + "num_tokens": 112508367.0, + "step": 93570 + }, + { + "entropy": 1.9283171832561492, + "epoch": 0.29008981221505026, + "grad_norm": 8.558956146240234, + "learning_rate": 4.697088700405954e-06, + "loss": 0.567, + "mean_token_accuracy": 0.8320746123790741, + "num_tokens": 112519148.0, + "step": 93580 + }, + { + "entropy": 1.9475929975509643, + "epoch": 0.2901208113400999, + "grad_norm": 7.119383811950684, + "learning_rate": 4.696837751353273e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.8410884723067283, + "num_tokens": 112529839.0, + "step": 93590 + }, + { + "entropy": 1.8843889951705932, + "epoch": 0.2901518104651496, + "grad_norm": 6.53165340423584, + "learning_rate": 4.69658684251829e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8552757307887078, + "num_tokens": 112541392.0, + "step": 93600 + }, + { + "entropy": 1.9115852415561676, + "epoch": 0.2901828095901993, + "grad_norm": 8.958046913146973, + "learning_rate": 4.696335973890263e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8533832281827927, + "num_tokens": 112551686.0, + "step": 93610 + }, + { + "entropy": 1.9201652556657791, + "epoch": 0.290213808715249, + "grad_norm": 9.570830345153809, + "learning_rate": 4.696085145458457e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.8437472805380821, + "num_tokens": 112562743.0, + "step": 93620 + }, + { + "entropy": 1.909856851398945, + "epoch": 0.2902448078402987, + "grad_norm": 3.7993521690368652, + "learning_rate": 4.695834357212138e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.8377780944108963, + "num_tokens": 112574467.0, + "step": 93630 + }, + { + "entropy": 1.8987789198756218, + "epoch": 0.2902758069653484, + "grad_norm": 10.340612411499023, + "learning_rate": 4.695583609140576e-06, + "loss": 0.5368, + "mean_token_accuracy": 0.8381225094199181, + "num_tokens": 112587453.0, + "step": 93640 + }, + { + "entropy": 1.767358809709549, + "epoch": 0.2903068060903981, + "grad_norm": 4.561580181121826, + "learning_rate": 4.695332901233046e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.845879316329956, + "num_tokens": 112601795.0, + "step": 93650 + }, + { + "entropy": 1.8162215173244476, + "epoch": 0.2903378052154478, + "grad_norm": 6.31105899810791, + "learning_rate": 4.695082233478828e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8538815006613731, + "num_tokens": 112614333.0, + "step": 93660 + }, + { + "entropy": 1.865521389245987, + "epoch": 0.2903688043404975, + "grad_norm": 8.672292709350586, + "learning_rate": 4.694831605867206e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8480850234627724, + "num_tokens": 112627252.0, + "step": 93670 + }, + { + "entropy": 1.7771051108837128, + "epoch": 0.29039980346554717, + "grad_norm": 9.864136695861816, + "learning_rate": 4.694581018387463e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8620004311203957, + "num_tokens": 112640242.0, + "step": 93680 + }, + { + "entropy": 1.8782151356339454, + "epoch": 0.2904308025905969, + "grad_norm": 6.314033031463623, + "learning_rate": 4.694330471028893e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8508936673402786, + "num_tokens": 112652406.0, + "step": 93690 + }, + { + "entropy": 1.9099466919898986, + "epoch": 0.29046180171564656, + "grad_norm": 4.040074348449707, + "learning_rate": 4.694079963780791e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8430052191019058, + "num_tokens": 112664170.0, + "step": 93700 + }, + { + "entropy": 1.863590781390667, + "epoch": 0.2904928008406963, + "grad_norm": 8.13239860534668, + "learning_rate": 4.693829496632454e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8332739755511284, + "num_tokens": 112676541.0, + "step": 93710 + }, + { + "entropy": 1.8954721093177795, + "epoch": 0.29052379996574595, + "grad_norm": 9.620070457458496, + "learning_rate": 4.693579069573186e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8386843577027321, + "num_tokens": 112689006.0, + "step": 93720 + }, + { + "entropy": 1.8437098309397697, + "epoch": 0.2905547990907957, + "grad_norm": 8.268927574157715, + "learning_rate": 4.693328682592294e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.860004435479641, + "num_tokens": 112701042.0, + "step": 93730 + }, + { + "entropy": 1.8573900401592254, + "epoch": 0.29058579821584535, + "grad_norm": 3.780688524246216, + "learning_rate": 4.693078335679089e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8458506971597671, + "num_tokens": 112713236.0, + "step": 93740 + }, + { + "entropy": 1.81138436794281, + "epoch": 0.29061679734089507, + "grad_norm": 9.589875221252441, + "learning_rate": 4.692828028822885e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8497409135103225, + "num_tokens": 112725893.0, + "step": 93750 + }, + { + "entropy": 1.835260456800461, + "epoch": 0.29064779646594474, + "grad_norm": 9.189959526062012, + "learning_rate": 4.692577762013002e-06, + "loss": 0.5272, + "mean_token_accuracy": 0.8347838670015335, + "num_tokens": 112738315.0, + "step": 93760 + }, + { + "entropy": 1.8794139876961709, + "epoch": 0.29067879559099447, + "grad_norm": 3.8947536945343018, + "learning_rate": 4.692327535238763e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.851905120909214, + "num_tokens": 112750430.0, + "step": 93770 + }, + { + "entropy": 1.7996224954724311, + "epoch": 0.29070979471604413, + "grad_norm": 9.7765474319458, + "learning_rate": 4.6920773484894935e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8449171632528305, + "num_tokens": 112763272.0, + "step": 93780 + }, + { + "entropy": 1.8897378101944924, + "epoch": 0.29074079384109386, + "grad_norm": 9.47298526763916, + "learning_rate": 4.6918272017545255e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8373202964663505, + "num_tokens": 112775724.0, + "step": 93790 + }, + { + "entropy": 1.9278543174266816, + "epoch": 0.2907717929661435, + "grad_norm": 7.359293460845947, + "learning_rate": 4.691577095023192e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.8393516659736633, + "num_tokens": 112787066.0, + "step": 93800 + }, + { + "entropy": 1.9417521178722381, + "epoch": 0.29080279209119325, + "grad_norm": 8.523365020751953, + "learning_rate": 4.691327028284835e-06, + "loss": 0.5202, + "mean_token_accuracy": 0.835085253417492, + "num_tokens": 112798938.0, + "step": 93810 + }, + { + "entropy": 1.830255390703678, + "epoch": 0.2908337912162429, + "grad_norm": 8.126928329467773, + "learning_rate": 4.691077001528794e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8412661746144294, + "num_tokens": 112812321.0, + "step": 93820 + }, + { + "entropy": 1.8606264024972916, + "epoch": 0.2908647903412926, + "grad_norm": 10.298184394836426, + "learning_rate": 4.690827014744417e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.840786500275135, + "num_tokens": 112824184.0, + "step": 93830 + }, + { + "entropy": 1.9569880202412606, + "epoch": 0.2908957894663423, + "grad_norm": 10.95293140411377, + "learning_rate": 4.690577067921055e-06, + "loss": 0.5302, + "mean_token_accuracy": 0.8427381262183189, + "num_tokens": 112834958.0, + "step": 93840 + }, + { + "entropy": 1.879898366332054, + "epoch": 0.290926788591392, + "grad_norm": 8.868535041809082, + "learning_rate": 4.690327161048064e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8543467596173286, + "num_tokens": 112847253.0, + "step": 93850 + }, + { + "entropy": 1.884432803094387, + "epoch": 0.2909577877164417, + "grad_norm": 3.7411601543426514, + "learning_rate": 4.6900772941147994e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8459440425038338, + "num_tokens": 112859034.0, + "step": 93860 + }, + { + "entropy": 1.8857886865735054, + "epoch": 0.2909887868414914, + "grad_norm": 4.058228492736816, + "learning_rate": 4.689827467110626e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8447174549102783, + "num_tokens": 112870746.0, + "step": 93870 + }, + { + "entropy": 1.8584589600563048, + "epoch": 0.2910197859665411, + "grad_norm": 3.87026309967041, + "learning_rate": 4.689577680024911e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.846294179558754, + "num_tokens": 112883210.0, + "step": 93880 + }, + { + "entropy": 1.867951761186123, + "epoch": 0.29105078509159077, + "grad_norm": 8.24725341796875, + "learning_rate": 4.689327932847024e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8442766860127449, + "num_tokens": 112895841.0, + "step": 93890 + }, + { + "entropy": 1.8113203912973403, + "epoch": 0.2910817842166405, + "grad_norm": 3.6465940475463867, + "learning_rate": 4.689078225566338e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8473143368959427, + "num_tokens": 112909141.0, + "step": 93900 + }, + { + "entropy": 1.929970356822014, + "epoch": 0.29111278334169016, + "grad_norm": 8.883091926574707, + "learning_rate": 4.688828558172234e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8561759814620018, + "num_tokens": 112919787.0, + "step": 93910 + }, + { + "entropy": 1.9370476379990578, + "epoch": 0.2911437824667399, + "grad_norm": 7.68314790725708, + "learning_rate": 4.688578930654094e-06, + "loss": 0.5672, + "mean_token_accuracy": 0.8174238324165344, + "num_tokens": 112931094.0, + "step": 93920 + }, + { + "entropy": 1.8187593132257462, + "epoch": 0.29117478159178956, + "grad_norm": 9.00848388671875, + "learning_rate": 4.688329343001302e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8468037515878677, + "num_tokens": 112944183.0, + "step": 93930 + }, + { + "entropy": 1.7754915788769723, + "epoch": 0.2912057807168393, + "grad_norm": 8.143677711486816, + "learning_rate": 4.688079795203251e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8577578738331795, + "num_tokens": 112957991.0, + "step": 93940 + }, + { + "entropy": 1.8825334414839745, + "epoch": 0.29123677984188895, + "grad_norm": 4.188181400299072, + "learning_rate": 4.687830287249335e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8422155871987342, + "num_tokens": 112970170.0, + "step": 93950 + }, + { + "entropy": 1.8940382510423661, + "epoch": 0.2912677789669387, + "grad_norm": 9.72235107421875, + "learning_rate": 4.68758081912895e-06, + "loss": 0.5222, + "mean_token_accuracy": 0.8427134841680527, + "num_tokens": 112981192.0, + "step": 93960 + }, + { + "entropy": 1.8888707652688026, + "epoch": 0.29129877809198834, + "grad_norm": 8.664020538330078, + "learning_rate": 4.6873313908315015e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8318000584840775, + "num_tokens": 112992870.0, + "step": 93970 + }, + { + "entropy": 1.850681571662426, + "epoch": 0.29132977721703807, + "grad_norm": 4.301815986633301, + "learning_rate": 4.687082002346394e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8496690198779107, + "num_tokens": 113005123.0, + "step": 93980 + }, + { + "entropy": 1.7844605684280395, + "epoch": 0.29136077634208774, + "grad_norm": 8.043193817138672, + "learning_rate": 4.686832653663037e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8557795941829681, + "num_tokens": 113017593.0, + "step": 93990 + }, + { + "entropy": 1.7445304661989212, + "epoch": 0.29139177546713746, + "grad_norm": 3.7523953914642334, + "learning_rate": 4.686583344770846e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8624411836266518, + "num_tokens": 113031950.0, + "step": 94000 + }, + { + "entropy": 1.7966215670108796, + "epoch": 0.29142277459218713, + "grad_norm": 7.984848499298096, + "learning_rate": 4.686334075659238e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8521535471081734, + "num_tokens": 113045309.0, + "step": 94010 + }, + { + "entropy": 1.8039371758699416, + "epoch": 0.29145377371723685, + "grad_norm": 4.514188766479492, + "learning_rate": 4.686084846317634e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8542758151888847, + "num_tokens": 113058068.0, + "step": 94020 + }, + { + "entropy": 1.8654557079076768, + "epoch": 0.2914847728422865, + "grad_norm": 8.273473739624023, + "learning_rate": 4.685835656735462e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8452908799052239, + "num_tokens": 113069283.0, + "step": 94030 + }, + { + "entropy": 1.8588725805282593, + "epoch": 0.29151577196733625, + "grad_norm": 4.237758636474609, + "learning_rate": 4.685586506902148e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8419048935174942, + "num_tokens": 113081319.0, + "step": 94040 + }, + { + "entropy": 1.8301612615585328, + "epoch": 0.2915467710923859, + "grad_norm": 8.443975448608398, + "learning_rate": 4.685337396807132e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8360993027687073, + "num_tokens": 113093388.0, + "step": 94050 + }, + { + "entropy": 1.8223912194371223, + "epoch": 0.29157777021743564, + "grad_norm": 8.578900337219238, + "learning_rate": 4.685088326439846e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.856349365413189, + "num_tokens": 113106077.0, + "step": 94060 + }, + { + "entropy": 1.8360855415463448, + "epoch": 0.2916087693424853, + "grad_norm": 8.660734176635742, + "learning_rate": 4.684839295789734e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.848674775660038, + "num_tokens": 113118493.0, + "step": 94070 + }, + { + "entropy": 1.8793357729911804, + "epoch": 0.291639768467535, + "grad_norm": 6.9419403076171875, + "learning_rate": 4.684590304846241e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8568961501121521, + "num_tokens": 113129937.0, + "step": 94080 + }, + { + "entropy": 1.909253677725792, + "epoch": 0.2916707675925847, + "grad_norm": 8.138631820678711, + "learning_rate": 4.684341353598818e-06, + "loss": 0.5175, + "mean_token_accuracy": 0.8358252078294754, + "num_tokens": 113140848.0, + "step": 94090 + }, + { + "entropy": 1.8075340047478676, + "epoch": 0.29170176671763437, + "grad_norm": 8.938015937805176, + "learning_rate": 4.684092442036915e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8573233857750893, + "num_tokens": 113153007.0, + "step": 94100 + }, + { + "entropy": 1.8430316671729088, + "epoch": 0.2917327658426841, + "grad_norm": 7.955845355987549, + "learning_rate": 4.683843570149992e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8404422610998153, + "num_tokens": 113165004.0, + "step": 94110 + }, + { + "entropy": 1.8731530100107192, + "epoch": 0.29176376496773376, + "grad_norm": 9.946492195129395, + "learning_rate": 4.683594737927509e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8477850437164307, + "num_tokens": 113176838.0, + "step": 94120 + }, + { + "entropy": 1.8446904599666596, + "epoch": 0.2917947640927835, + "grad_norm": 4.911070823669434, + "learning_rate": 4.683345945358933e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8407913163304329, + "num_tokens": 113189520.0, + "step": 94130 + }, + { + "entropy": 1.8978979125618936, + "epoch": 0.29182576321783316, + "grad_norm": 8.811211585998535, + "learning_rate": 4.683097192433731e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8498449578881264, + "num_tokens": 113201050.0, + "step": 94140 + }, + { + "entropy": 1.868384449183941, + "epoch": 0.2918567623428829, + "grad_norm": 8.619186401367188, + "learning_rate": 4.682848479141376e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8416146129369736, + "num_tokens": 113213345.0, + "step": 94150 + }, + { + "entropy": 1.8393750131130218, + "epoch": 0.29188776146793255, + "grad_norm": 2.398193836212158, + "learning_rate": 4.682599805471346e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8392178237438201, + "num_tokens": 113226489.0, + "step": 94160 + }, + { + "entropy": 1.9002157673239708, + "epoch": 0.2919187605929823, + "grad_norm": 9.483999252319336, + "learning_rate": 4.68235117141312e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8522425264120101, + "num_tokens": 113238608.0, + "step": 94170 + }, + { + "entropy": 1.8459337919950485, + "epoch": 0.29194975971803194, + "grad_norm": 7.897007942199707, + "learning_rate": 4.6821025769561855e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8493504419922828, + "num_tokens": 113250584.0, + "step": 94180 + }, + { + "entropy": 1.9223339349031447, + "epoch": 0.29198075884308167, + "grad_norm": 8.27923583984375, + "learning_rate": 4.681854022090028e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8487838566303253, + "num_tokens": 113261767.0, + "step": 94190 + }, + { + "entropy": 1.9111609309911728, + "epoch": 0.29201175796813134, + "grad_norm": 8.224406242370605, + "learning_rate": 4.681605506804143e-06, + "loss": 0.498, + "mean_token_accuracy": 0.839901152253151, + "num_tokens": 113273424.0, + "step": 94200 + }, + { + "entropy": 1.8766125679016112, + "epoch": 0.29204275709318106, + "grad_norm": 8.163731575012207, + "learning_rate": 4.681357031088025e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8469099998474121, + "num_tokens": 113285663.0, + "step": 94210 + }, + { + "entropy": 1.815425206720829, + "epoch": 0.29207375621823073, + "grad_norm": 8.935981750488281, + "learning_rate": 4.681108594931173e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8499750107526779, + "num_tokens": 113298461.0, + "step": 94220 + }, + { + "entropy": 1.6602164059877396, + "epoch": 0.29210475534328045, + "grad_norm": 8.308954238891602, + "learning_rate": 4.680860198323094e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8591342657804489, + "num_tokens": 113314031.0, + "step": 94230 + }, + { + "entropy": 1.838196623325348, + "epoch": 0.2921357544683301, + "grad_norm": 3.9183874130249023, + "learning_rate": 4.6806118412532965e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8539181277155876, + "num_tokens": 113326110.0, + "step": 94240 + }, + { + "entropy": 1.8387087047100068, + "epoch": 0.29216675359337985, + "grad_norm": 7.87742280960083, + "learning_rate": 4.680363523711289e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8537355214357376, + "num_tokens": 113338311.0, + "step": 94250 + }, + { + "entropy": 1.8915387392044067, + "epoch": 0.2921977527184295, + "grad_norm": 8.182762145996094, + "learning_rate": 4.680115245686591e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8396169424057007, + "num_tokens": 113350148.0, + "step": 94260 + }, + { + "entropy": 1.8612817063927651, + "epoch": 0.29222875184347924, + "grad_norm": 6.411753177642822, + "learning_rate": 4.679867007168719e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8560265928506852, + "num_tokens": 113362183.0, + "step": 94270 + }, + { + "entropy": 1.8689087167382241, + "epoch": 0.2922597509685289, + "grad_norm": 8.272517204284668, + "learning_rate": 4.6796188081472e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8516158699989319, + "num_tokens": 113374575.0, + "step": 94280 + }, + { + "entropy": 1.9062586069107055, + "epoch": 0.29229075009357863, + "grad_norm": 10.613982200622559, + "learning_rate": 4.679370648611559e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.8422037899494171, + "num_tokens": 113386126.0, + "step": 94290 + }, + { + "entropy": 1.9051846981048584, + "epoch": 0.2923217492186283, + "grad_norm": 8.695990562438965, + "learning_rate": 4.679122528551329e-06, + "loss": 0.5411, + "mean_token_accuracy": 0.841190955042839, + "num_tokens": 113397175.0, + "step": 94300 + }, + { + "entropy": 1.7903152495622634, + "epoch": 0.292352748343678, + "grad_norm": 4.608640670776367, + "learning_rate": 4.678874447956044e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8582356154918671, + "num_tokens": 113410086.0, + "step": 94310 + }, + { + "entropy": 1.8944632783532143, + "epoch": 0.2923837474687277, + "grad_norm": 8.895879745483398, + "learning_rate": 4.6786264068152445e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8449664607644081, + "num_tokens": 113422204.0, + "step": 94320 + }, + { + "entropy": 1.9663603961467744, + "epoch": 0.29241474659377736, + "grad_norm": 10.93663215637207, + "learning_rate": 4.678378405118473e-06, + "loss": 0.5461, + "mean_token_accuracy": 0.8348477125167847, + "num_tokens": 113432962.0, + "step": 94330 + }, + { + "entropy": 1.8723286166787148, + "epoch": 0.2924457457188271, + "grad_norm": 8.900105476379395, + "learning_rate": 4.6781304428552765e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8530125021934509, + "num_tokens": 113444695.0, + "step": 94340 + }, + { + "entropy": 1.7696161583065986, + "epoch": 0.29247674484387676, + "grad_norm": 7.942965030670166, + "learning_rate": 4.677882520015207e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.857183714210987, + "num_tokens": 113458974.0, + "step": 94350 + }, + { + "entropy": 1.8979527205228806, + "epoch": 0.2925077439689265, + "grad_norm": 10.76806354522705, + "learning_rate": 4.677634636587817e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8429912239313125, + "num_tokens": 113470746.0, + "step": 94360 + }, + { + "entropy": 1.8394412383437158, + "epoch": 0.29253874309397615, + "grad_norm": 7.812624931335449, + "learning_rate": 4.677386792562667e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8541227161884308, + "num_tokens": 113482575.0, + "step": 94370 + }, + { + "entropy": 1.8561811774969101, + "epoch": 0.2925697422190259, + "grad_norm": 7.494784832000732, + "learning_rate": 4.6771389879293185e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8553539365530014, + "num_tokens": 113493656.0, + "step": 94380 + }, + { + "entropy": 1.877823382616043, + "epoch": 0.29260074134407554, + "grad_norm": 7.919898986816406, + "learning_rate": 4.676891222677338e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8565265223383903, + "num_tokens": 113504994.0, + "step": 94390 + }, + { + "entropy": 1.8452934324741364, + "epoch": 0.29263174046912527, + "grad_norm": 9.235838890075684, + "learning_rate": 4.6766434967962945e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8403481170535088, + "num_tokens": 113517235.0, + "step": 94400 + }, + { + "entropy": 1.8997774094343185, + "epoch": 0.29266273959417494, + "grad_norm": 8.089444160461426, + "learning_rate": 4.6763958102757665e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8421216562390328, + "num_tokens": 113528646.0, + "step": 94410 + }, + { + "entropy": 1.6933333344757557, + "epoch": 0.29269373871922466, + "grad_norm": 9.213237762451172, + "learning_rate": 4.676148163105327e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8442223116755485, + "num_tokens": 113543849.0, + "step": 94420 + }, + { + "entropy": 1.74602971971035, + "epoch": 0.29272473784427433, + "grad_norm": 8.171969413757324, + "learning_rate": 4.67590055527456e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8524969726800918, + "num_tokens": 113556535.0, + "step": 94430 + }, + { + "entropy": 1.899616888165474, + "epoch": 0.29275573696932405, + "grad_norm": 9.324505805969238, + "learning_rate": 4.675652986773051e-06, + "loss": 0.5368, + "mean_token_accuracy": 0.8389844998717308, + "num_tokens": 113567827.0, + "step": 94440 + }, + { + "entropy": 1.85470499843359, + "epoch": 0.2927867360943737, + "grad_norm": 7.886104106903076, + "learning_rate": 4.675405457590389e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8475810378789902, + "num_tokens": 113578947.0, + "step": 94450 + }, + { + "entropy": 1.9117970570921898, + "epoch": 0.29281773521942345, + "grad_norm": 8.605646133422852, + "learning_rate": 4.675157967716168e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8469821259379386, + "num_tokens": 113590112.0, + "step": 94460 + }, + { + "entropy": 1.850995273888111, + "epoch": 0.2928487343444731, + "grad_norm": 9.236472129821777, + "learning_rate": 4.6749105171399864e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8592243269085884, + "num_tokens": 113602127.0, + "step": 94470 + }, + { + "entropy": 1.894811224937439, + "epoch": 0.29287973346952284, + "grad_norm": 7.549960136413574, + "learning_rate": 4.674663105851442e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.8472552940249443, + "num_tokens": 113613991.0, + "step": 94480 + }, + { + "entropy": 1.8211253330111503, + "epoch": 0.2929107325945725, + "grad_norm": 10.19444751739502, + "learning_rate": 4.674415733840143e-06, + "loss": 0.5209, + "mean_token_accuracy": 0.8452879965305329, + "num_tokens": 113626732.0, + "step": 94490 + }, + { + "entropy": 1.870133863389492, + "epoch": 0.29294173171962223, + "grad_norm": 5.497763633728027, + "learning_rate": 4.674168401095697e-06, + "loss": 0.5169, + "mean_token_accuracy": 0.8333605661988258, + "num_tokens": 113638260.0, + "step": 94500 + }, + { + "entropy": 1.881983858346939, + "epoch": 0.2929727308446719, + "grad_norm": 7.503384590148926, + "learning_rate": 4.673921107607716e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8480863809585572, + "num_tokens": 113649729.0, + "step": 94510 + }, + { + "entropy": 1.7449950769543647, + "epoch": 0.2930037299697216, + "grad_norm": 6.960958003997803, + "learning_rate": 4.673673853365818e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8576125666499138, + "num_tokens": 113662822.0, + "step": 94520 + }, + { + "entropy": 1.6805919706821442, + "epoch": 0.2930347290947713, + "grad_norm": 7.576868534088135, + "learning_rate": 4.673426638359622e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8681085333228111, + "num_tokens": 113677064.0, + "step": 94530 + }, + { + "entropy": 1.8898225530982018, + "epoch": 0.293065728219821, + "grad_norm": 8.339954376220703, + "learning_rate": 4.673179462578754e-06, + "loss": 0.5254, + "mean_token_accuracy": 0.834310744702816, + "num_tokens": 113688785.0, + "step": 94540 + }, + { + "entropy": 1.822121039032936, + "epoch": 0.2930967273448707, + "grad_norm": 2.9981343746185303, + "learning_rate": 4.672932326012839e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8623607367277145, + "num_tokens": 113700761.0, + "step": 94550 + }, + { + "entropy": 1.8925957545638084, + "epoch": 0.2931277264699204, + "grad_norm": 8.26031494140625, + "learning_rate": 4.672685228651511e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8499511271715164, + "num_tokens": 113712498.0, + "step": 94560 + }, + { + "entropy": 1.92208162099123, + "epoch": 0.2931587255949701, + "grad_norm": 9.060463905334473, + "learning_rate": 4.672438170484405e-06, + "loss": 0.5613, + "mean_token_accuracy": 0.8314491465687752, + "num_tokens": 113724082.0, + "step": 94570 + }, + { + "entropy": 1.8770214468240738, + "epoch": 0.29318972472001975, + "grad_norm": 8.743376731872559, + "learning_rate": 4.672191151501161e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8547257348895073, + "num_tokens": 113736315.0, + "step": 94580 + }, + { + "entropy": 1.837056641280651, + "epoch": 0.2932207238450695, + "grad_norm": 4.969115257263184, + "learning_rate": 4.671944171691422e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8457030534744263, + "num_tokens": 113748972.0, + "step": 94590 + }, + { + "entropy": 1.9361342743039132, + "epoch": 0.29325172297011914, + "grad_norm": 8.590668678283691, + "learning_rate": 4.671697231044837e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8455305263400078, + "num_tokens": 113760795.0, + "step": 94600 + }, + { + "entropy": 1.775714547932148, + "epoch": 0.29328272209516887, + "grad_norm": 3.653806209564209, + "learning_rate": 4.671450329551054e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8520709335803985, + "num_tokens": 113774218.0, + "step": 94610 + }, + { + "entropy": 1.9447847425937652, + "epoch": 0.29331372122021854, + "grad_norm": 8.897464752197266, + "learning_rate": 4.671203467199731e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.8491786658763886, + "num_tokens": 113785173.0, + "step": 94620 + }, + { + "entropy": 1.8407832726836204, + "epoch": 0.29334472034526826, + "grad_norm": 6.721490383148193, + "learning_rate": 4.670956643980524e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8475279733538628, + "num_tokens": 113797753.0, + "step": 94630 + }, + { + "entropy": 1.7925242587924004, + "epoch": 0.29337571947031793, + "grad_norm": 8.641847610473633, + "learning_rate": 4.670709859883096e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8569319665431976, + "num_tokens": 113809733.0, + "step": 94640 + }, + { + "entropy": 1.9358911573886872, + "epoch": 0.29340671859536765, + "grad_norm": 8.337167739868164, + "learning_rate": 4.670463114897114e-06, + "loss": 0.5519, + "mean_token_accuracy": 0.8383975982666015, + "num_tokens": 113820542.0, + "step": 94650 + }, + { + "entropy": 1.8865822792053222, + "epoch": 0.2934377177204173, + "grad_norm": 8.994181632995605, + "learning_rate": 4.670216409012248e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8425557687878609, + "num_tokens": 113832371.0, + "step": 94660 + }, + { + "entropy": 1.7770501509308816, + "epoch": 0.29346871684546705, + "grad_norm": 6.771418571472168, + "learning_rate": 4.669969742218173e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8530333042144775, + "num_tokens": 113845284.0, + "step": 94670 + }, + { + "entropy": 1.7958509385585786, + "epoch": 0.2934997159705167, + "grad_norm": 7.999570369720459, + "learning_rate": 4.669723114504565e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8648405924439431, + "num_tokens": 113858418.0, + "step": 94680 + }, + { + "entropy": 1.9169925883412362, + "epoch": 0.29353071509556644, + "grad_norm": 8.945980072021484, + "learning_rate": 4.669476525861107e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.8366316080093383, + "num_tokens": 113869941.0, + "step": 94690 + }, + { + "entropy": 1.804078498482704, + "epoch": 0.2935617142206161, + "grad_norm": 8.323823928833008, + "learning_rate": 4.669229976277483e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8675397202372551, + "num_tokens": 113882565.0, + "step": 94700 + }, + { + "entropy": 1.9035957649350166, + "epoch": 0.29359271334566583, + "grad_norm": 9.34669017791748, + "learning_rate": 4.668983465743385e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.8365237683057785, + "num_tokens": 113893898.0, + "step": 94710 + }, + { + "entropy": 1.9729483425617218, + "epoch": 0.2936237124707155, + "grad_norm": 6.514955043792725, + "learning_rate": 4.668736994248504e-06, + "loss": 0.5382, + "mean_token_accuracy": 0.8372229024767875, + "num_tokens": 113904833.0, + "step": 94720 + }, + { + "entropy": 1.9153319388628005, + "epoch": 0.2936547115957652, + "grad_norm": 11.098676681518555, + "learning_rate": 4.668490561782535e-06, + "loss": 0.5469, + "mean_token_accuracy": 0.8335372865200043, + "num_tokens": 113915799.0, + "step": 94730 + }, + { + "entropy": 1.9642977565526962, + "epoch": 0.2936857107208149, + "grad_norm": 8.632896423339844, + "learning_rate": 4.668244168335182e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.8496646553277969, + "num_tokens": 113926831.0, + "step": 94740 + }, + { + "entropy": 1.8451596662402152, + "epoch": 0.2937167098458646, + "grad_norm": 8.402206420898438, + "learning_rate": 4.667997813896149e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8657459333539009, + "num_tokens": 113938570.0, + "step": 94750 + }, + { + "entropy": 1.8689325533807277, + "epoch": 0.2937477089709143, + "grad_norm": 9.82726001739502, + "learning_rate": 4.667751498455142e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8491661697626114, + "num_tokens": 113950581.0, + "step": 94760 + }, + { + "entropy": 1.81126269698143, + "epoch": 0.293778708095964, + "grad_norm": 2.5253586769104004, + "learning_rate": 4.667505222001875e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8603689581155777, + "num_tokens": 113963539.0, + "step": 94770 + }, + { + "entropy": 1.8925784215331078, + "epoch": 0.2938097072210137, + "grad_norm": 7.797617435455322, + "learning_rate": 4.667258984526063e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8523267209529877, + "num_tokens": 113975014.0, + "step": 94780 + }, + { + "entropy": 1.8433985903859138, + "epoch": 0.2938407063460634, + "grad_norm": 10.589468002319336, + "learning_rate": 4.667012786017426e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8405422657728195, + "num_tokens": 113987158.0, + "step": 94790 + }, + { + "entropy": 1.8784984156489373, + "epoch": 0.2938717054711131, + "grad_norm": 3.9485971927642822, + "learning_rate": 4.666766626465689e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8542310744524002, + "num_tokens": 113998286.0, + "step": 94800 + }, + { + "entropy": 1.8718909628689289, + "epoch": 0.2939027045961628, + "grad_norm": 8.77500057220459, + "learning_rate": 4.666520505860577e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8534493952989578, + "num_tokens": 114010433.0, + "step": 94810 + }, + { + "entropy": 1.9200672134757042, + "epoch": 0.29393370372121247, + "grad_norm": 8.568684577941895, + "learning_rate": 4.666274424191821e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8516236767172813, + "num_tokens": 114021482.0, + "step": 94820 + }, + { + "entropy": 1.8536953687667848, + "epoch": 0.29396470284626214, + "grad_norm": 7.200621128082275, + "learning_rate": 4.666028381449159e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8599602654576302, + "num_tokens": 114033985.0, + "step": 94830 + }, + { + "entropy": 1.894244983792305, + "epoch": 0.29399570197131186, + "grad_norm": 10.332306861877441, + "learning_rate": 4.6657823776223255e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8414657115936279, + "num_tokens": 114045285.0, + "step": 94840 + }, + { + "entropy": 1.863986374437809, + "epoch": 0.29402670109636153, + "grad_norm": 4.247035980224609, + "learning_rate": 4.6655364127010655e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8416002795100213, + "num_tokens": 114057674.0, + "step": 94850 + }, + { + "entropy": 1.9077302902936935, + "epoch": 0.29405770022141126, + "grad_norm": 7.60083532333374, + "learning_rate": 4.665290486675124e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8370777696371079, + "num_tokens": 114069600.0, + "step": 94860 + }, + { + "entropy": 1.8785944551229476, + "epoch": 0.2940886993464609, + "grad_norm": 7.80269718170166, + "learning_rate": 4.665044599534251e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.852102880179882, + "num_tokens": 114081686.0, + "step": 94870 + }, + { + "entropy": 1.8737225562334061, + "epoch": 0.29411969847151065, + "grad_norm": 8.264700889587402, + "learning_rate": 4.664798751268201e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8497600093483925, + "num_tokens": 114093863.0, + "step": 94880 + }, + { + "entropy": 1.9434197053313256, + "epoch": 0.2941506975965603, + "grad_norm": 8.022650718688965, + "learning_rate": 4.664552941866732e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8454866543412208, + "num_tokens": 114105246.0, + "step": 94890 + }, + { + "entropy": 1.7929591804742813, + "epoch": 0.29418169672161004, + "grad_norm": 4.1385908126831055, + "learning_rate": 4.664307171319604e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.84492729306221, + "num_tokens": 114118876.0, + "step": 94900 + }, + { + "entropy": 1.9504995405673982, + "epoch": 0.2942126958466597, + "grad_norm": 7.590723037719727, + "learning_rate": 4.664061439616583e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8555523350834846, + "num_tokens": 114129331.0, + "step": 94910 + }, + { + "entropy": 1.8454117804765702, + "epoch": 0.29424369497170944, + "grad_norm": 10.14136791229248, + "learning_rate": 4.663815746747437e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8382917091250419, + "num_tokens": 114141760.0, + "step": 94920 + }, + { + "entropy": 1.8560292690992355, + "epoch": 0.2942746940967591, + "grad_norm": 7.97593879699707, + "learning_rate": 4.66357009270194e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8398404493927956, + "num_tokens": 114154144.0, + "step": 94930 + }, + { + "entropy": 1.9188194185495377, + "epoch": 0.29430569322180883, + "grad_norm": 8.06023120880127, + "learning_rate": 4.663324477469867e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8514153569936752, + "num_tokens": 114165232.0, + "step": 94940 + }, + { + "entropy": 1.9558087676763534, + "epoch": 0.2943366923468585, + "grad_norm": 9.450501441955566, + "learning_rate": 4.663078901040999e-06, + "loss": 0.5441, + "mean_token_accuracy": 0.8372850298881531, + "num_tokens": 114176045.0, + "step": 94950 + }, + { + "entropy": 1.8351658523082732, + "epoch": 0.2943676914719082, + "grad_norm": 4.117894649505615, + "learning_rate": 4.662833363405119e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8479064837098121, + "num_tokens": 114189019.0, + "step": 94960 + }, + { + "entropy": 1.8557307586073875, + "epoch": 0.2943986905969579, + "grad_norm": 8.198458671569824, + "learning_rate": 4.662587864552017e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8527988627552986, + "num_tokens": 114201808.0, + "step": 94970 + }, + { + "entropy": 1.927764955163002, + "epoch": 0.2944296897220076, + "grad_norm": 9.65707015991211, + "learning_rate": 4.662342404471482e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8513452783226967, + "num_tokens": 114213290.0, + "step": 94980 + }, + { + "entropy": 1.76514892578125, + "epoch": 0.2944606888470573, + "grad_norm": 8.054937362670898, + "learning_rate": 4.662096983153311e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8645235538482666, + "num_tokens": 114226877.0, + "step": 94990 + }, + { + "entropy": 1.8766354367136955, + "epoch": 0.294491687972107, + "grad_norm": 4.424029350280762, + "learning_rate": 4.661851600587301e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8427970930933952, + "num_tokens": 114239241.0, + "step": 95000 + }, + { + "entropy": 1.9168139308691026, + "epoch": 0.2945226870971567, + "grad_norm": 7.28840970993042, + "learning_rate": 4.661606256763257e-06, + "loss": 0.528, + "mean_token_accuracy": 0.8504084393382072, + "num_tokens": 114250255.0, + "step": 95010 + }, + { + "entropy": 1.7996359452605248, + "epoch": 0.2945536862222064, + "grad_norm": 8.724395751953125, + "learning_rate": 4.661360951670983e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8572775185108185, + "num_tokens": 114262731.0, + "step": 95020 + }, + { + "entropy": 1.847113935649395, + "epoch": 0.29458468534725607, + "grad_norm": 8.312970161437988, + "learning_rate": 4.661115685300293e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8483634769916535, + "num_tokens": 114274768.0, + "step": 95030 + }, + { + "entropy": 1.8922452926635742, + "epoch": 0.2946156844723058, + "grad_norm": 6.005066871643066, + "learning_rate": 4.660870457640998e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.846984113752842, + "num_tokens": 114286772.0, + "step": 95040 + }, + { + "entropy": 1.8506748765707015, + "epoch": 0.29464668359735546, + "grad_norm": 3.484794855117798, + "learning_rate": 4.660625268682915e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8471810176968575, + "num_tokens": 114298457.0, + "step": 95050 + }, + { + "entropy": 1.952257016301155, + "epoch": 0.2946776827224052, + "grad_norm": 9.706525802612305, + "learning_rate": 4.660380118415869e-06, + "loss": 0.5516, + "mean_token_accuracy": 0.8365617975592613, + "num_tokens": 114309656.0, + "step": 95060 + }, + { + "entropy": 1.854885457456112, + "epoch": 0.29470868184745486, + "grad_norm": 7.192900657653809, + "learning_rate": 4.660135006829682e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8500230267643929, + "num_tokens": 114321375.0, + "step": 95070 + }, + { + "entropy": 1.8961684226989746, + "epoch": 0.2947396809725045, + "grad_norm": 8.305758476257324, + "learning_rate": 4.659889933914185e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8406580999493599, + "num_tokens": 114333189.0, + "step": 95080 + }, + { + "entropy": 1.914514508843422, + "epoch": 0.29477068009755425, + "grad_norm": 7.5317702293396, + "learning_rate": 4.65964489965921e-06, + "loss": 0.5174, + "mean_token_accuracy": 0.8434663847088814, + "num_tokens": 114344937.0, + "step": 95090 + }, + { + "entropy": 1.8632069528102875, + "epoch": 0.2948016792226039, + "grad_norm": 8.437536239624023, + "learning_rate": 4.659399904054594e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8462415754795074, + "num_tokens": 114357269.0, + "step": 95100 + }, + { + "entropy": 1.9126679688692092, + "epoch": 0.29483267834765364, + "grad_norm": 4.100780010223389, + "learning_rate": 4.6591549470901755e-06, + "loss": 0.5264, + "mean_token_accuracy": 0.8231175735592842, + "num_tokens": 114369778.0, + "step": 95110 + }, + { + "entropy": 1.84993801638484, + "epoch": 0.2948636774727033, + "grad_norm": 8.060639381408691, + "learning_rate": 4.6589100287558015e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8632707759737969, + "num_tokens": 114383231.0, + "step": 95120 + }, + { + "entropy": 1.9371715486049652, + "epoch": 0.29489467659775304, + "grad_norm": 7.608775615692139, + "learning_rate": 4.658665149041318e-06, + "loss": 0.5282, + "mean_token_accuracy": 0.8381532579660416, + "num_tokens": 114393685.0, + "step": 95130 + }, + { + "entropy": 1.8963730677962303, + "epoch": 0.2949256757228027, + "grad_norm": 7.8309550285339355, + "learning_rate": 4.6584203079365756e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8506500810384751, + "num_tokens": 114404986.0, + "step": 95140 + }, + { + "entropy": 1.8873082131147385, + "epoch": 0.29495667484785243, + "grad_norm": 8.878643989562988, + "learning_rate": 4.658175505431431e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8476304590702057, + "num_tokens": 114416424.0, + "step": 95150 + }, + { + "entropy": 1.7684010401368142, + "epoch": 0.2949876739729021, + "grad_norm": 4.694974422454834, + "learning_rate": 4.657930741515742e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.853473761677742, + "num_tokens": 114430299.0, + "step": 95160 + }, + { + "entropy": 1.8848821252584458, + "epoch": 0.2950186730979518, + "grad_norm": 9.460819244384766, + "learning_rate": 4.657686016179372e-06, + "loss": 0.5366, + "mean_token_accuracy": 0.8388624176383018, + "num_tokens": 114441497.0, + "step": 95170 + }, + { + "entropy": 1.8895638823509215, + "epoch": 0.2950496722230015, + "grad_norm": 7.845742702484131, + "learning_rate": 4.6574413294121865e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.840872798860073, + "num_tokens": 114453312.0, + "step": 95180 + }, + { + "entropy": 1.8671682730317116, + "epoch": 0.2950806713480512, + "grad_norm": 8.617532730102539, + "learning_rate": 4.657196681204057e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.858017711341381, + "num_tokens": 114464871.0, + "step": 95190 + }, + { + "entropy": 1.8859140858054162, + "epoch": 0.2951116704731009, + "grad_norm": 8.359928131103516, + "learning_rate": 4.656952071544857e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8446494430303574, + "num_tokens": 114476109.0, + "step": 95200 + }, + { + "entropy": 1.9119530409574508, + "epoch": 0.2951426695981506, + "grad_norm": 9.479840278625488, + "learning_rate": 4.656707500424463e-06, + "loss": 0.5339, + "mean_token_accuracy": 0.8339703768491745, + "num_tokens": 114487221.0, + "step": 95210 + }, + { + "entropy": 1.8878633320331573, + "epoch": 0.2951736687232003, + "grad_norm": 3.638047218322754, + "learning_rate": 4.656462967832758e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8457856431603432, + "num_tokens": 114498809.0, + "step": 95220 + }, + { + "entropy": 1.893609069287777, + "epoch": 0.29520466784825, + "grad_norm": 7.487365245819092, + "learning_rate": 4.656218473759623e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.849824258685112, + "num_tokens": 114510866.0, + "step": 95230 + }, + { + "entropy": 1.9306818440556526, + "epoch": 0.29523566697329967, + "grad_norm": 7.329023838043213, + "learning_rate": 4.655974018194953e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8432904377579689, + "num_tokens": 114521966.0, + "step": 95240 + }, + { + "entropy": 1.9246254205703734, + "epoch": 0.2952666660983494, + "grad_norm": 8.638476371765137, + "learning_rate": 4.655729601128635e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8454683497548103, + "num_tokens": 114533785.0, + "step": 95250 + }, + { + "entropy": 1.8969884738326073, + "epoch": 0.29529766522339906, + "grad_norm": 7.14600944519043, + "learning_rate": 4.655485222550568e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8438623622059822, + "num_tokens": 114545791.0, + "step": 95260 + }, + { + "entropy": 1.8377904728055001, + "epoch": 0.2953286643484488, + "grad_norm": 6.43510103225708, + "learning_rate": 4.65524088245065e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8514227852225303, + "num_tokens": 114558462.0, + "step": 95270 + }, + { + "entropy": 1.9160344362258912, + "epoch": 0.29535966347349846, + "grad_norm": 10.534104347229004, + "learning_rate": 4.654996580818786e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.845592126250267, + "num_tokens": 114569485.0, + "step": 95280 + }, + { + "entropy": 1.8697632998228073, + "epoch": 0.2953906625985482, + "grad_norm": 8.796942710876465, + "learning_rate": 4.654752317644883e-06, + "loss": 0.5373, + "mean_token_accuracy": 0.8387501269578934, + "num_tokens": 114581277.0, + "step": 95290 + }, + { + "entropy": 1.831010890007019, + "epoch": 0.29542166172359785, + "grad_norm": 9.258670806884766, + "learning_rate": 4.654508092918852e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8350206702947617, + "num_tokens": 114593427.0, + "step": 95300 + }, + { + "entropy": 1.8626672565937041, + "epoch": 0.2954526608486475, + "grad_norm": 4.265320777893066, + "learning_rate": 4.6542639066306065e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8505603194236755, + "num_tokens": 114605252.0, + "step": 95310 + }, + { + "entropy": 1.7814742475748062, + "epoch": 0.29548365997369724, + "grad_norm": 9.163718223571777, + "learning_rate": 4.654019758770067e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8585087850689888, + "num_tokens": 114618618.0, + "step": 95320 + }, + { + "entropy": 1.8076193630695343, + "epoch": 0.2955146590987469, + "grad_norm": 7.5134477615356445, + "learning_rate": 4.653775649327154e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8535345688462257, + "num_tokens": 114632466.0, + "step": 95330 + }, + { + "entropy": 1.878467933833599, + "epoch": 0.29554565822379664, + "grad_norm": 3.287616729736328, + "learning_rate": 4.653531578291793e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8520422443747521, + "num_tokens": 114644129.0, + "step": 95340 + }, + { + "entropy": 1.7941744670271873, + "epoch": 0.2955766573488463, + "grad_norm": 8.888310432434082, + "learning_rate": 4.653287545653915e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8592835500836372, + "num_tokens": 114657506.0, + "step": 95350 + }, + { + "entropy": 1.842213323712349, + "epoch": 0.29560765647389603, + "grad_norm": 8.71075439453125, + "learning_rate": 4.653043551403452e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8478049322962761, + "num_tokens": 114669587.0, + "step": 95360 + }, + { + "entropy": 1.8689180329442023, + "epoch": 0.2956386555989457, + "grad_norm": 7.912755489349365, + "learning_rate": 4.652799595530342e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8550763368606568, + "num_tokens": 114682059.0, + "step": 95370 + }, + { + "entropy": 1.9113721013069154, + "epoch": 0.2956696547239954, + "grad_norm": 9.693363189697266, + "learning_rate": 4.652555678024524e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.853804387152195, + "num_tokens": 114693470.0, + "step": 95380 + }, + { + "entropy": 1.8694558992981911, + "epoch": 0.2957006538490451, + "grad_norm": 8.779112815856934, + "learning_rate": 4.652311798875943e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8518740639090538, + "num_tokens": 114705358.0, + "step": 95390 + }, + { + "entropy": 1.8623104050755501, + "epoch": 0.2957316529740948, + "grad_norm": 9.614526748657227, + "learning_rate": 4.652067958074547e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8510076805949212, + "num_tokens": 114718125.0, + "step": 95400 + }, + { + "entropy": 1.8787400797009468, + "epoch": 0.2957626520991445, + "grad_norm": 8.458456039428711, + "learning_rate": 4.651824155610288e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8569131478667259, + "num_tokens": 114729728.0, + "step": 95410 + }, + { + "entropy": 1.9131328269839287, + "epoch": 0.2957936512241942, + "grad_norm": 9.49421215057373, + "learning_rate": 4.6515803914731215e-06, + "loss": 0.472, + "mean_token_accuracy": 0.845634426176548, + "num_tokens": 114742431.0, + "step": 95420 + }, + { + "entropy": 1.7844593867659568, + "epoch": 0.2958246503492439, + "grad_norm": 8.483735084533691, + "learning_rate": 4.651336665653005e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8685527250170708, + "num_tokens": 114755668.0, + "step": 95430 + }, + { + "entropy": 1.922539620101452, + "epoch": 0.2958556494742936, + "grad_norm": 7.755213260650635, + "learning_rate": 4.651092978139902e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.8362011343240738, + "num_tokens": 114766944.0, + "step": 95440 + }, + { + "entropy": 1.8400570914149283, + "epoch": 0.29588664859934327, + "grad_norm": 7.850890636444092, + "learning_rate": 4.650849328923779e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8529717952013016, + "num_tokens": 114779470.0, + "step": 95450 + }, + { + "entropy": 1.8972987502813339, + "epoch": 0.295917647724393, + "grad_norm": 10.900968551635742, + "learning_rate": 4.650605717994607e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8552507728338241, + "num_tokens": 114791012.0, + "step": 95460 + }, + { + "entropy": 1.904093398153782, + "epoch": 0.29594864684944266, + "grad_norm": 7.401911735534668, + "learning_rate": 4.650362145342358e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8534610956907273, + "num_tokens": 114802550.0, + "step": 95470 + }, + { + "entropy": 1.8667383641004562, + "epoch": 0.2959796459744924, + "grad_norm": 7.704689025878906, + "learning_rate": 4.650118610957009e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8388277173042298, + "num_tokens": 114814877.0, + "step": 95480 + }, + { + "entropy": 1.8931495070457458, + "epoch": 0.29601064509954206, + "grad_norm": 9.703409194946289, + "learning_rate": 4.649875114828544e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.8397621661424637, + "num_tokens": 114826671.0, + "step": 95490 + }, + { + "entropy": 1.949988493323326, + "epoch": 0.2960416442245918, + "grad_norm": 7.655276298522949, + "learning_rate": 4.6496316569469455e-06, + "loss": 0.515, + "mean_token_accuracy": 0.8395925149321556, + "num_tokens": 114837908.0, + "step": 95500 + }, + { + "entropy": 1.9334761276841164, + "epoch": 0.29607264334964145, + "grad_norm": 8.131454467773438, + "learning_rate": 4.649388237302203e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.8403454497456551, + "num_tokens": 114849471.0, + "step": 95510 + }, + { + "entropy": 1.902390295267105, + "epoch": 0.2961036424746912, + "grad_norm": 8.340156555175781, + "learning_rate": 4.6491448558843065e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8446046605706214, + "num_tokens": 114861012.0, + "step": 95520 + }, + { + "entropy": 1.9234414175152779, + "epoch": 0.29613464159974084, + "grad_norm": 9.107486724853516, + "learning_rate": 4.648901512683255e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.84765195697546, + "num_tokens": 114872674.0, + "step": 95530 + }, + { + "entropy": 1.8331530943512917, + "epoch": 0.29616564072479057, + "grad_norm": 8.991997718811035, + "learning_rate": 4.648658207689045e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8575505539774895, + "num_tokens": 114885367.0, + "step": 95540 + }, + { + "entropy": 1.8234670519828797, + "epoch": 0.29619663984984024, + "grad_norm": 5.19529914855957, + "learning_rate": 4.648414940891681e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8523637726902962, + "num_tokens": 114899055.0, + "step": 95550 + }, + { + "entropy": 1.8568585798144341, + "epoch": 0.2962276389748899, + "grad_norm": 4.264057636260986, + "learning_rate": 4.648171712281169e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8576111733913422, + "num_tokens": 114911683.0, + "step": 95560 + }, + { + "entropy": 1.9596162497997285, + "epoch": 0.29625863809993963, + "grad_norm": 9.092710494995117, + "learning_rate": 4.64792852184752e-06, + "loss": 0.5458, + "mean_token_accuracy": 0.8393025726079941, + "num_tokens": 114922365.0, + "step": 95570 + }, + { + "entropy": 1.8301913827657699, + "epoch": 0.2962896372249893, + "grad_norm": 7.440187454223633, + "learning_rate": 4.647685369580747e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.857902692258358, + "num_tokens": 114935376.0, + "step": 95580 + }, + { + "entropy": 1.9472972556948662, + "epoch": 0.296320636350039, + "grad_norm": 8.745131492614746, + "learning_rate": 4.64744225547087e-06, + "loss": 0.5434, + "mean_token_accuracy": 0.828769737482071, + "num_tokens": 114946969.0, + "step": 95590 + }, + { + "entropy": 1.935869987308979, + "epoch": 0.2963516354750887, + "grad_norm": 7.479891777038574, + "learning_rate": 4.647199179507909e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8483911573886871, + "num_tokens": 114958473.0, + "step": 95600 + }, + { + "entropy": 1.899821263551712, + "epoch": 0.2963826346001384, + "grad_norm": 7.5076680183410645, + "learning_rate": 4.646956141681888e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.845979979634285, + "num_tokens": 114969919.0, + "step": 95610 + }, + { + "entropy": 1.9035681039094925, + "epoch": 0.2964136337251881, + "grad_norm": 8.332342147827148, + "learning_rate": 4.646713141982837e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8571434035897255, + "num_tokens": 114982144.0, + "step": 95620 + }, + { + "entropy": 1.924658827483654, + "epoch": 0.2964446328502378, + "grad_norm": 9.30444622039795, + "learning_rate": 4.646470180400788e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8478532791137695, + "num_tokens": 114993947.0, + "step": 95630 + }, + { + "entropy": 1.8619504556059838, + "epoch": 0.2964756319752875, + "grad_norm": 5.051435470581055, + "learning_rate": 4.646227256925777e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8470785349607468, + "num_tokens": 115006275.0, + "step": 95640 + }, + { + "entropy": 1.8626134410500526, + "epoch": 0.2965066311003372, + "grad_norm": 4.338906288146973, + "learning_rate": 4.645984371547844e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8533647701144218, + "num_tokens": 115018973.0, + "step": 95650 + }, + { + "entropy": 1.8923157513141633, + "epoch": 0.2965376302253869, + "grad_norm": 8.258339881896973, + "learning_rate": 4.645741524257032e-06, + "loss": 0.5561, + "mean_token_accuracy": 0.8394008025527, + "num_tokens": 115030711.0, + "step": 95660 + }, + { + "entropy": 1.9015984013676643, + "epoch": 0.2965686293504366, + "grad_norm": 10.353814125061035, + "learning_rate": 4.645498715043387e-06, + "loss": 0.5335, + "mean_token_accuracy": 0.8356149435043335, + "num_tokens": 115042545.0, + "step": 95670 + }, + { + "entropy": 1.9278481543064117, + "epoch": 0.29659962847548627, + "grad_norm": 8.37845516204834, + "learning_rate": 4.645255943896961e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8446251779794693, + "num_tokens": 115054273.0, + "step": 95680 + }, + { + "entropy": 1.9885007172822953, + "epoch": 0.296630627600536, + "grad_norm": 8.631330490112305, + "learning_rate": 4.645013210807806e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8454102978110314, + "num_tokens": 115064894.0, + "step": 95690 + }, + { + "entropy": 1.8312623113393784, + "epoch": 0.29666162672558566, + "grad_norm": 8.99056339263916, + "learning_rate": 4.644770515765983e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8604427292943001, + "num_tokens": 115077844.0, + "step": 95700 + }, + { + "entropy": 1.8540172457695008, + "epoch": 0.2966926258506354, + "grad_norm": 2.9684078693389893, + "learning_rate": 4.64452785876155e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.844263382256031, + "num_tokens": 115090181.0, + "step": 95710 + }, + { + "entropy": 1.9741147756576538, + "epoch": 0.29672362497568505, + "grad_norm": 8.870850563049316, + "learning_rate": 4.644285239784575e-06, + "loss": 0.5372, + "mean_token_accuracy": 0.8312768504023552, + "num_tokens": 115100406.0, + "step": 95720 + }, + { + "entropy": 1.9829291343688964, + "epoch": 0.2967546241007348, + "grad_norm": 3.856050729751587, + "learning_rate": 4.644042658825126e-06, + "loss": 0.538, + "mean_token_accuracy": 0.8350884988903999, + "num_tokens": 115111439.0, + "step": 95730 + }, + { + "entropy": 1.9604087606072427, + "epoch": 0.29678562322578445, + "grad_norm": 8.375349998474121, + "learning_rate": 4.643800115873274e-06, + "loss": 0.5206, + "mean_token_accuracy": 0.8406010925769806, + "num_tokens": 115122511.0, + "step": 95740 + }, + { + "entropy": 1.8680056795477866, + "epoch": 0.29681662235083417, + "grad_norm": 3.6315958499908447, + "learning_rate": 4.643557610919095e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.851527801156044, + "num_tokens": 115134569.0, + "step": 95750 + }, + { + "entropy": 1.860308752954006, + "epoch": 0.29684762147588384, + "grad_norm": 7.783955097198486, + "learning_rate": 4.643315143952671e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8580260217189789, + "num_tokens": 115146701.0, + "step": 95760 + }, + { + "entropy": 1.9309222459793092, + "epoch": 0.29687862060093356, + "grad_norm": 8.806280136108398, + "learning_rate": 4.643072714964084e-06, + "loss": 0.5613, + "mean_token_accuracy": 0.836700190603733, + "num_tokens": 115157808.0, + "step": 95770 + }, + { + "entropy": 1.9781962364912034, + "epoch": 0.29690961972598323, + "grad_norm": 7.529308319091797, + "learning_rate": 4.64283032394342e-06, + "loss": 0.524, + "mean_token_accuracy": 0.8430225938558579, + "num_tokens": 115168935.0, + "step": 95780 + }, + { + "entropy": 1.8590948715806008, + "epoch": 0.29694061885103296, + "grad_norm": 8.396674156188965, + "learning_rate": 4.642587970880769e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8598534435033798, + "num_tokens": 115180678.0, + "step": 95790 + }, + { + "entropy": 1.8222449347376823, + "epoch": 0.2969716179760826, + "grad_norm": 5.250387191772461, + "learning_rate": 4.642345655766227e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8539089500904083, + "num_tokens": 115193480.0, + "step": 95800 + }, + { + "entropy": 1.8699547871947289, + "epoch": 0.2970026171011323, + "grad_norm": 10.275331497192383, + "learning_rate": 4.642103378589891e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8331056758761406, + "num_tokens": 115205319.0, + "step": 95810 + }, + { + "entropy": 1.8728644341230392, + "epoch": 0.297033616226182, + "grad_norm": 3.5322608947753906, + "learning_rate": 4.641861139341863e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8516603350639343, + "num_tokens": 115217604.0, + "step": 95820 + }, + { + "entropy": 1.9369212195277214, + "epoch": 0.2970646153512317, + "grad_norm": 7.612384796142578, + "learning_rate": 4.641618938012246e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8410941004753113, + "num_tokens": 115229163.0, + "step": 95830 + }, + { + "entropy": 1.930888931453228, + "epoch": 0.2970956144762814, + "grad_norm": 8.152316093444824, + "learning_rate": 4.64137677459115e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.8360361099243164, + "num_tokens": 115241832.0, + "step": 95840 + }, + { + "entropy": 1.860244083404541, + "epoch": 0.2971266136013311, + "grad_norm": 2.479391574859619, + "learning_rate": 4.641134649068688e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8496681988239289, + "num_tokens": 115254340.0, + "step": 95850 + }, + { + "entropy": 1.8632259294390678, + "epoch": 0.2971576127263808, + "grad_norm": 4.353872299194336, + "learning_rate": 4.640892561434973e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8551715791225434, + "num_tokens": 115266838.0, + "step": 95860 + }, + { + "entropy": 1.999795663356781, + "epoch": 0.2971886118514305, + "grad_norm": 9.249789237976074, + "learning_rate": 4.640650511680128e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.8440039098262787, + "num_tokens": 115277215.0, + "step": 95870 + }, + { + "entropy": 1.954987433552742, + "epoch": 0.2972196109764802, + "grad_norm": 13.373074531555176, + "learning_rate": 4.640408499794271e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.8423504129052162, + "num_tokens": 115287841.0, + "step": 95880 + }, + { + "entropy": 1.821416835486889, + "epoch": 0.29725061010152987, + "grad_norm": 3.3793182373046875, + "learning_rate": 4.640166525767535e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8553699105978012, + "num_tokens": 115300828.0, + "step": 95890 + }, + { + "entropy": 1.9414212331175804, + "epoch": 0.2972816092265796, + "grad_norm": 3.7656142711639404, + "learning_rate": 4.639924589590045e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8337204396724701, + "num_tokens": 115312598.0, + "step": 95900 + }, + { + "entropy": 1.9279131293296814, + "epoch": 0.29731260835162926, + "grad_norm": 10.359955787658691, + "learning_rate": 4.639682691251938e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8390220627188683, + "num_tokens": 115324584.0, + "step": 95910 + }, + { + "entropy": 1.859154610335827, + "epoch": 0.297343607476679, + "grad_norm": 7.873774528503418, + "learning_rate": 4.6394408307433494e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8545582070946693, + "num_tokens": 115337114.0, + "step": 95920 + }, + { + "entropy": 1.9387451350688933, + "epoch": 0.29737460660172865, + "grad_norm": 10.652080535888672, + "learning_rate": 4.639199008054421e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8364695340394974, + "num_tokens": 115348545.0, + "step": 95930 + }, + { + "entropy": 2.010096028447151, + "epoch": 0.2974056057267784, + "grad_norm": 9.396890640258789, + "learning_rate": 4.638957223175298e-06, + "loss": 0.5485, + "mean_token_accuracy": 0.8352597668766976, + "num_tokens": 115359072.0, + "step": 95940 + }, + { + "entropy": 1.845473076403141, + "epoch": 0.29743660485182805, + "grad_norm": 7.396862030029297, + "learning_rate": 4.638715476096127e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8523558273911476, + "num_tokens": 115372520.0, + "step": 95950 + }, + { + "entropy": 1.8642360031604768, + "epoch": 0.29746760397687777, + "grad_norm": 3.6565442085266113, + "learning_rate": 4.638473766807061e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8511311173439026, + "num_tokens": 115385427.0, + "step": 95960 + }, + { + "entropy": 1.9817214131355285, + "epoch": 0.29749860310192744, + "grad_norm": 9.105077743530273, + "learning_rate": 4.638232095298256e-06, + "loss": 0.5501, + "mean_token_accuracy": 0.8311897858977317, + "num_tokens": 115396089.0, + "step": 95970 + }, + { + "entropy": 1.8148702546954154, + "epoch": 0.29752960222697716, + "grad_norm": 4.5015106201171875, + "learning_rate": 4.6379904615598705e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8546622708439827, + "num_tokens": 115408984.0, + "step": 95980 + }, + { + "entropy": 1.8370864361524581, + "epoch": 0.29756060135202683, + "grad_norm": 8.784217834472656, + "learning_rate": 4.637748865582065e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8529459208250045, + "num_tokens": 115421841.0, + "step": 95990 + }, + { + "entropy": 1.9331490993499756, + "epoch": 0.29759160047707656, + "grad_norm": 8.97538948059082, + "learning_rate": 4.637507307355009e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8470474123954773, + "num_tokens": 115432927.0, + "step": 96000 + }, + { + "entropy": 1.8707862794399261, + "epoch": 0.2976225996021262, + "grad_norm": 8.228986740112305, + "learning_rate": 4.63726578686887e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.857261911034584, + "num_tokens": 115444664.0, + "step": 96010 + }, + { + "entropy": 1.9656662926077844, + "epoch": 0.29765359872717595, + "grad_norm": 8.407549858093262, + "learning_rate": 4.637024304113822e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.847040732204914, + "num_tokens": 115456093.0, + "step": 96020 + }, + { + "entropy": 2.015861451625824, + "epoch": 0.2976845978522256, + "grad_norm": 8.171123504638672, + "learning_rate": 4.636782859080041e-06, + "loss": 0.5187, + "mean_token_accuracy": 0.8443436101078987, + "num_tokens": 115467201.0, + "step": 96030 + }, + { + "entropy": 2.0067085534334184, + "epoch": 0.29771559697727534, + "grad_norm": 9.067146301269531, + "learning_rate": 4.636541451757711e-06, + "loss": 0.5731, + "mean_token_accuracy": 0.8257392793893814, + "num_tokens": 115478314.0, + "step": 96040 + }, + { + "entropy": 1.8171662881970405, + "epoch": 0.297746596102325, + "grad_norm": 9.887383460998535, + "learning_rate": 4.636300082137011e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.860226346552372, + "num_tokens": 115490891.0, + "step": 96050 + }, + { + "entropy": 1.8969256058335304, + "epoch": 0.2977775952273747, + "grad_norm": 3.9007980823516846, + "learning_rate": 4.636058750208131e-06, + "loss": 0.5259, + "mean_token_accuracy": 0.8481943354010582, + "num_tokens": 115502955.0, + "step": 96060 + }, + { + "entropy": 1.855407053232193, + "epoch": 0.2978085943524244, + "grad_norm": 7.683308124542236, + "learning_rate": 4.635817455961264e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8527497768402099, + "num_tokens": 115516432.0, + "step": 96070 + }, + { + "entropy": 1.9460287556052207, + "epoch": 0.2978395934774741, + "grad_norm": 10.421119689941406, + "learning_rate": 4.635576199386602e-06, + "loss": 0.556, + "mean_token_accuracy": 0.8432202488183975, + "num_tokens": 115528309.0, + "step": 96080 + }, + { + "entropy": 1.9160909160971642, + "epoch": 0.2978705926025238, + "grad_norm": 7.117789268493652, + "learning_rate": 4.635334980474345e-06, + "loss": 0.5228, + "mean_token_accuracy": 0.8440079033374787, + "num_tokens": 115539915.0, + "step": 96090 + }, + { + "entropy": 1.9123343035578728, + "epoch": 0.29790159172757347, + "grad_norm": 7.615013122558594, + "learning_rate": 4.635093799214693e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8447704032063484, + "num_tokens": 115551725.0, + "step": 96100 + }, + { + "entropy": 1.8921069249510765, + "epoch": 0.2979325908526232, + "grad_norm": 8.272263526916504, + "learning_rate": 4.634852655597854e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8503027930855751, + "num_tokens": 115563104.0, + "step": 96110 + }, + { + "entropy": 1.9330130770802498, + "epoch": 0.29796358997767286, + "grad_norm": 7.9516143798828125, + "learning_rate": 4.634611549614036e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.846668167412281, + "num_tokens": 115574420.0, + "step": 96120 + }, + { + "entropy": 1.930530808866024, + "epoch": 0.2979945891027226, + "grad_norm": 10.271747589111328, + "learning_rate": 4.634370481253451e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.8329700931906701, + "num_tokens": 115586106.0, + "step": 96130 + }, + { + "entropy": 1.9371146902441978, + "epoch": 0.29802558822777225, + "grad_norm": 8.164349555969238, + "learning_rate": 4.634129450506316e-06, + "loss": 0.5208, + "mean_token_accuracy": 0.8380087822675705, + "num_tokens": 115597047.0, + "step": 96140 + }, + { + "entropy": 1.93800760358572, + "epoch": 0.298056587352822, + "grad_norm": 8.42188549041748, + "learning_rate": 4.633888457362851e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8453196853399276, + "num_tokens": 115608165.0, + "step": 96150 + }, + { + "entropy": 1.8701598271727562, + "epoch": 0.29808758647787165, + "grad_norm": 8.132607460021973, + "learning_rate": 4.633647501813278e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8482308521866798, + "num_tokens": 115620759.0, + "step": 96160 + }, + { + "entropy": 1.8687379583716393, + "epoch": 0.29811858560292137, + "grad_norm": 3.990684747695923, + "learning_rate": 4.633406583847825e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8474869653582573, + "num_tokens": 115633358.0, + "step": 96170 + }, + { + "entropy": 1.8821706905961038, + "epoch": 0.29814958472797104, + "grad_norm": 7.216353416442871, + "learning_rate": 4.633165703456723e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8504739284515381, + "num_tokens": 115646131.0, + "step": 96180 + }, + { + "entropy": 1.98152334690094, + "epoch": 0.29818058385302076, + "grad_norm": 9.739982604980469, + "learning_rate": 4.6329248606302045e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8421538904309273, + "num_tokens": 115656444.0, + "step": 96190 + }, + { + "entropy": 1.7937684386968613, + "epoch": 0.29821158297807043, + "grad_norm": 4.097762107849121, + "learning_rate": 4.6326840553585075e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8663532704114913, + "num_tokens": 115669849.0, + "step": 96200 + }, + { + "entropy": 1.960816130042076, + "epoch": 0.29824258210312016, + "grad_norm": 7.547084331512451, + "learning_rate": 4.632443287631873e-06, + "loss": 0.538, + "mean_token_accuracy": 0.8430081829428673, + "num_tokens": 115680669.0, + "step": 96210 + }, + { + "entropy": 1.8645331501960754, + "epoch": 0.2982735812281698, + "grad_norm": 4.153885364532471, + "learning_rate": 4.632202557440546e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8473611772060394, + "num_tokens": 115693855.0, + "step": 96220 + }, + { + "entropy": 1.932821586728096, + "epoch": 0.29830458035321955, + "grad_norm": 6.955663681030273, + "learning_rate": 4.631961864774775e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.853759004175663, + "num_tokens": 115705441.0, + "step": 96230 + }, + { + "entropy": 1.8800847560167313, + "epoch": 0.2983355794782692, + "grad_norm": 10.34147834777832, + "learning_rate": 4.631721209624811e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.8399926438927651, + "num_tokens": 115718673.0, + "step": 96240 + }, + { + "entropy": 1.868877911567688, + "epoch": 0.29836657860331894, + "grad_norm": 9.02034854888916, + "learning_rate": 4.63148059198091e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8474683433771133, + "num_tokens": 115731147.0, + "step": 96250 + }, + { + "entropy": 1.7052193224430083, + "epoch": 0.2983975777283686, + "grad_norm": 4.165556907653809, + "learning_rate": 4.63124001183333e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8648575574159623, + "num_tokens": 115745846.0, + "step": 96260 + }, + { + "entropy": 1.8481420859694482, + "epoch": 0.29842857685341834, + "grad_norm": 8.495997428894043, + "learning_rate": 4.630999469172333e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8595449611544609, + "num_tokens": 115758179.0, + "step": 96270 + }, + { + "entropy": 1.8788498505949973, + "epoch": 0.298459575978468, + "grad_norm": 7.667300701141357, + "learning_rate": 4.630758963988187e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8503319293260574, + "num_tokens": 115769809.0, + "step": 96280 + }, + { + "entropy": 1.9287839099764823, + "epoch": 0.29849057510351773, + "grad_norm": 8.444547653198242, + "learning_rate": 4.63051849627116e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8440607368946076, + "num_tokens": 115781425.0, + "step": 96290 + }, + { + "entropy": 1.9038120612502099, + "epoch": 0.2985215742285674, + "grad_norm": 7.381701946258545, + "learning_rate": 4.630278066011525e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8477517932653427, + "num_tokens": 115793181.0, + "step": 96300 + }, + { + "entropy": 1.8960516452789307, + "epoch": 0.29855257335361707, + "grad_norm": 7.266051292419434, + "learning_rate": 4.630037673199559e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8412499457597733, + "num_tokens": 115805564.0, + "step": 96310 + }, + { + "entropy": 1.9981954544782639, + "epoch": 0.2985835724786668, + "grad_norm": 8.080859184265137, + "learning_rate": 4.62979731782554e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8427246376872063, + "num_tokens": 115816757.0, + "step": 96320 + }, + { + "entropy": 1.824733631312847, + "epoch": 0.29861457160371646, + "grad_norm": 3.8523154258728027, + "learning_rate": 4.629556999879755e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8527489557862282, + "num_tokens": 115829066.0, + "step": 96330 + }, + { + "entropy": 1.9180657356977462, + "epoch": 0.2986455707287662, + "grad_norm": 9.480389595031738, + "learning_rate": 4.629316719352488e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8383275628089905, + "num_tokens": 115840829.0, + "step": 96340 + }, + { + "entropy": 1.8489946901798249, + "epoch": 0.29867656985381585, + "grad_norm": 4.126688003540039, + "learning_rate": 4.629076476234032e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8490898162126541, + "num_tokens": 115853917.0, + "step": 96350 + }, + { + "entropy": 1.8371913895010947, + "epoch": 0.2987075689788656, + "grad_norm": 3.543614387512207, + "learning_rate": 4.628836270514679e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8514250427484512, + "num_tokens": 115867183.0, + "step": 96360 + }, + { + "entropy": 1.853197917342186, + "epoch": 0.29873856810391525, + "grad_norm": 3.921837329864502, + "learning_rate": 4.628596102184729e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8422363549470901, + "num_tokens": 115879237.0, + "step": 96370 + }, + { + "entropy": 1.818500466644764, + "epoch": 0.29876956722896497, + "grad_norm": 8.878031730651855, + "learning_rate": 4.6283559712344825e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8616446629166603, + "num_tokens": 115892081.0, + "step": 96380 + }, + { + "entropy": 1.9318446889519691, + "epoch": 0.29880056635401464, + "grad_norm": 8.830316543579102, + "learning_rate": 4.628115877654243e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.8325408533215523, + "num_tokens": 115903514.0, + "step": 96390 + }, + { + "entropy": 1.8660483628511428, + "epoch": 0.29883156547906436, + "grad_norm": 8.622795104980469, + "learning_rate": 4.627875821434319e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8535114452242851, + "num_tokens": 115915306.0, + "step": 96400 + }, + { + "entropy": 1.8813782200217246, + "epoch": 0.29886256460411403, + "grad_norm": 9.0267333984375, + "learning_rate": 4.627635802565024e-06, + "loss": 0.499, + "mean_token_accuracy": 0.846407724916935, + "num_tokens": 115927498.0, + "step": 96410 + }, + { + "entropy": 1.9499717622995376, + "epoch": 0.29889356372916376, + "grad_norm": 9.516846656799316, + "learning_rate": 4.627395821036672e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.853526496887207, + "num_tokens": 115938787.0, + "step": 96420 + }, + { + "entropy": 1.9189068511128426, + "epoch": 0.2989245628542134, + "grad_norm": 7.334571361541748, + "learning_rate": 4.6271558768395816e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8488484725356102, + "num_tokens": 115950064.0, + "step": 96430 + }, + { + "entropy": 1.8531944811344148, + "epoch": 0.29895556197926315, + "grad_norm": 8.624135971069336, + "learning_rate": 4.6269159699640755e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8516410380601883, + "num_tokens": 115962309.0, + "step": 96440 + }, + { + "entropy": 1.903770998120308, + "epoch": 0.2989865611043128, + "grad_norm": 8.187854766845703, + "learning_rate": 4.62667610040048e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8519768640398979, + "num_tokens": 115973845.0, + "step": 96450 + }, + { + "entropy": 1.8115949898958206, + "epoch": 0.29901756022936254, + "grad_norm": 8.5139799118042, + "learning_rate": 4.626436268139122e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8568135395646095, + "num_tokens": 115986832.0, + "step": 96460 + }, + { + "entropy": 1.979418794810772, + "epoch": 0.2990485593544122, + "grad_norm": 7.29787015914917, + "learning_rate": 4.626196473170338e-06, + "loss": 0.5356, + "mean_token_accuracy": 0.8345787361264229, + "num_tokens": 115997910.0, + "step": 96470 + }, + { + "entropy": 1.9397563070058823, + "epoch": 0.29907955847946194, + "grad_norm": 8.139474868774414, + "learning_rate": 4.625956715484463e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8472580030560494, + "num_tokens": 116009904.0, + "step": 96480 + }, + { + "entropy": 1.8948927074670792, + "epoch": 0.2991105576045116, + "grad_norm": 9.053715705871582, + "learning_rate": 4.625716995071836e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8533863142132759, + "num_tokens": 116021106.0, + "step": 96490 + }, + { + "entropy": 1.8922299653291703, + "epoch": 0.29914155672956133, + "grad_norm": 9.0900239944458, + "learning_rate": 4.6254773119228004e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8474109992384911, + "num_tokens": 116033322.0, + "step": 96500 + }, + { + "entropy": 1.847077339887619, + "epoch": 0.299172555854611, + "grad_norm": 3.035865068435669, + "learning_rate": 4.625237666027704e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8617709457874299, + "num_tokens": 116045141.0, + "step": 96510 + }, + { + "entropy": 1.8434469774365425, + "epoch": 0.2992035549796607, + "grad_norm": 10.367220878601074, + "learning_rate": 4.624998057376896e-06, + "loss": 0.43, + "mean_token_accuracy": 0.859175056219101, + "num_tokens": 116057311.0, + "step": 96520 + }, + { + "entropy": 1.922757549583912, + "epoch": 0.2992345541047104, + "grad_norm": 7.6684417724609375, + "learning_rate": 4.624758485960731e-06, + "loss": 0.5211, + "mean_token_accuracy": 0.8412060752511025, + "num_tokens": 116068781.0, + "step": 96530 + }, + { + "entropy": 1.874141050875187, + "epoch": 0.2992655532297601, + "grad_norm": 5.786795139312744, + "learning_rate": 4.624518951769568e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8453152433037758, + "num_tokens": 116080384.0, + "step": 96540 + }, + { + "entropy": 1.9315826326608658, + "epoch": 0.2992965523548098, + "grad_norm": 7.588706016540527, + "learning_rate": 4.624279454793765e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.84673622995615, + "num_tokens": 116090980.0, + "step": 96550 + }, + { + "entropy": 1.910808216035366, + "epoch": 0.29932755147985945, + "grad_norm": 4.097476482391357, + "learning_rate": 4.624039995023688e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8506806045770645, + "num_tokens": 116102714.0, + "step": 96560 + }, + { + "entropy": 1.8671618595719337, + "epoch": 0.2993585506049092, + "grad_norm": 4.14247465133667, + "learning_rate": 4.623800572449704e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8386751100420952, + "num_tokens": 116114873.0, + "step": 96570 + }, + { + "entropy": 1.8730577558279038, + "epoch": 0.29938954972995885, + "grad_norm": 11.251909255981445, + "learning_rate": 4.623561187062184e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8561120167374611, + "num_tokens": 116126620.0, + "step": 96580 + }, + { + "entropy": 1.8773257568478585, + "epoch": 0.2994205488550086, + "grad_norm": 9.164704322814941, + "learning_rate": 4.623321838851505e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8373346477746964, + "num_tokens": 116137916.0, + "step": 96590 + }, + { + "entropy": 1.9126343131065369, + "epoch": 0.29945154798005824, + "grad_norm": 8.017783164978027, + "learning_rate": 4.623082527808043e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.841637770831585, + "num_tokens": 116149896.0, + "step": 96600 + }, + { + "entropy": 1.8838177442550659, + "epoch": 0.29948254710510797, + "grad_norm": 4.913340091705322, + "learning_rate": 4.622843253922182e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8604057088494301, + "num_tokens": 116162580.0, + "step": 96610 + }, + { + "entropy": 1.8277122244238853, + "epoch": 0.29951354623015763, + "grad_norm": 8.11374282836914, + "learning_rate": 4.622604017184304e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8429702982306481, + "num_tokens": 116175332.0, + "step": 96620 + }, + { + "entropy": 1.9389643400907517, + "epoch": 0.29954454535520736, + "grad_norm": 8.148730278015137, + "learning_rate": 4.622364817584801e-06, + "loss": 0.541, + "mean_token_accuracy": 0.8371641963720322, + "num_tokens": 116186424.0, + "step": 96630 + }, + { + "entropy": 1.9510870546102523, + "epoch": 0.29957554448025703, + "grad_norm": 6.960958957672119, + "learning_rate": 4.622125655114065e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.839006906747818, + "num_tokens": 116197799.0, + "step": 96640 + }, + { + "entropy": 1.9510822862386703, + "epoch": 0.29960654360530675, + "grad_norm": 8.637950897216797, + "learning_rate": 4.621886529762488e-06, + "loss": 0.5395, + "mean_token_accuracy": 0.8410506933927536, + "num_tokens": 116208985.0, + "step": 96650 + }, + { + "entropy": 1.868352036178112, + "epoch": 0.2996375427303564, + "grad_norm": 10.132604598999023, + "learning_rate": 4.621647441520475e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8492979109287262, + "num_tokens": 116221911.0, + "step": 96660 + }, + { + "entropy": 1.9646795377135278, + "epoch": 0.29966854185540615, + "grad_norm": 9.418262481689453, + "learning_rate": 4.621408390378424e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8449583351612091, + "num_tokens": 116232932.0, + "step": 96670 + }, + { + "entropy": 1.9498365700244904, + "epoch": 0.2996995409804558, + "grad_norm": 7.607828617095947, + "learning_rate": 4.621169376326742e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.8415804550051689, + "num_tokens": 116243576.0, + "step": 96680 + }, + { + "entropy": 1.862950399518013, + "epoch": 0.29973054010550554, + "grad_norm": 8.767817497253418, + "learning_rate": 4.620930399355841e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8488745912909508, + "num_tokens": 116256004.0, + "step": 96690 + }, + { + "entropy": 1.795606505870819, + "epoch": 0.2997615392305552, + "grad_norm": 4.170324325561523, + "learning_rate": 4.620691459456132e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8523398399353027, + "num_tokens": 116269297.0, + "step": 96700 + }, + { + "entropy": 1.8912088066339492, + "epoch": 0.29979253835560493, + "grad_norm": 7.616124153137207, + "learning_rate": 4.620452556618031e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8536741435527802, + "num_tokens": 116281401.0, + "step": 96710 + }, + { + "entropy": 1.9044504195451737, + "epoch": 0.2998235374806546, + "grad_norm": 3.9353320598602295, + "learning_rate": 4.6202136908319606e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8428740501403809, + "num_tokens": 116293236.0, + "step": 96720 + }, + { + "entropy": 1.831810677051544, + "epoch": 0.2998545366057043, + "grad_norm": 9.96127986907959, + "learning_rate": 4.6199748620883425e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.856151320040226, + "num_tokens": 116305330.0, + "step": 96730 + }, + { + "entropy": 1.915688943862915, + "epoch": 0.299885535730754, + "grad_norm": 8.897388458251953, + "learning_rate": 4.619736070377604e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8483431145548821, + "num_tokens": 116316018.0, + "step": 96740 + }, + { + "entropy": 1.8551687330007554, + "epoch": 0.2999165348558037, + "grad_norm": 3.1668596267700195, + "learning_rate": 4.619497315690176e-06, + "loss": 0.5315, + "mean_token_accuracy": 0.8362834542989731, + "num_tokens": 116328555.0, + "step": 96750 + }, + { + "entropy": 1.8770240753889085, + "epoch": 0.2999475339808534, + "grad_norm": 2.5352485179901123, + "learning_rate": 4.61925859801649e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8435857251286507, + "num_tokens": 116340178.0, + "step": 96760 + }, + { + "entropy": 1.9596908926963805, + "epoch": 0.2999785331059031, + "grad_norm": 8.359101295471191, + "learning_rate": 4.619019917346987e-06, + "loss": 0.5595, + "mean_token_accuracy": 0.8392258763313294, + "num_tokens": 116351512.0, + "step": 96770 + }, + { + "entropy": 1.7745133236050605, + "epoch": 0.3000095322309528, + "grad_norm": 6.980304718017578, + "learning_rate": 4.618781273672105e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8550699412822723, + "num_tokens": 116365493.0, + "step": 96780 + }, + { + "entropy": 1.8009425699710846, + "epoch": 0.3000405313560025, + "grad_norm": 9.106100082397461, + "learning_rate": 4.618542666982291e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8583745554089546, + "num_tokens": 116378953.0, + "step": 96790 + }, + { + "entropy": 1.8405057787895203, + "epoch": 0.3000715304810522, + "grad_norm": 6.269810199737549, + "learning_rate": 4.6183040972679905e-06, + "loss": 0.455, + "mean_token_accuracy": 0.853019006550312, + "num_tokens": 116391720.0, + "step": 96800 + }, + { + "entropy": 1.8243362039327622, + "epoch": 0.30010252960610184, + "grad_norm": 8.785292625427246, + "learning_rate": 4.618065564519655e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8519068777561187, + "num_tokens": 116404782.0, + "step": 96810 + }, + { + "entropy": 1.9162081688642503, + "epoch": 0.30013352873115157, + "grad_norm": 9.158159255981445, + "learning_rate": 4.617827068727739e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8442169070243836, + "num_tokens": 116415863.0, + "step": 96820 + }, + { + "entropy": 1.8956199631094932, + "epoch": 0.30016452785620124, + "grad_norm": 5.014525890350342, + "learning_rate": 4.617588609882702e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.8389386609196663, + "num_tokens": 116427960.0, + "step": 96830 + }, + { + "entropy": 1.890786738693714, + "epoch": 0.30019552698125096, + "grad_norm": 10.428237915039062, + "learning_rate": 4.617350187975004e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8487927287817001, + "num_tokens": 116439841.0, + "step": 96840 + }, + { + "entropy": 1.8615357890725135, + "epoch": 0.30022652610630063, + "grad_norm": 9.423101425170898, + "learning_rate": 4.617111802995109e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8472002789378166, + "num_tokens": 116451475.0, + "step": 96850 + }, + { + "entropy": 1.9217775925993918, + "epoch": 0.30025752523135035, + "grad_norm": 7.137108325958252, + "learning_rate": 4.616873454933489e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8377560988068581, + "num_tokens": 116462859.0, + "step": 96860 + }, + { + "entropy": 1.8776028633117676, + "epoch": 0.3002885243564, + "grad_norm": 4.520118236541748, + "learning_rate": 4.616635143780614e-06, + "loss": 0.5238, + "mean_token_accuracy": 0.8304355323314667, + "num_tokens": 116475060.0, + "step": 96870 + }, + { + "entropy": 1.8731580957770348, + "epoch": 0.30031952348144975, + "grad_norm": 7.3886942863464355, + "learning_rate": 4.616396869526958e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8490657389163971, + "num_tokens": 116487261.0, + "step": 96880 + }, + { + "entropy": 1.8840817973017692, + "epoch": 0.3003505226064994, + "grad_norm": 8.366750717163086, + "learning_rate": 4.616158632163e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.857377803325653, + "num_tokens": 116498999.0, + "step": 96890 + }, + { + "entropy": 1.8003995344042778, + "epoch": 0.30038152173154914, + "grad_norm": 2.8149616718292236, + "learning_rate": 4.615920431679226e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8655429676175117, + "num_tokens": 116512337.0, + "step": 96900 + }, + { + "entropy": 1.8423293381929398, + "epoch": 0.3004125208565988, + "grad_norm": 3.726378917694092, + "learning_rate": 4.615682268066116e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8466761097311973, + "num_tokens": 116525215.0, + "step": 96910 + }, + { + "entropy": 1.8393646717071532, + "epoch": 0.30044351998164853, + "grad_norm": 9.186988830566406, + "learning_rate": 4.615444141314163e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8458735197782516, + "num_tokens": 116537163.0, + "step": 96920 + }, + { + "entropy": 1.9296279013156892, + "epoch": 0.3004745191066982, + "grad_norm": 9.516865730285645, + "learning_rate": 4.615206051413857e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8410545736551285, + "num_tokens": 116548242.0, + "step": 96930 + }, + { + "entropy": 1.867045633494854, + "epoch": 0.3005055182317479, + "grad_norm": 7.079854965209961, + "learning_rate": 4.614967998355696e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8517505764961243, + "num_tokens": 116560243.0, + "step": 96940 + }, + { + "entropy": 1.9103495597839355, + "epoch": 0.3005365173567976, + "grad_norm": 5.358558177947998, + "learning_rate": 4.614729982130179e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.8406784102320671, + "num_tokens": 116571891.0, + "step": 96950 + }, + { + "entropy": 1.9171627387404442, + "epoch": 0.3005675164818473, + "grad_norm": 8.190808296203613, + "learning_rate": 4.614492002727808e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8321561366319656, + "num_tokens": 116583993.0, + "step": 96960 + }, + { + "entropy": 1.84826198220253, + "epoch": 0.300598515606897, + "grad_norm": 9.064087867736816, + "learning_rate": 4.61425406013909e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8504088371992111, + "num_tokens": 116596878.0, + "step": 96970 + }, + { + "entropy": 1.941984808444977, + "epoch": 0.3006295147319467, + "grad_norm": 4.682338237762451, + "learning_rate": 4.614016154354533e-06, + "loss": 0.5337, + "mean_token_accuracy": 0.8427264273166657, + "num_tokens": 116608036.0, + "step": 96980 + }, + { + "entropy": 1.9229829460382462, + "epoch": 0.3006605138569964, + "grad_norm": 7.0781097412109375, + "learning_rate": 4.6137782853646524e-06, + "loss": 0.5264, + "mean_token_accuracy": 0.839740289747715, + "num_tokens": 116619904.0, + "step": 96990 + }, + { + "entropy": 1.8459256619215012, + "epoch": 0.3006915129820461, + "grad_norm": 7.523281097412109, + "learning_rate": 4.613540453159963e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8505584686994553, + "num_tokens": 116632261.0, + "step": 97000 + }, + { + "entropy": 1.8035550713539124, + "epoch": 0.3007225121070958, + "grad_norm": 5.279726982116699, + "learning_rate": 4.613302657730985e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.854595598578453, + "num_tokens": 116645315.0, + "step": 97010 + }, + { + "entropy": 1.907500149309635, + "epoch": 0.3007535112321455, + "grad_norm": 3.997288227081299, + "learning_rate": 4.613064899068243e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.8380198463797569, + "num_tokens": 116657205.0, + "step": 97020 + }, + { + "entropy": 1.815874347090721, + "epoch": 0.30078451035719517, + "grad_norm": 3.8312273025512695, + "learning_rate": 4.612827177162262e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8552998825907707, + "num_tokens": 116669443.0, + "step": 97030 + }, + { + "entropy": 1.93814327865839, + "epoch": 0.30081550948224484, + "grad_norm": 9.422027587890625, + "learning_rate": 4.612589492003573e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.840138903260231, + "num_tokens": 116681521.0, + "step": 97040 + }, + { + "entropy": 1.9181523829698564, + "epoch": 0.30084650860729456, + "grad_norm": 7.659515380859375, + "learning_rate": 4.6123518435827095e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8537562400102615, + "num_tokens": 116692024.0, + "step": 97050 + }, + { + "entropy": 1.8777270019054413, + "epoch": 0.30087750773234423, + "grad_norm": 4.077333450317383, + "learning_rate": 4.612114231890209e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8504292860627174, + "num_tokens": 116703672.0, + "step": 97060 + }, + { + "entropy": 1.8734031990170479, + "epoch": 0.30090850685739395, + "grad_norm": 8.679883003234863, + "learning_rate": 4.61187665691661e-06, + "loss": 0.5151, + "mean_token_accuracy": 0.8424584448337555, + "num_tokens": 116715061.0, + "step": 97070 + }, + { + "entropy": 1.919617336988449, + "epoch": 0.3009395059824436, + "grad_norm": 9.382501602172852, + "learning_rate": 4.611639118652459e-06, + "loss": 0.5369, + "mean_token_accuracy": 0.8328076407313347, + "num_tokens": 116726486.0, + "step": 97080 + }, + { + "entropy": 1.890953540802002, + "epoch": 0.30097050510749335, + "grad_norm": 10.713085174560547, + "learning_rate": 4.611401617088301e-06, + "loss": 0.563, + "mean_token_accuracy": 0.8449400596320629, + "num_tokens": 116738906.0, + "step": 97090 + }, + { + "entropy": 1.8357614412903787, + "epoch": 0.301001504232543, + "grad_norm": 8.783710479736328, + "learning_rate": 4.611164152214689e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8468549698591232, + "num_tokens": 116751196.0, + "step": 97100 + }, + { + "entropy": 1.9422936275601388, + "epoch": 0.30103250335759274, + "grad_norm": 8.501259803771973, + "learning_rate": 4.6109267240221755e-06, + "loss": 0.527, + "mean_token_accuracy": 0.8422497108578682, + "num_tokens": 116763810.0, + "step": 97110 + }, + { + "entropy": 1.9155998945236206, + "epoch": 0.3010635024826424, + "grad_norm": 7.944989204406738, + "learning_rate": 4.610689332501317e-06, + "loss": 0.5321, + "mean_token_accuracy": 0.823591648042202, + "num_tokens": 116776024.0, + "step": 97120 + }, + { + "entropy": 1.8754037857055663, + "epoch": 0.30109450160769213, + "grad_norm": 7.605261325836182, + "learning_rate": 4.610451977642677e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8444375693798065, + "num_tokens": 116787890.0, + "step": 97130 + }, + { + "entropy": 1.8820706829428673, + "epoch": 0.3011255007327418, + "grad_norm": 4.352447509765625, + "learning_rate": 4.610214659436818e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8387032672762871, + "num_tokens": 116799997.0, + "step": 97140 + }, + { + "entropy": 1.8734632670879363, + "epoch": 0.3011564998577915, + "grad_norm": 7.570825576782227, + "learning_rate": 4.609977377874307e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8414079815149307, + "num_tokens": 116812400.0, + "step": 97150 + }, + { + "entropy": 1.9303068399429322, + "epoch": 0.3011874989828412, + "grad_norm": 3.602426528930664, + "learning_rate": 4.609740132945716e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8503539338707924, + "num_tokens": 116825076.0, + "step": 97160 + }, + { + "entropy": 1.8318970784544946, + "epoch": 0.3012184981078909, + "grad_norm": 9.406216621398926, + "learning_rate": 4.609502924641619e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8480072125792504, + "num_tokens": 116837724.0, + "step": 97170 + }, + { + "entropy": 1.9633089289069177, + "epoch": 0.3012494972329406, + "grad_norm": 8.81785774230957, + "learning_rate": 4.609265752952596e-06, + "loss": 0.543, + "mean_token_accuracy": 0.8268476709723472, + "num_tokens": 116849107.0, + "step": 97180 + }, + { + "entropy": 1.8834416687488555, + "epoch": 0.3012804963579903, + "grad_norm": 4.704411506652832, + "learning_rate": 4.609028617869224e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8506078109145164, + "num_tokens": 116861833.0, + "step": 97190 + }, + { + "entropy": 1.7407511696219444, + "epoch": 0.30131149548304, + "grad_norm": 2.5168025493621826, + "learning_rate": 4.6087915193820916e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8639174044132233, + "num_tokens": 116876452.0, + "step": 97200 + }, + { + "entropy": 1.8649092674255372, + "epoch": 0.3013424946080897, + "grad_norm": 3.976242780685425, + "learning_rate": 4.608554457481785e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8543577790260315, + "num_tokens": 116888349.0, + "step": 97210 + }, + { + "entropy": 1.850515154004097, + "epoch": 0.3013734937331394, + "grad_norm": 4.014977931976318, + "learning_rate": 4.608317432158896e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8551569744944573, + "num_tokens": 116900337.0, + "step": 97220 + }, + { + "entropy": 1.904152835905552, + "epoch": 0.3014044928581891, + "grad_norm": 7.234251022338867, + "learning_rate": 4.60808044340402e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8568873882293702, + "num_tokens": 116912573.0, + "step": 97230 + }, + { + "entropy": 1.8178890123963356, + "epoch": 0.30143549198323877, + "grad_norm": 2.6580164432525635, + "learning_rate": 4.607843491207752e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8546398520469666, + "num_tokens": 116924977.0, + "step": 97240 + }, + { + "entropy": 1.9117593422532082, + "epoch": 0.3014664911082885, + "grad_norm": 13.091536521911621, + "learning_rate": 4.607606575560697e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8411953374743462, + "num_tokens": 116936606.0, + "step": 97250 + }, + { + "entropy": 1.9153368800878525, + "epoch": 0.30149749023333816, + "grad_norm": 10.279766082763672, + "learning_rate": 4.607369696453461e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.8421198353171349, + "num_tokens": 116948020.0, + "step": 97260 + }, + { + "entropy": 1.892761492729187, + "epoch": 0.3015284893583879, + "grad_norm": 8.145535469055176, + "learning_rate": 4.6071328538766486e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8441890180110931, + "num_tokens": 116959806.0, + "step": 97270 + }, + { + "entropy": 1.9218065708875656, + "epoch": 0.30155948848343755, + "grad_norm": 7.536378860473633, + "learning_rate": 4.606896047820874e-06, + "loss": 0.5161, + "mean_token_accuracy": 0.8465237602591514, + "num_tokens": 116970254.0, + "step": 97280 + }, + { + "entropy": 1.8478126615285873, + "epoch": 0.3015904876084872, + "grad_norm": 8.085580825805664, + "learning_rate": 4.60665927827675e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8454589918255806, + "num_tokens": 116981953.0, + "step": 97290 + }, + { + "entropy": 1.7839539676904679, + "epoch": 0.30162148673353695, + "grad_norm": 4.7135491371154785, + "learning_rate": 4.606422545234899e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8522520795464515, + "num_tokens": 116994789.0, + "step": 97300 + }, + { + "entropy": 1.7900249511003494, + "epoch": 0.3016524858585866, + "grad_norm": 12.013949394226074, + "learning_rate": 4.606185848685939e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8632920622825623, + "num_tokens": 117007458.0, + "step": 97310 + }, + { + "entropy": 1.8314153790473937, + "epoch": 0.30168348498363634, + "grad_norm": 8.588581085205078, + "learning_rate": 4.605949188620496e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8470736399292946, + "num_tokens": 117019927.0, + "step": 97320 + }, + { + "entropy": 1.8574935659766196, + "epoch": 0.301714484108686, + "grad_norm": 8.47958755493164, + "learning_rate": 4.6057125650292e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8512878254055977, + "num_tokens": 117031729.0, + "step": 97330 + }, + { + "entropy": 1.9198235914111137, + "epoch": 0.30174548323373573, + "grad_norm": 10.255192756652832, + "learning_rate": 4.605475977902682e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8471052810549736, + "num_tokens": 117042862.0, + "step": 97340 + }, + { + "entropy": 1.8835196584463119, + "epoch": 0.3017764823587854, + "grad_norm": 9.062085151672363, + "learning_rate": 4.605239427231577e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.8494998052716255, + "num_tokens": 117054126.0, + "step": 97350 + }, + { + "entropy": 1.9018598824739457, + "epoch": 0.3018074814838351, + "grad_norm": 8.304078102111816, + "learning_rate": 4.6050029130065245e-06, + "loss": 0.5274, + "mean_token_accuracy": 0.8413213551044464, + "num_tokens": 117065364.0, + "step": 97360 + }, + { + "entropy": 1.8747293829917908, + "epoch": 0.3018384806088848, + "grad_norm": 8.438359260559082, + "learning_rate": 4.604766435218166e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8443131268024444, + "num_tokens": 117076938.0, + "step": 97370 + }, + { + "entropy": 1.8413864582777024, + "epoch": 0.3018694797339345, + "grad_norm": 8.747175216674805, + "learning_rate": 4.604529993857147e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8414786517620086, + "num_tokens": 117089574.0, + "step": 97380 + }, + { + "entropy": 1.9370706051588058, + "epoch": 0.3019004788589842, + "grad_norm": 7.200348854064941, + "learning_rate": 4.604293588914116e-06, + "loss": 0.5174, + "mean_token_accuracy": 0.8398382663726807, + "num_tokens": 117100625.0, + "step": 97390 + }, + { + "entropy": 1.850702230632305, + "epoch": 0.3019314779840339, + "grad_norm": 8.624340057373047, + "learning_rate": 4.604057220379726e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8501965671777725, + "num_tokens": 117113376.0, + "step": 97400 + }, + { + "entropy": 1.8446269080042839, + "epoch": 0.3019624771090836, + "grad_norm": 3.699392318725586, + "learning_rate": 4.603820888244632e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8584934890270233, + "num_tokens": 117126682.0, + "step": 97410 + }, + { + "entropy": 1.943909691274166, + "epoch": 0.3019934762341333, + "grad_norm": 3.7331109046936035, + "learning_rate": 4.603584592499492e-06, + "loss": 0.569, + "mean_token_accuracy": 0.8290564298629761, + "num_tokens": 117138961.0, + "step": 97420 + }, + { + "entropy": 1.8698807314038277, + "epoch": 0.302024475359183, + "grad_norm": 8.145543098449707, + "learning_rate": 4.603348333134969e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8512676984071732, + "num_tokens": 117150754.0, + "step": 97430 + }, + { + "entropy": 1.9144811987876893, + "epoch": 0.3020554744842327, + "grad_norm": 9.019363403320312, + "learning_rate": 4.60311211014173e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8448151677846909, + "num_tokens": 117162759.0, + "step": 97440 + }, + { + "entropy": 1.8737313896417618, + "epoch": 0.30208647360928237, + "grad_norm": 7.63023567199707, + "learning_rate": 4.602875923510441e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8467231750488281, + "num_tokens": 117174665.0, + "step": 97450 + }, + { + "entropy": 1.8207977265119553, + "epoch": 0.3021174727343321, + "grad_norm": 8.674139976501465, + "learning_rate": 4.602639773231776e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8548212826251984, + "num_tokens": 117187809.0, + "step": 97460 + }, + { + "entropy": 1.8152425453066825, + "epoch": 0.30214847185938176, + "grad_norm": 5.675132751464844, + "learning_rate": 4.602403659296411e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8367105752229691, + "num_tokens": 117201085.0, + "step": 97470 + }, + { + "entropy": 1.857864636182785, + "epoch": 0.3021794709844315, + "grad_norm": 8.102805137634277, + "learning_rate": 4.602167581695023e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8479077383875847, + "num_tokens": 117213732.0, + "step": 97480 + }, + { + "entropy": 1.925594538450241, + "epoch": 0.30221047010948116, + "grad_norm": 7.970809459686279, + "learning_rate": 4.601931540418297e-06, + "loss": 0.523, + "mean_token_accuracy": 0.8393017098307609, + "num_tokens": 117224935.0, + "step": 97490 + }, + { + "entropy": 1.86695496737957, + "epoch": 0.3022414692345309, + "grad_norm": 9.16865348815918, + "learning_rate": 4.601695535456917e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8479771926999092, + "num_tokens": 117236716.0, + "step": 97500 + }, + { + "entropy": 1.9163902431726456, + "epoch": 0.30227246835958055, + "grad_norm": 8.983051300048828, + "learning_rate": 4.601459566801571e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8492978289723396, + "num_tokens": 117247883.0, + "step": 97510 + }, + { + "entropy": 1.8202176943421364, + "epoch": 0.3023034674846303, + "grad_norm": 7.500359535217285, + "learning_rate": 4.601223634442954e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8624223947525025, + "num_tokens": 117261178.0, + "step": 97520 + }, + { + "entropy": 1.9173918321728707, + "epoch": 0.30233446660967994, + "grad_norm": 8.454766273498535, + "learning_rate": 4.600987738371759e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8437489971518517, + "num_tokens": 117273426.0, + "step": 97530 + }, + { + "entropy": 1.77590219527483, + "epoch": 0.3023654657347296, + "grad_norm": 8.413315773010254, + "learning_rate": 4.600751878578687e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8557587161660194, + "num_tokens": 117286603.0, + "step": 97540 + }, + { + "entropy": 1.9176592335104943, + "epoch": 0.30239646485977933, + "grad_norm": 10.7260103225708, + "learning_rate": 4.600516055054439e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.8402794227004051, + "num_tokens": 117297972.0, + "step": 97550 + }, + { + "entropy": 1.8973240569233893, + "epoch": 0.302427463984829, + "grad_norm": 4.1420087814331055, + "learning_rate": 4.600280267789722e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8584436431527138, + "num_tokens": 117309456.0, + "step": 97560 + }, + { + "entropy": 1.9296040430665016, + "epoch": 0.30245846310987873, + "grad_norm": 8.410199165344238, + "learning_rate": 4.600044516775245e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.8443345412611961, + "num_tokens": 117321272.0, + "step": 97570 + }, + { + "entropy": 1.9159424543380736, + "epoch": 0.3024894622349284, + "grad_norm": 8.414966583251953, + "learning_rate": 4.5998088020017186e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8412731990218163, + "num_tokens": 117333250.0, + "step": 97580 + }, + { + "entropy": 1.7769809067249298, + "epoch": 0.3025204613599781, + "grad_norm": 3.658212661743164, + "learning_rate": 4.599573123459859e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8634654805064201, + "num_tokens": 117346443.0, + "step": 97590 + }, + { + "entropy": 1.8901727512478828, + "epoch": 0.3025514604850278, + "grad_norm": 9.50108814239502, + "learning_rate": 4.599337481140387e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8505553930997849, + "num_tokens": 117358066.0, + "step": 97600 + }, + { + "entropy": 1.8625748187303544, + "epoch": 0.3025824596100775, + "grad_norm": 8.508552551269531, + "learning_rate": 4.5991018750340235e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.849023912847042, + "num_tokens": 117369805.0, + "step": 97610 + }, + { + "entropy": 1.9069266065955162, + "epoch": 0.3026134587351272, + "grad_norm": 8.270326614379883, + "learning_rate": 4.5988663051314944e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8539838179945946, + "num_tokens": 117381262.0, + "step": 97620 + }, + { + "entropy": 1.9128611549735068, + "epoch": 0.3026444578601769, + "grad_norm": 8.971906661987305, + "learning_rate": 4.5986307714235286e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.832034258544445, + "num_tokens": 117393179.0, + "step": 97630 + }, + { + "entropy": 1.8143539875745773, + "epoch": 0.3026754569852266, + "grad_norm": 7.190959453582764, + "learning_rate": 4.5983952739008585e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8518998950719834, + "num_tokens": 117406161.0, + "step": 97640 + }, + { + "entropy": 1.8843399420380593, + "epoch": 0.3027064561102763, + "grad_norm": 9.908082008361816, + "learning_rate": 4.598159812554219e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8463353976607323, + "num_tokens": 117418576.0, + "step": 97650 + }, + { + "entropy": 1.8831077113747596, + "epoch": 0.30273745523532597, + "grad_norm": 9.612486839294434, + "learning_rate": 4.597924387374351e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8505209341645241, + "num_tokens": 117430122.0, + "step": 97660 + }, + { + "entropy": 1.8462072387337685, + "epoch": 0.3027684543603757, + "grad_norm": 4.665594100952148, + "learning_rate": 4.597688998351995e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8621034786105156, + "num_tokens": 117442896.0, + "step": 97670 + }, + { + "entropy": 1.9408204436302186, + "epoch": 0.30279945348542536, + "grad_norm": 7.827560901641846, + "learning_rate": 4.597453645477898e-06, + "loss": 0.5212, + "mean_token_accuracy": 0.8394302636384964, + "num_tokens": 117453977.0, + "step": 97680 + }, + { + "entropy": 1.8475888326764107, + "epoch": 0.3028304526104751, + "grad_norm": 8.166518211364746, + "learning_rate": 4.597218328742807e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8580135375261306, + "num_tokens": 117466325.0, + "step": 97690 + }, + { + "entropy": 1.8173963025212287, + "epoch": 0.30286145173552476, + "grad_norm": 4.001589775085449, + "learning_rate": 4.596983048137475e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8486342668533325, + "num_tokens": 117478782.0, + "step": 97700 + }, + { + "entropy": 1.8551449865102767, + "epoch": 0.3028924508605745, + "grad_norm": 8.81025218963623, + "learning_rate": 4.596747803652658e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8483422353863717, + "num_tokens": 117491302.0, + "step": 97710 + }, + { + "entropy": 1.8700275629758836, + "epoch": 0.30292344998562415, + "grad_norm": 3.273538112640381, + "learning_rate": 4.596512595279115e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8363815456628799, + "num_tokens": 117504377.0, + "step": 97720 + }, + { + "entropy": 1.9091096714138984, + "epoch": 0.3029544491106739, + "grad_norm": 4.058509349822998, + "learning_rate": 4.5962774230076075e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8540859907865525, + "num_tokens": 117515754.0, + "step": 97730 + }, + { + "entropy": 1.979654061794281, + "epoch": 0.30298544823572354, + "grad_norm": 8.739022254943848, + "learning_rate": 4.596042286828902e-06, + "loss": 0.5324, + "mean_token_accuracy": 0.8428146079182625, + "num_tokens": 117526416.0, + "step": 97740 + }, + { + "entropy": 1.9291452512145042, + "epoch": 0.30301644736077327, + "grad_norm": 8.007489204406738, + "learning_rate": 4.595807186733767e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8436457946896553, + "num_tokens": 117538034.0, + "step": 97750 + }, + { + "entropy": 1.8014676854014398, + "epoch": 0.30304744648582294, + "grad_norm": 9.48009967803955, + "learning_rate": 4.595572122712974e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8604965686798096, + "num_tokens": 117551372.0, + "step": 97760 + }, + { + "entropy": 1.8575839295983314, + "epoch": 0.30307844561087266, + "grad_norm": 2.4057457447052, + "learning_rate": 4.595337094757297e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8554554954171181, + "num_tokens": 117563424.0, + "step": 97770 + }, + { + "entropy": 1.8977721393108369, + "epoch": 0.30310944473592233, + "grad_norm": 9.092135429382324, + "learning_rate": 4.595102102857518e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8410274058580398, + "num_tokens": 117575055.0, + "step": 97780 + }, + { + "entropy": 1.89531751871109, + "epoch": 0.303140443860972, + "grad_norm": 8.363005638122559, + "learning_rate": 4.594867147004416e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8437793105840683, + "num_tokens": 117586328.0, + "step": 97790 + }, + { + "entropy": 1.8966928854584695, + "epoch": 0.3031714429860217, + "grad_norm": 8.183049201965332, + "learning_rate": 4.594632227188778e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8496061801910401, + "num_tokens": 117598053.0, + "step": 97800 + }, + { + "entropy": 1.7953755557537079, + "epoch": 0.3032024421110714, + "grad_norm": 7.209178924560547, + "learning_rate": 4.594397343401393e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8673543766140938, + "num_tokens": 117610778.0, + "step": 97810 + }, + { + "entropy": 1.8411307245492936, + "epoch": 0.3032334412361211, + "grad_norm": 4.28125, + "learning_rate": 4.59416249563305e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8453304886817932, + "num_tokens": 117623608.0, + "step": 97820 + }, + { + "entropy": 1.849568995833397, + "epoch": 0.3032644403611708, + "grad_norm": 7.9680023193359375, + "learning_rate": 4.593927683874549e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8446632474660873, + "num_tokens": 117635895.0, + "step": 97830 + }, + { + "entropy": 1.9411183446645737, + "epoch": 0.3032954394862205, + "grad_norm": 8.592232704162598, + "learning_rate": 4.593692908116683e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8454527780413628, + "num_tokens": 117647042.0, + "step": 97840 + }, + { + "entropy": 1.9133759438991547, + "epoch": 0.3033264386112702, + "grad_norm": 8.33125114440918, + "learning_rate": 4.593458168350257e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8406713515520096, + "num_tokens": 117658493.0, + "step": 97850 + }, + { + "entropy": 1.9860854953527451, + "epoch": 0.3033574377363199, + "grad_norm": 8.772767066955566, + "learning_rate": 4.593223464566075e-06, + "loss": 0.5346, + "mean_token_accuracy": 0.836230854690075, + "num_tokens": 117669208.0, + "step": 97860 + }, + { + "entropy": 1.8566246941685676, + "epoch": 0.30338843686136957, + "grad_norm": 2.640052556991577, + "learning_rate": 4.592988796754947e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8560033723711967, + "num_tokens": 117681693.0, + "step": 97870 + }, + { + "entropy": 1.90928722769022, + "epoch": 0.3034194359864193, + "grad_norm": 7.907132148742676, + "learning_rate": 4.592754164907683e-06, + "loss": 0.5258, + "mean_token_accuracy": 0.8417934641242028, + "num_tokens": 117693406.0, + "step": 97880 + }, + { + "entropy": 1.9087349817156791, + "epoch": 0.30345043511146896, + "grad_norm": 11.36373519897461, + "learning_rate": 4.592519569015098e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8501476049423218, + "num_tokens": 117704501.0, + "step": 97890 + }, + { + "entropy": 1.8858125910162926, + "epoch": 0.3034814342365187, + "grad_norm": 7.878525733947754, + "learning_rate": 4.592285009068011e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8410215243697167, + "num_tokens": 117716498.0, + "step": 97900 + }, + { + "entropy": 1.9222358748316766, + "epoch": 0.30351243336156836, + "grad_norm": 9.170816421508789, + "learning_rate": 4.592050485057241e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8559870198369026, + "num_tokens": 117727660.0, + "step": 97910 + }, + { + "entropy": 1.9125974863767623, + "epoch": 0.3035434324866181, + "grad_norm": 8.71693229675293, + "learning_rate": 4.591815996973617e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8463147193193435, + "num_tokens": 117739528.0, + "step": 97920 + }, + { + "entropy": 1.8418530434370042, + "epoch": 0.30357443161166775, + "grad_norm": 4.724992752075195, + "learning_rate": 4.591581544807964e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.849432897567749, + "num_tokens": 117752050.0, + "step": 97930 + }, + { + "entropy": 1.8719761818647385, + "epoch": 0.3036054307367175, + "grad_norm": 8.673355102539062, + "learning_rate": 4.591347128551114e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8506433501839638, + "num_tokens": 117763777.0, + "step": 97940 + }, + { + "entropy": 1.834936611354351, + "epoch": 0.30363642986176714, + "grad_norm": 4.6238603591918945, + "learning_rate": 4.5911127481939e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8557968467473984, + "num_tokens": 117776099.0, + "step": 97950 + }, + { + "entropy": 1.7863100260496139, + "epoch": 0.30366742898681687, + "grad_norm": 4.946314811706543, + "learning_rate": 4.590878403727164e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8499247878789902, + "num_tokens": 117789509.0, + "step": 97960 + }, + { + "entropy": 1.915014560520649, + "epoch": 0.30369842811186654, + "grad_norm": 8.316082000732422, + "learning_rate": 4.5906440951417435e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8483388528227807, + "num_tokens": 117800926.0, + "step": 97970 + }, + { + "entropy": 1.9649482190608978, + "epoch": 0.30372942723691626, + "grad_norm": 10.604179382324219, + "learning_rate": 4.590409822428485e-06, + "loss": 0.524, + "mean_token_accuracy": 0.8398748651146889, + "num_tokens": 117811622.0, + "step": 97980 + }, + { + "entropy": 1.7894397795200347, + "epoch": 0.30376042636196593, + "grad_norm": 6.535918712615967, + "learning_rate": 4.590175585578233e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8634741649031639, + "num_tokens": 117824809.0, + "step": 97990 + }, + { + "entropy": 1.8452475354075433, + "epoch": 0.30379142548701565, + "grad_norm": 9.069293975830078, + "learning_rate": 4.589941384581842e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8545988813042641, + "num_tokens": 117837581.0, + "step": 98000 + }, + { + "entropy": 1.9193524435162543, + "epoch": 0.3038224246120653, + "grad_norm": 9.2352933883667, + "learning_rate": 4.589707219430166e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8450753495097161, + "num_tokens": 117849268.0, + "step": 98010 + }, + { + "entropy": 1.8406575858592986, + "epoch": 0.30385342373711505, + "grad_norm": 8.772045135498047, + "learning_rate": 4.589473090114059e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8576031014323234, + "num_tokens": 117861975.0, + "step": 98020 + }, + { + "entropy": 1.8483123630285263, + "epoch": 0.3038844228621647, + "grad_norm": 9.5717191696167, + "learning_rate": 4.5892389966243866e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8474526911973953, + "num_tokens": 117874143.0, + "step": 98030 + }, + { + "entropy": 1.9467709794640542, + "epoch": 0.3039154219872144, + "grad_norm": 8.297847747802734, + "learning_rate": 4.589004938952009e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8526682198047638, + "num_tokens": 117885322.0, + "step": 98040 + }, + { + "entropy": 1.859838457405567, + "epoch": 0.3039464211122641, + "grad_norm": 8.134725570678711, + "learning_rate": 4.588770917087794e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8431189700961113, + "num_tokens": 117897060.0, + "step": 98050 + }, + { + "entropy": 1.8738309249281884, + "epoch": 0.3039774202373138, + "grad_norm": 4.704370021820068, + "learning_rate": 4.5885369310226145e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8435174241662026, + "num_tokens": 117909430.0, + "step": 98060 + }, + { + "entropy": 1.8503778785467149, + "epoch": 0.3040084193623635, + "grad_norm": 3.1746442317962646, + "learning_rate": 4.588302980747341e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8513809859752655, + "num_tokens": 117920805.0, + "step": 98070 + }, + { + "entropy": 1.8909773424267768, + "epoch": 0.30403941848741317, + "grad_norm": 8.956877708435059, + "learning_rate": 4.588069066252854e-06, + "loss": 0.5191, + "mean_token_accuracy": 0.8422663122415542, + "num_tokens": 117932448.0, + "step": 98080 + }, + { + "entropy": 1.8859625205397605, + "epoch": 0.3040704176124629, + "grad_norm": 6.800996780395508, + "learning_rate": 4.587835187530031e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8531765311956405, + "num_tokens": 117944171.0, + "step": 98090 + }, + { + "entropy": 1.9186548352241517, + "epoch": 0.30410141673751256, + "grad_norm": 8.095877647399902, + "learning_rate": 4.587601344569756e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8412599250674248, + "num_tokens": 117956478.0, + "step": 98100 + }, + { + "entropy": 1.8034547924995423, + "epoch": 0.3041324158625623, + "grad_norm": 4.433010578155518, + "learning_rate": 4.587367537362918e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8495987445116043, + "num_tokens": 117969524.0, + "step": 98110 + }, + { + "entropy": 1.8625740513205529, + "epoch": 0.30416341498761196, + "grad_norm": 8.75419807434082, + "learning_rate": 4.587133765900404e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8563158795237541, + "num_tokens": 117982012.0, + "step": 98120 + }, + { + "entropy": 1.8077227592468261, + "epoch": 0.3041944141126617, + "grad_norm": 8.769314765930176, + "learning_rate": 4.586900030173109e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8591718584299087, + "num_tokens": 117994852.0, + "step": 98130 + }, + { + "entropy": 1.8287582576274872, + "epoch": 0.30422541323771135, + "grad_norm": 5.158862590789795, + "learning_rate": 4.58666633017193e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8468107968568802, + "num_tokens": 118007827.0, + "step": 98140 + }, + { + "entropy": 1.917576177418232, + "epoch": 0.3042564123627611, + "grad_norm": 10.31728458404541, + "learning_rate": 4.586432665887766e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.8305649489164353, + "num_tokens": 118018948.0, + "step": 98150 + }, + { + "entropy": 1.9343233779072762, + "epoch": 0.30428741148781074, + "grad_norm": 7.640726089477539, + "learning_rate": 4.586199037311519e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.849469755589962, + "num_tokens": 118030240.0, + "step": 98160 + }, + { + "entropy": 1.874649804830551, + "epoch": 0.30431841061286047, + "grad_norm": 7.788882255554199, + "learning_rate": 4.585965444434098e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8480836063623428, + "num_tokens": 118042287.0, + "step": 98170 + }, + { + "entropy": 1.894361424446106, + "epoch": 0.30434940973791014, + "grad_norm": 5.997152328491211, + "learning_rate": 4.58573188724641e-06, + "loss": 0.533, + "mean_token_accuracy": 0.8430122479796409, + "num_tokens": 118054246.0, + "step": 98180 + }, + { + "entropy": 1.8837274074554444, + "epoch": 0.30438040886295986, + "grad_norm": 8.705531120300293, + "learning_rate": 4.585498365739368e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8436176598072052, + "num_tokens": 118066667.0, + "step": 98190 + }, + { + "entropy": 1.905727145075798, + "epoch": 0.30441140798800953, + "grad_norm": 8.105955123901367, + "learning_rate": 4.585264879903889e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8468491420149803, + "num_tokens": 118078925.0, + "step": 98200 + }, + { + "entropy": 1.9071518421173095, + "epoch": 0.30444240711305925, + "grad_norm": 8.408157348632812, + "learning_rate": 4.585031429730893e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8430832877755166, + "num_tokens": 118090390.0, + "step": 98210 + }, + { + "entropy": 1.8832137271761895, + "epoch": 0.3044734062381089, + "grad_norm": 4.172749042510986, + "learning_rate": 4.5847980152113015e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8408018171787262, + "num_tokens": 118103271.0, + "step": 98220 + }, + { + "entropy": 1.8779758632183075, + "epoch": 0.30450440536315865, + "grad_norm": 6.640566825866699, + "learning_rate": 4.584564636336039e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8598590075969696, + "num_tokens": 118115121.0, + "step": 98230 + }, + { + "entropy": 1.8136912897229194, + "epoch": 0.3045354044882083, + "grad_norm": 9.391630172729492, + "learning_rate": 4.584331293096037e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8552101209759713, + "num_tokens": 118127803.0, + "step": 98240 + }, + { + "entropy": 1.7988118350505828, + "epoch": 0.30456640361325804, + "grad_norm": 3.3279430866241455, + "learning_rate": 4.584097985482225e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8751557558774948, + "num_tokens": 118140663.0, + "step": 98250 + }, + { + "entropy": 1.8820227935910225, + "epoch": 0.3045974027383077, + "grad_norm": 12.045969009399414, + "learning_rate": 4.583864713485541e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8497835651040078, + "num_tokens": 118152695.0, + "step": 98260 + }, + { + "entropy": 1.8159508243203164, + "epoch": 0.30462840186335743, + "grad_norm": 5.319043159484863, + "learning_rate": 4.583631477096921e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8409684166312218, + "num_tokens": 118165387.0, + "step": 98270 + }, + { + "entropy": 1.8920594125986099, + "epoch": 0.3046594009884071, + "grad_norm": 8.387589454650879, + "learning_rate": 4.583398276307309e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8461384385824203, + "num_tokens": 118177881.0, + "step": 98280 + }, + { + "entropy": 1.8834513157606125, + "epoch": 0.30469040011345677, + "grad_norm": 7.704468250274658, + "learning_rate": 4.58316511110765e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.84387187063694, + "num_tokens": 118189579.0, + "step": 98290 + }, + { + "entropy": 1.9479789227247237, + "epoch": 0.3047213992385065, + "grad_norm": 9.058480262756348, + "learning_rate": 4.582931981488891e-06, + "loss": 0.5274, + "mean_token_accuracy": 0.822111751139164, + "num_tokens": 118201583.0, + "step": 98300 + }, + { + "entropy": 1.914680229127407, + "epoch": 0.30475239836355617, + "grad_norm": 9.425095558166504, + "learning_rate": 4.582698887441983e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8463716924190521, + "num_tokens": 118212830.0, + "step": 98310 + }, + { + "entropy": 1.9096634030342101, + "epoch": 0.3047833974886059, + "grad_norm": 4.8716912269592285, + "learning_rate": 4.582465828957883e-06, + "loss": 0.5376, + "mean_token_accuracy": 0.8446076348423958, + "num_tokens": 118223765.0, + "step": 98320 + }, + { + "entropy": 1.8158196270465852, + "epoch": 0.30481439661365556, + "grad_norm": 7.788163185119629, + "learning_rate": 4.582232806027548e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8658209949731827, + "num_tokens": 118236493.0, + "step": 98330 + }, + { + "entropy": 1.8198370561003685, + "epoch": 0.3048453957387053, + "grad_norm": 3.936845541000366, + "learning_rate": 4.581999818641939e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8498961672186851, + "num_tokens": 118248905.0, + "step": 98340 + }, + { + "entropy": 1.8224836379289626, + "epoch": 0.30487639486375495, + "grad_norm": 2.4840080738067627, + "learning_rate": 4.5817668667920205e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8523753926157951, + "num_tokens": 118262429.0, + "step": 98350 + }, + { + "entropy": 1.776771107316017, + "epoch": 0.3049073939888047, + "grad_norm": 7.824039936065674, + "learning_rate": 4.58153395046876e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8675816237926484, + "num_tokens": 118275691.0, + "step": 98360 + }, + { + "entropy": 1.8841424211859703, + "epoch": 0.30493839311385434, + "grad_norm": 7.849323749542236, + "learning_rate": 4.581301069663129e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8445948898792267, + "num_tokens": 118288240.0, + "step": 98370 + }, + { + "entropy": 1.8760451026260854, + "epoch": 0.30496939223890407, + "grad_norm": 8.8077974319458, + "learning_rate": 4.5810682243661e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8469317957758904, + "num_tokens": 118301110.0, + "step": 98380 + }, + { + "entropy": 1.8852962955832482, + "epoch": 0.30500039136395374, + "grad_norm": 8.56628704071045, + "learning_rate": 4.580835414568652e-06, + "loss": 0.476, + "mean_token_accuracy": 0.849009807407856, + "num_tokens": 118312570.0, + "step": 98390 + }, + { + "entropy": 1.8616637766361237, + "epoch": 0.30503139048900346, + "grad_norm": 4.301907539367676, + "learning_rate": 4.580602640261765e-06, + "loss": 0.44, + "mean_token_accuracy": 0.852383928000927, + "num_tokens": 118324622.0, + "step": 98400 + }, + { + "entropy": 1.9427771091461181, + "epoch": 0.30506238961405313, + "grad_norm": 6.355135440826416, + "learning_rate": 4.580369901436422e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.8413548216223716, + "num_tokens": 118336164.0, + "step": 98410 + }, + { + "entropy": 1.9013810217380525, + "epoch": 0.30509338873910286, + "grad_norm": 8.685293197631836, + "learning_rate": 4.580137198083611e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8395018517971039, + "num_tokens": 118348429.0, + "step": 98420 + }, + { + "entropy": 1.8083545833826065, + "epoch": 0.3051243878641525, + "grad_norm": 9.026556968688965, + "learning_rate": 4.5799045301943205e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8529153957962989, + "num_tokens": 118361098.0, + "step": 98430 + }, + { + "entropy": 1.8322697907686234, + "epoch": 0.30515538698920225, + "grad_norm": 8.164215087890625, + "learning_rate": 4.579671897759546e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8390917211771012, + "num_tokens": 118374073.0, + "step": 98440 + }, + { + "entropy": 1.8684598043560983, + "epoch": 0.3051863861142519, + "grad_norm": 7.570199489593506, + "learning_rate": 4.579439300770282e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8506176099181175, + "num_tokens": 118385894.0, + "step": 98450 + }, + { + "entropy": 1.959225982427597, + "epoch": 0.30521738523930164, + "grad_norm": 8.114801406860352, + "learning_rate": 4.579206739217529e-06, + "loss": 0.5804, + "mean_token_accuracy": 0.8300017550587654, + "num_tokens": 118397286.0, + "step": 98460 + }, + { + "entropy": 1.865318314731121, + "epoch": 0.3052483843643513, + "grad_norm": 9.157855033874512, + "learning_rate": 4.57897421309229e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8516108497977257, + "num_tokens": 118408785.0, + "step": 98470 + }, + { + "entropy": 1.8846929490566253, + "epoch": 0.30527938348940103, + "grad_norm": 7.941446304321289, + "learning_rate": 4.5787417223855705e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8428527131676674, + "num_tokens": 118420758.0, + "step": 98480 + }, + { + "entropy": 1.8963923439383508, + "epoch": 0.3053103826144507, + "grad_norm": 9.210304260253906, + "learning_rate": 4.57850926708838e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8412079870700836, + "num_tokens": 118432505.0, + "step": 98490 + }, + { + "entropy": 1.9127984009683132, + "epoch": 0.30534138173950043, + "grad_norm": 8.439544677734375, + "learning_rate": 4.578276847191734e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8416270837187767, + "num_tokens": 118444368.0, + "step": 98500 + }, + { + "entropy": 1.8232401758432388, + "epoch": 0.3053723808645501, + "grad_norm": 8.125101089477539, + "learning_rate": 4.578044462686643e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8568010330200195, + "num_tokens": 118457274.0, + "step": 98510 + }, + { + "entropy": 1.988058003783226, + "epoch": 0.30540337998959977, + "grad_norm": 8.513818740844727, + "learning_rate": 4.577812113564129e-06, + "loss": 0.5254, + "mean_token_accuracy": 0.8399447157979012, + "num_tokens": 118468141.0, + "step": 98520 + }, + { + "entropy": 1.8777014121413231, + "epoch": 0.3054343791146495, + "grad_norm": 7.6989850997924805, + "learning_rate": 4.577579799815213e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.8421044409275055, + "num_tokens": 118479910.0, + "step": 98530 + }, + { + "entropy": 1.8567018955945969, + "epoch": 0.30546537823969916, + "grad_norm": 8.173233032226562, + "learning_rate": 4.57734752143092e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8535317838191986, + "num_tokens": 118491450.0, + "step": 98540 + }, + { + "entropy": 1.8805242463946343, + "epoch": 0.3054963773647489, + "grad_norm": 8.804410934448242, + "learning_rate": 4.577115278402281e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8533170059323311, + "num_tokens": 118502975.0, + "step": 98550 + }, + { + "entropy": 1.7814088612794876, + "epoch": 0.30552737648979855, + "grad_norm": 8.247437477111816, + "learning_rate": 4.5768830707203236e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8531454712152481, + "num_tokens": 118515769.0, + "step": 98560 + }, + { + "entropy": 1.8369492530822753, + "epoch": 0.3055583756148483, + "grad_norm": 7.819140911102295, + "learning_rate": 4.576650898376085e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8478553339838981, + "num_tokens": 118528258.0, + "step": 98570 + }, + { + "entropy": 1.868079724907875, + "epoch": 0.30558937473989795, + "grad_norm": 4.0178446769714355, + "learning_rate": 4.5764187613606045e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8433499991893768, + "num_tokens": 118540637.0, + "step": 98580 + }, + { + "entropy": 1.825770527124405, + "epoch": 0.30562037386494767, + "grad_norm": 6.859602451324463, + "learning_rate": 4.57618665966492e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8496432945132255, + "num_tokens": 118553277.0, + "step": 98590 + }, + { + "entropy": 1.8329296857118607, + "epoch": 0.30565137298999734, + "grad_norm": 7.438360214233398, + "learning_rate": 4.575954593280079e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.858691118657589, + "num_tokens": 118564885.0, + "step": 98600 + }, + { + "entropy": 1.8381449341773988, + "epoch": 0.30568237211504706, + "grad_norm": 4.074737071990967, + "learning_rate": 4.575722562197127e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8469588398933411, + "num_tokens": 118577848.0, + "step": 98610 + }, + { + "entropy": 1.85487762093544, + "epoch": 0.30571337124009673, + "grad_norm": 7.444735527038574, + "learning_rate": 4.575490566407115e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.849855688214302, + "num_tokens": 118590154.0, + "step": 98620 + }, + { + "entropy": 1.8266935624182223, + "epoch": 0.30574437036514646, + "grad_norm": 4.6787896156311035, + "learning_rate": 4.575258605901098e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8533053979277611, + "num_tokens": 118603425.0, + "step": 98630 + }, + { + "entropy": 1.886932836472988, + "epoch": 0.3057753694901961, + "grad_norm": 7.9331889152526855, + "learning_rate": 4.575026680670132e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8492220997810364, + "num_tokens": 118614798.0, + "step": 98640 + }, + { + "entropy": 1.803956750035286, + "epoch": 0.30580636861524585, + "grad_norm": 7.170061111450195, + "learning_rate": 4.5747947907052775e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8577531769871711, + "num_tokens": 118628191.0, + "step": 98650 + }, + { + "entropy": 1.8016648098826409, + "epoch": 0.3058373677402955, + "grad_norm": 3.502978801727295, + "learning_rate": 4.574562935997597e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8521159499883652, + "num_tokens": 118641385.0, + "step": 98660 + }, + { + "entropy": 1.8296085387468337, + "epoch": 0.30586836686534524, + "grad_norm": 8.538934707641602, + "learning_rate": 4.574331116538158e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8565147161483765, + "num_tokens": 118653485.0, + "step": 98670 + }, + { + "entropy": 1.8369739711284638, + "epoch": 0.3058993659903949, + "grad_norm": 7.255573749542236, + "learning_rate": 4.574099332318032e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8531767651438713, + "num_tokens": 118665318.0, + "step": 98680 + }, + { + "entropy": 1.8349349424242973, + "epoch": 0.30593036511544464, + "grad_norm": 8.855729103088379, + "learning_rate": 4.573867583328289e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8501504331827163, + "num_tokens": 118678218.0, + "step": 98690 + }, + { + "entropy": 1.8983908250927926, + "epoch": 0.3059613642404943, + "grad_norm": 8.671244621276855, + "learning_rate": 4.573635869560006e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8487844780087471, + "num_tokens": 118690095.0, + "step": 98700 + }, + { + "entropy": 1.8810410752892495, + "epoch": 0.30599236336554403, + "grad_norm": 8.668618202209473, + "learning_rate": 4.573404191004263e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8451575711369514, + "num_tokens": 118701551.0, + "step": 98710 + }, + { + "entropy": 1.9009374052286148, + "epoch": 0.3060233624905937, + "grad_norm": 8.427909851074219, + "learning_rate": 4.573172547652142e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.839051516354084, + "num_tokens": 118713008.0, + "step": 98720 + }, + { + "entropy": 1.9209471434354781, + "epoch": 0.3060543616156434, + "grad_norm": 7.763575077056885, + "learning_rate": 4.572940939494728e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.8371775403618813, + "num_tokens": 118724161.0, + "step": 98730 + }, + { + "entropy": 1.9421784669160842, + "epoch": 0.3060853607406931, + "grad_norm": 8.41716194152832, + "learning_rate": 4.5727093665231095e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8471307173371315, + "num_tokens": 118734733.0, + "step": 98740 + }, + { + "entropy": 1.8512969225645066, + "epoch": 0.3061163598657428, + "grad_norm": 4.9068193435668945, + "learning_rate": 4.57247782872838e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8417642742395401, + "num_tokens": 118747239.0, + "step": 98750 + }, + { + "entropy": 1.8801366731524467, + "epoch": 0.3061473589907925, + "grad_norm": 8.114398956298828, + "learning_rate": 4.5722463261016335e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8596544772386551, + "num_tokens": 118759918.0, + "step": 98760 + }, + { + "entropy": 1.8776177152991296, + "epoch": 0.30617835811584215, + "grad_norm": 7.3587141036987305, + "learning_rate": 4.572014858633968e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8482073560357094, + "num_tokens": 118771855.0, + "step": 98770 + }, + { + "entropy": 1.8818192645907401, + "epoch": 0.3062093572408919, + "grad_norm": 10.548823356628418, + "learning_rate": 4.571783426316486e-06, + "loss": 0.5322, + "mean_token_accuracy": 0.835597887635231, + "num_tokens": 118783654.0, + "step": 98780 + }, + { + "entropy": 1.8536282077431678, + "epoch": 0.30624035636594155, + "grad_norm": 3.757561206817627, + "learning_rate": 4.571552029140291e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.860873955488205, + "num_tokens": 118795662.0, + "step": 98790 + }, + { + "entropy": 1.8578494921326638, + "epoch": 0.30627135549099127, + "grad_norm": 6.849057197570801, + "learning_rate": 4.57132066709649e-06, + "loss": 0.5194, + "mean_token_accuracy": 0.8355998247861862, + "num_tokens": 118807279.0, + "step": 98800 + }, + { + "entropy": 1.9314350634813309, + "epoch": 0.30630235461604094, + "grad_norm": 9.63023853302002, + "learning_rate": 4.571089340176196e-06, + "loss": 0.5303, + "mean_token_accuracy": 0.8435678333044052, + "num_tokens": 118818128.0, + "step": 98810 + }, + { + "entropy": 1.816218902170658, + "epoch": 0.30633335374109066, + "grad_norm": 8.004508972167969, + "learning_rate": 4.570858048370521e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8539455533027649, + "num_tokens": 118830413.0, + "step": 98820 + }, + { + "entropy": 1.9110864594578743, + "epoch": 0.30636435286614033, + "grad_norm": 8.797080039978027, + "learning_rate": 4.570626791670582e-06, + "loss": 0.5289, + "mean_token_accuracy": 0.8348402082920074, + "num_tokens": 118841694.0, + "step": 98830 + }, + { + "entropy": 1.8853062570095063, + "epoch": 0.30639535199119006, + "grad_norm": 8.307490348815918, + "learning_rate": 4.570395570067499e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8420171350240707, + "num_tokens": 118853649.0, + "step": 98840 + }, + { + "entropy": 1.902986891567707, + "epoch": 0.3064263511162397, + "grad_norm": 6.329873561859131, + "learning_rate": 4.5701643835523984e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8435382351279259, + "num_tokens": 118864431.0, + "step": 98850 + }, + { + "entropy": 1.9434664219617843, + "epoch": 0.30645735024128945, + "grad_norm": 7.9458184242248535, + "learning_rate": 4.569933232116404e-06, + "loss": 0.5506, + "mean_token_accuracy": 0.8374830722808838, + "num_tokens": 118875327.0, + "step": 98860 + }, + { + "entropy": 1.824971318244934, + "epoch": 0.3064883493663391, + "grad_norm": 8.772302627563477, + "learning_rate": 4.569702115750646e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8616791382431984, + "num_tokens": 118888102.0, + "step": 98870 + }, + { + "entropy": 1.924868457019329, + "epoch": 0.30651934849138884, + "grad_norm": 7.891080379486084, + "learning_rate": 4.569471034446258e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8509886354207993, + "num_tokens": 118898982.0, + "step": 98880 + }, + { + "entropy": 1.8691664278507232, + "epoch": 0.3065503476164385, + "grad_norm": 8.10325813293457, + "learning_rate": 4.5692399881943754e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8400322616100311, + "num_tokens": 118910630.0, + "step": 98890 + }, + { + "entropy": 1.8352098122239113, + "epoch": 0.30658134674148824, + "grad_norm": 3.6725313663482666, + "learning_rate": 4.569008976986136e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8578842118382454, + "num_tokens": 118923666.0, + "step": 98900 + }, + { + "entropy": 1.852015070617199, + "epoch": 0.3066123458665379, + "grad_norm": 7.963640213012695, + "learning_rate": 4.568778000812685e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8513859167695046, + "num_tokens": 118935379.0, + "step": 98910 + }, + { + "entropy": 1.8301201090216637, + "epoch": 0.30664334499158763, + "grad_norm": 7.35858678817749, + "learning_rate": 4.568547059665164e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8498409286141395, + "num_tokens": 118948414.0, + "step": 98920 + }, + { + "entropy": 1.8332775115966797, + "epoch": 0.3066743441166373, + "grad_norm": 4.402047157287598, + "learning_rate": 4.568316153534725e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8492154240608215, + "num_tokens": 118961750.0, + "step": 98930 + }, + { + "entropy": 1.912602500617504, + "epoch": 0.306705343241687, + "grad_norm": 8.157501220703125, + "learning_rate": 4.568085282412518e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8328171208500862, + "num_tokens": 118973215.0, + "step": 98940 + }, + { + "entropy": 1.8730465933680533, + "epoch": 0.3067363423667367, + "grad_norm": 10.181466102600098, + "learning_rate": 4.567854446289697e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8437885701656341, + "num_tokens": 118985524.0, + "step": 98950 + }, + { + "entropy": 1.902740554511547, + "epoch": 0.3067673414917864, + "grad_norm": 9.36436939239502, + "learning_rate": 4.567623645157422e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8489101201295852, + "num_tokens": 118997046.0, + "step": 98960 + }, + { + "entropy": 1.8747300267219544, + "epoch": 0.3067983406168361, + "grad_norm": 9.957647323608398, + "learning_rate": 4.567392879006852e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8495268151164055, + "num_tokens": 119009030.0, + "step": 98970 + }, + { + "entropy": 1.8686933800578118, + "epoch": 0.3068293397418858, + "grad_norm": 7.555231094360352, + "learning_rate": 4.567162147829152e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8467110440135002, + "num_tokens": 119020843.0, + "step": 98980 + }, + { + "entropy": 1.884840413928032, + "epoch": 0.3068603388669355, + "grad_norm": 7.894239902496338, + "learning_rate": 4.56693145161549e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8425130993127823, + "num_tokens": 119032739.0, + "step": 98990 + }, + { + "entropy": 1.846720139682293, + "epoch": 0.3068913379919852, + "grad_norm": 7.1045403480529785, + "learning_rate": 4.566700790357034e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8530875831842423, + "num_tokens": 119044858.0, + "step": 99000 + }, + { + "entropy": 1.9004247322678567, + "epoch": 0.30692233711703487, + "grad_norm": 3.9099464416503906, + "learning_rate": 4.56647016404496e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8432266771793365, + "num_tokens": 119056720.0, + "step": 99010 + }, + { + "entropy": 1.845305010676384, + "epoch": 0.30695333624208454, + "grad_norm": 4.365594387054443, + "learning_rate": 4.566239572670445e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8529621243476868, + "num_tokens": 119069481.0, + "step": 99020 + }, + { + "entropy": 1.8452321007847785, + "epoch": 0.30698433536713426, + "grad_norm": 7.914214611053467, + "learning_rate": 4.566009016224666e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8480882182717323, + "num_tokens": 119081620.0, + "step": 99030 + }, + { + "entropy": 1.8590731248259544, + "epoch": 0.30701533449218393, + "grad_norm": 8.533273696899414, + "learning_rate": 4.565778494698808e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8423286929726601, + "num_tokens": 119093680.0, + "step": 99040 + }, + { + "entropy": 1.8688816666603087, + "epoch": 0.30704633361723366, + "grad_norm": 8.64100456237793, + "learning_rate": 4.5655480080840556e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8599061399698258, + "num_tokens": 119105568.0, + "step": 99050 + }, + { + "entropy": 1.853632289171219, + "epoch": 0.3070773327422833, + "grad_norm": 2.459723949432373, + "learning_rate": 4.565317556371598e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8528544023633003, + "num_tokens": 119118403.0, + "step": 99060 + }, + { + "entropy": 1.867573080956936, + "epoch": 0.30710833186733305, + "grad_norm": 7.473991394042969, + "learning_rate": 4.56508713955263e-06, + "loss": 0.477, + "mean_token_accuracy": 0.844258151948452, + "num_tokens": 119130445.0, + "step": 99070 + }, + { + "entropy": 1.8869913056492806, + "epoch": 0.3071393309923827, + "grad_norm": 6.70831298828125, + "learning_rate": 4.564856757618344e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8497973084449768, + "num_tokens": 119142338.0, + "step": 99080 + }, + { + "entropy": 1.8372093975543975, + "epoch": 0.30717033011743244, + "grad_norm": 7.9739603996276855, + "learning_rate": 4.564626410559939e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8506271958351135, + "num_tokens": 119154394.0, + "step": 99090 + }, + { + "entropy": 1.8323589369654656, + "epoch": 0.3072013292424821, + "grad_norm": 8.829228401184082, + "learning_rate": 4.564396098368618e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8452668026089668, + "num_tokens": 119167066.0, + "step": 99100 + }, + { + "entropy": 1.9608383178710938, + "epoch": 0.30723232836753184, + "grad_norm": 8.926813125610352, + "learning_rate": 4.564165821035583e-06, + "loss": 0.5453, + "mean_token_accuracy": 0.8367260545492172, + "num_tokens": 119178017.0, + "step": 99110 + }, + { + "entropy": 1.861896750330925, + "epoch": 0.3072633274925815, + "grad_norm": 3.713798761367798, + "learning_rate": 4.563935578552043e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8421308517456054, + "num_tokens": 119190396.0, + "step": 99120 + }, + { + "entropy": 1.9013278767466546, + "epoch": 0.30729432661763123, + "grad_norm": 9.141477584838867, + "learning_rate": 4.563705370909211e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8393938109278679, + "num_tokens": 119201792.0, + "step": 99130 + }, + { + "entropy": 1.850807547569275, + "epoch": 0.3073253257426809, + "grad_norm": 4.37838888168335, + "learning_rate": 4.563475198098299e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8489650651812554, + "num_tokens": 119214868.0, + "step": 99140 + }, + { + "entropy": 1.7602989837527274, + "epoch": 0.3073563248677306, + "grad_norm": 3.604959726333618, + "learning_rate": 4.563245060110523e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8620150581002235, + "num_tokens": 119228299.0, + "step": 99150 + }, + { + "entropy": 1.949777290225029, + "epoch": 0.3073873239927803, + "grad_norm": 7.350498199462891, + "learning_rate": 4.563014956937104e-06, + "loss": 0.5215, + "mean_token_accuracy": 0.8472014635801315, + "num_tokens": 119239986.0, + "step": 99160 + }, + { + "entropy": 1.9088080897927284, + "epoch": 0.30741832311783, + "grad_norm": 8.599087715148926, + "learning_rate": 4.562784888569266e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8541075602173805, + "num_tokens": 119251421.0, + "step": 99170 + }, + { + "entropy": 1.9487151458859444, + "epoch": 0.3074493222428797, + "grad_norm": 9.30274772644043, + "learning_rate": 4.562554854998235e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.842944149672985, + "num_tokens": 119263018.0, + "step": 99180 + }, + { + "entropy": 1.8647134892642498, + "epoch": 0.3074803213679294, + "grad_norm": 7.472745418548584, + "learning_rate": 4.56232485621524e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8561579942703247, + "num_tokens": 119275984.0, + "step": 99190 + }, + { + "entropy": 1.8933315068483352, + "epoch": 0.3075113204929791, + "grad_norm": 8.288209915161133, + "learning_rate": 4.5620948922115156e-06, + "loss": 0.5475, + "mean_token_accuracy": 0.831151905655861, + "num_tokens": 119287771.0, + "step": 99200 + }, + { + "entropy": 1.8554987102746963, + "epoch": 0.3075423196180288, + "grad_norm": 7.526989459991455, + "learning_rate": 4.561864962978294e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8532639876008034, + "num_tokens": 119299903.0, + "step": 99210 + }, + { + "entropy": 1.9998096972703934, + "epoch": 0.30757331874307847, + "grad_norm": 9.368232727050781, + "learning_rate": 4.5616350685068165e-06, + "loss": 0.567, + "mean_token_accuracy": 0.8291105717420578, + "num_tokens": 119310444.0, + "step": 99220 + }, + { + "entropy": 1.898825192451477, + "epoch": 0.3076043178681282, + "grad_norm": 8.540651321411133, + "learning_rate": 4.561405208788324e-06, + "loss": 0.5238, + "mean_token_accuracy": 0.8367143541574478, + "num_tokens": 119321635.0, + "step": 99230 + }, + { + "entropy": 1.8685836613178253, + "epoch": 0.30763531699317787, + "grad_norm": 3.111175775527954, + "learning_rate": 4.561175383814061e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8581192702054977, + "num_tokens": 119333768.0, + "step": 99240 + }, + { + "entropy": 1.8917466133832932, + "epoch": 0.3076663161182276, + "grad_norm": 8.266393661499023, + "learning_rate": 4.560945593575276e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8560093253850937, + "num_tokens": 119345845.0, + "step": 99250 + }, + { + "entropy": 1.9194523006677628, + "epoch": 0.30769731524327726, + "grad_norm": 7.8154683113098145, + "learning_rate": 4.560715838063221e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8392749205231667, + "num_tokens": 119356959.0, + "step": 99260 + }, + { + "entropy": 1.9294108331203461, + "epoch": 0.3077283143683269, + "grad_norm": 7.9482831954956055, + "learning_rate": 4.560486117269149e-06, + "loss": 0.5774, + "mean_token_accuracy": 0.8344770103693009, + "num_tokens": 119368443.0, + "step": 99270 + }, + { + "entropy": 1.8910026296973228, + "epoch": 0.30775931349337665, + "grad_norm": 6.961696624755859, + "learning_rate": 4.560256431184316e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8482179164886474, + "num_tokens": 119380584.0, + "step": 99280 + }, + { + "entropy": 1.9641872704029084, + "epoch": 0.3077903126184263, + "grad_norm": 9.529417991638184, + "learning_rate": 4.5600267797999856e-06, + "loss": 0.5362, + "mean_token_accuracy": 0.8355140164494514, + "num_tokens": 119390739.0, + "step": 99290 + }, + { + "entropy": 1.8502513483166694, + "epoch": 0.30782131174347604, + "grad_norm": 7.629122734069824, + "learning_rate": 4.55979716310742e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8559545025229454, + "num_tokens": 119403174.0, + "step": 99300 + }, + { + "entropy": 1.9880924820899963, + "epoch": 0.3078523108685257, + "grad_norm": 11.568595886230469, + "learning_rate": 4.5595675810978835e-06, + "loss": 0.5379, + "mean_token_accuracy": 0.8477470189332962, + "num_tokens": 119414065.0, + "step": 99310 + }, + { + "entropy": 1.9389469519257545, + "epoch": 0.30788330999357544, + "grad_norm": 8.663850784301758, + "learning_rate": 4.559338033762647e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8486195042729378, + "num_tokens": 119425228.0, + "step": 99320 + }, + { + "entropy": 1.8266063407063484, + "epoch": 0.3079143091186251, + "grad_norm": 5.284822940826416, + "learning_rate": 4.559108521092985e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.861154480278492, + "num_tokens": 119439104.0, + "step": 99330 + }, + { + "entropy": 1.8920496240258218, + "epoch": 0.30794530824367483, + "grad_norm": 3.9276669025421143, + "learning_rate": 4.558879043080171e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8588141456246376, + "num_tokens": 119451373.0, + "step": 99340 + }, + { + "entropy": 1.951807254552841, + "epoch": 0.3079763073687245, + "grad_norm": 7.063690662384033, + "learning_rate": 4.5586495997154835e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.8351276248693467, + "num_tokens": 119462888.0, + "step": 99350 + }, + { + "entropy": 2.002240237593651, + "epoch": 0.3080073064937742, + "grad_norm": 9.491320610046387, + "learning_rate": 4.558420190990207e-06, + "loss": 0.5519, + "mean_token_accuracy": 0.8310097977519035, + "num_tokens": 119473869.0, + "step": 99360 + }, + { + "entropy": 1.847242882847786, + "epoch": 0.3080383056188239, + "grad_norm": 8.938675880432129, + "learning_rate": 4.558190816895623e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8449984312057495, + "num_tokens": 119486256.0, + "step": 99370 + }, + { + "entropy": 1.865819439291954, + "epoch": 0.3080693047438736, + "grad_norm": 7.645913600921631, + "learning_rate": 4.557961477423024e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.848369000852108, + "num_tokens": 119497901.0, + "step": 99380 + }, + { + "entropy": 1.8621415674686432, + "epoch": 0.3081003038689233, + "grad_norm": 9.088942527770996, + "learning_rate": 4.557732172563696e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8500281676650048, + "num_tokens": 119510003.0, + "step": 99390 + }, + { + "entropy": 1.8306550726294517, + "epoch": 0.308131302993973, + "grad_norm": 7.942266941070557, + "learning_rate": 4.557502902308936e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8491405546665192, + "num_tokens": 119522659.0, + "step": 99400 + }, + { + "entropy": 1.9008427813649178, + "epoch": 0.3081623021190227, + "grad_norm": 10.027981758117676, + "learning_rate": 4.557273666650041e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8394996523857117, + "num_tokens": 119534862.0, + "step": 99410 + }, + { + "entropy": 1.9339268684387207, + "epoch": 0.3081933012440724, + "grad_norm": 8.332462310791016, + "learning_rate": 4.5570444655783105e-06, + "loss": 0.5281, + "mean_token_accuracy": 0.848136767745018, + "num_tokens": 119546767.0, + "step": 99420 + }, + { + "entropy": 1.8939686760306358, + "epoch": 0.3082243003691221, + "grad_norm": 7.6620259284973145, + "learning_rate": 4.556815299085049e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8582808762788773, + "num_tokens": 119558146.0, + "step": 99430 + }, + { + "entropy": 1.8960684776306151, + "epoch": 0.3082552994941718, + "grad_norm": 8.92011547088623, + "learning_rate": 4.556586167161562e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8485264018177986, + "num_tokens": 119569433.0, + "step": 99440 + }, + { + "entropy": 1.8812731936573983, + "epoch": 0.30828629861922147, + "grad_norm": 7.918720722198486, + "learning_rate": 4.556357069799159e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8485799968242645, + "num_tokens": 119581623.0, + "step": 99450 + }, + { + "entropy": 1.9013389393687248, + "epoch": 0.3083172977442712, + "grad_norm": 8.778814315795898, + "learning_rate": 4.556128006989152e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8451300621032715, + "num_tokens": 119593213.0, + "step": 99460 + }, + { + "entropy": 1.8785056233406068, + "epoch": 0.30834829686932086, + "grad_norm": 7.801638126373291, + "learning_rate": 4.555898978722858e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8590483710169792, + "num_tokens": 119605344.0, + "step": 99470 + }, + { + "entropy": 1.9365007862448693, + "epoch": 0.3083792959943706, + "grad_norm": 10.32783317565918, + "learning_rate": 4.555669984991595e-06, + "loss": 0.5556, + "mean_token_accuracy": 0.830289502441883, + "num_tokens": 119617211.0, + "step": 99480 + }, + { + "entropy": 1.9110652074217795, + "epoch": 0.30841029511942025, + "grad_norm": 7.38109016418457, + "learning_rate": 4.555441025786685e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8413866236805916, + "num_tokens": 119628112.0, + "step": 99490 + }, + { + "entropy": 1.9205258354544639, + "epoch": 0.30844129424447, + "grad_norm": 7.779130458831787, + "learning_rate": 4.555212101099452e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8441153079271316, + "num_tokens": 119639907.0, + "step": 99500 + }, + { + "entropy": 1.9123625665903092, + "epoch": 0.30847229336951965, + "grad_norm": 3.761373281478882, + "learning_rate": 4.554983210921223e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8475768864154816, + "num_tokens": 119651966.0, + "step": 99510 + }, + { + "entropy": 1.8980589851737022, + "epoch": 0.3085032924945693, + "grad_norm": 9.282678604125977, + "learning_rate": 4.55475435524333e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8474918335676194, + "num_tokens": 119663224.0, + "step": 99520 + }, + { + "entropy": 1.807744611799717, + "epoch": 0.30853429161961904, + "grad_norm": 8.637643814086914, + "learning_rate": 4.554525534057108e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8565024748444557, + "num_tokens": 119676021.0, + "step": 99530 + }, + { + "entropy": 1.936571803689003, + "epoch": 0.3085652907446687, + "grad_norm": 8.615496635437012, + "learning_rate": 4.554296747353892e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8555189058184623, + "num_tokens": 119686726.0, + "step": 99540 + }, + { + "entropy": 1.869454000890255, + "epoch": 0.30859628986971843, + "grad_norm": 8.078155517578125, + "learning_rate": 4.554067995125023e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8461382001638412, + "num_tokens": 119698220.0, + "step": 99550 + }, + { + "entropy": 1.8459780648350717, + "epoch": 0.3086272889947681, + "grad_norm": 4.338291168212891, + "learning_rate": 4.5538392773618436e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8427159383893013, + "num_tokens": 119710621.0, + "step": 99560 + }, + { + "entropy": 1.8610286951065063, + "epoch": 0.3086582881198178, + "grad_norm": 8.782474517822266, + "learning_rate": 4.553610594055699e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8488268256187439, + "num_tokens": 119722541.0, + "step": 99570 + }, + { + "entropy": 1.9070811703801156, + "epoch": 0.3086892872448675, + "grad_norm": 7.238033771514893, + "learning_rate": 4.553381945197941e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.836867593228817, + "num_tokens": 119733908.0, + "step": 99580 + }, + { + "entropy": 1.9172224178910255, + "epoch": 0.3087202863699172, + "grad_norm": 8.506465911865234, + "learning_rate": 4.553153330779919e-06, + "loss": 0.5402, + "mean_token_accuracy": 0.8402721121907234, + "num_tokens": 119745237.0, + "step": 99590 + }, + { + "entropy": 1.8876484125852584, + "epoch": 0.3087512854949669, + "grad_norm": 4.413023471832275, + "learning_rate": 4.552924750792989e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.846372652053833, + "num_tokens": 119757146.0, + "step": 99600 + }, + { + "entropy": 1.8622125744819642, + "epoch": 0.3087822846200166, + "grad_norm": 8.20417308807373, + "learning_rate": 4.552696205228509e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8524843230843544, + "num_tokens": 119769534.0, + "step": 99610 + }, + { + "entropy": 1.847460262477398, + "epoch": 0.3088132837450663, + "grad_norm": 7.062432289123535, + "learning_rate": 4.552467694077842e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8552860572934151, + "num_tokens": 119781883.0, + "step": 99620 + }, + { + "entropy": 1.8279512047767639, + "epoch": 0.308844282870116, + "grad_norm": 3.6404471397399902, + "learning_rate": 4.552239217332351e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8477238088846206, + "num_tokens": 119795146.0, + "step": 99630 + }, + { + "entropy": 1.8767915710806846, + "epoch": 0.3088752819951657, + "grad_norm": 10.762784004211426, + "learning_rate": 4.552010774983402e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8552503928542137, + "num_tokens": 119806803.0, + "step": 99640 + }, + { + "entropy": 1.8293926134705543, + "epoch": 0.3089062811202154, + "grad_norm": 7.491893291473389, + "learning_rate": 4.551782367022367e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8511452227830887, + "num_tokens": 119819906.0, + "step": 99650 + }, + { + "entropy": 1.8923638671636582, + "epoch": 0.30893728024526507, + "grad_norm": 9.077742576599121, + "learning_rate": 4.55155399344062e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8456351578235626, + "num_tokens": 119831777.0, + "step": 99660 + }, + { + "entropy": 1.9003022998571395, + "epoch": 0.3089682793703148, + "grad_norm": 8.246248245239258, + "learning_rate": 4.551325654229535e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8550438165664673, + "num_tokens": 119843799.0, + "step": 99670 + }, + { + "entropy": 1.8228402808308601, + "epoch": 0.30899927849536446, + "grad_norm": 5.1642746925354, + "learning_rate": 4.551097349380495e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8491297498345375, + "num_tokens": 119856606.0, + "step": 99680 + }, + { + "entropy": 1.96132450401783, + "epoch": 0.3090302776204142, + "grad_norm": 9.276740074157715, + "learning_rate": 4.550869078884878e-06, + "loss": 0.5335, + "mean_token_accuracy": 0.8399526447057724, + "num_tokens": 119867481.0, + "step": 99690 + }, + { + "entropy": 1.8656087532639503, + "epoch": 0.30906127674546385, + "grad_norm": 8.782885551452637, + "learning_rate": 4.550640842734073e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8477922558784485, + "num_tokens": 119879627.0, + "step": 99700 + }, + { + "entropy": 1.8200937718153, + "epoch": 0.3090922758705136, + "grad_norm": 9.573867797851562, + "learning_rate": 4.550412640919468e-06, + "loss": 0.5322, + "mean_token_accuracy": 0.8348228052258492, + "num_tokens": 119892028.0, + "step": 99710 + }, + { + "entropy": 1.8446323826909066, + "epoch": 0.30912327499556325, + "grad_norm": 8.252911567687988, + "learning_rate": 4.550184473432453e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8526568725705147, + "num_tokens": 119904371.0, + "step": 99720 + }, + { + "entropy": 1.9776091009378434, + "epoch": 0.30915427412061297, + "grad_norm": 9.516276359558105, + "learning_rate": 4.5499563402644234e-06, + "loss": 0.5689, + "mean_token_accuracy": 0.8284485876560211, + "num_tokens": 119915090.0, + "step": 99730 + }, + { + "entropy": 1.9320985347032547, + "epoch": 0.30918527324566264, + "grad_norm": 9.039697647094727, + "learning_rate": 4.5497282414067775e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.850306898355484, + "num_tokens": 119926461.0, + "step": 99740 + }, + { + "entropy": 1.9415932521224022, + "epoch": 0.30921627237071236, + "grad_norm": 9.981196403503418, + "learning_rate": 4.549500176850916e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.848623238503933, + "num_tokens": 119937402.0, + "step": 99750 + }, + { + "entropy": 1.8851657703518867, + "epoch": 0.30924727149576203, + "grad_norm": 6.050877571105957, + "learning_rate": 4.549272146588241e-06, + "loss": 0.525, + "mean_token_accuracy": 0.8291679427027703, + "num_tokens": 119948670.0, + "step": 99760 + }, + { + "entropy": 1.8636073276400567, + "epoch": 0.3092782706208117, + "grad_norm": 9.428133964538574, + "learning_rate": 4.54904415061016e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8505959510803223, + "num_tokens": 119961140.0, + "step": 99770 + }, + { + "entropy": 1.8380408152937888, + "epoch": 0.3093092697458614, + "grad_norm": 8.414907455444336, + "learning_rate": 4.548816188908081e-06, + "loss": 0.501, + "mean_token_accuracy": 0.839817276597023, + "num_tokens": 119973426.0, + "step": 99780 + }, + { + "entropy": 1.8116330251097679, + "epoch": 0.3093402688709111, + "grad_norm": 8.415764808654785, + "learning_rate": 4.548588261473421e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8535687699913979, + "num_tokens": 119986183.0, + "step": 99790 + }, + { + "entropy": 1.8176366582512855, + "epoch": 0.3093712679959608, + "grad_norm": 9.07093620300293, + "learning_rate": 4.548360368297591e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8550222635269165, + "num_tokens": 119998588.0, + "step": 99800 + }, + { + "entropy": 1.9179295137524606, + "epoch": 0.3094022671210105, + "grad_norm": 8.444205284118652, + "learning_rate": 4.548132509372013e-06, + "loss": 0.5187, + "mean_token_accuracy": 0.8391743138432503, + "num_tokens": 120010012.0, + "step": 99810 + }, + { + "entropy": 1.8938566118478775, + "epoch": 0.3094332662460602, + "grad_norm": 8.31008529663086, + "learning_rate": 4.5479046846881064e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8451041653752327, + "num_tokens": 120022944.0, + "step": 99820 + }, + { + "entropy": 1.906507858633995, + "epoch": 0.3094642653711099, + "grad_norm": 8.747406959533691, + "learning_rate": 4.547676894237297e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8478483647108078, + "num_tokens": 120034662.0, + "step": 99830 + }, + { + "entropy": 1.8798761084675788, + "epoch": 0.3094952644961596, + "grad_norm": 9.963387489318848, + "learning_rate": 4.547449138011013e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8498962029814721, + "num_tokens": 120047172.0, + "step": 99840 + }, + { + "entropy": 1.8998943746089936, + "epoch": 0.3095262636212093, + "grad_norm": 8.54676342010498, + "learning_rate": 4.5472214160006844e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.8516019433736801, + "num_tokens": 120058507.0, + "step": 99850 + }, + { + "entropy": 1.8910057291388511, + "epoch": 0.309557262746259, + "grad_norm": 9.665603637695312, + "learning_rate": 4.546993728197744e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8422492250800133, + "num_tokens": 120070406.0, + "step": 99860 + }, + { + "entropy": 1.9007139950990677, + "epoch": 0.30958826187130867, + "grad_norm": 8.319001197814941, + "learning_rate": 4.546766074593631e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8561020061373711, + "num_tokens": 120082456.0, + "step": 99870 + }, + { + "entropy": 1.9839264631271363, + "epoch": 0.3096192609963584, + "grad_norm": 7.754944324493408, + "learning_rate": 4.546538455179782e-06, + "loss": 0.5336, + "mean_token_accuracy": 0.836320398747921, + "num_tokens": 120093393.0, + "step": 99880 + }, + { + "entropy": 1.827572251856327, + "epoch": 0.30965026012140806, + "grad_norm": 6.521658897399902, + "learning_rate": 4.546310869947643e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8617355406284333, + "num_tokens": 120106497.0, + "step": 99890 + }, + { + "entropy": 1.952468115091324, + "epoch": 0.3096812592464578, + "grad_norm": 7.609374523162842, + "learning_rate": 4.546083318888656e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8570076480507851, + "num_tokens": 120117150.0, + "step": 99900 + }, + { + "entropy": 1.9119912654161453, + "epoch": 0.30971225837150745, + "grad_norm": 8.479022979736328, + "learning_rate": 4.5458558019942736e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8490529209375381, + "num_tokens": 120128766.0, + "step": 99910 + }, + { + "entropy": 1.8472444072365761, + "epoch": 0.3097432574965572, + "grad_norm": 8.135096549987793, + "learning_rate": 4.5456283192559455e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8514307633042335, + "num_tokens": 120140846.0, + "step": 99920 + }, + { + "entropy": 1.8320681855082512, + "epoch": 0.30977425662160685, + "grad_norm": 7.373263835906982, + "learning_rate": 4.5454008706651255e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8532561391592026, + "num_tokens": 120152671.0, + "step": 99930 + }, + { + "entropy": 1.8469135209918022, + "epoch": 0.30980525574665657, + "grad_norm": 6.601968288421631, + "learning_rate": 4.545173456213272e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8549439549446106, + "num_tokens": 120165189.0, + "step": 99940 + }, + { + "entropy": 1.8578027948737144, + "epoch": 0.30983625487170624, + "grad_norm": 9.094026565551758, + "learning_rate": 4.5449460758918485e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8560818761587143, + "num_tokens": 120177445.0, + "step": 99950 + }, + { + "entropy": 1.8183515168726445, + "epoch": 0.30986725399675596, + "grad_norm": 8.197356224060059, + "learning_rate": 4.544718729692315e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8482865273952485, + "num_tokens": 120190572.0, + "step": 99960 + }, + { + "entropy": 1.8944060549139976, + "epoch": 0.30989825312180563, + "grad_norm": 8.696087837219238, + "learning_rate": 4.544491417606139e-06, + "loss": 0.5191, + "mean_token_accuracy": 0.8414214491844177, + "num_tokens": 120202087.0, + "step": 99970 + }, + { + "entropy": 1.7967160180211068, + "epoch": 0.30992925224685536, + "grad_norm": 8.540863990783691, + "learning_rate": 4.544264139624791e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8545686945319175, + "num_tokens": 120214607.0, + "step": 99980 + }, + { + "entropy": 1.814838644862175, + "epoch": 0.309960251371905, + "grad_norm": 6.886814594268799, + "learning_rate": 4.544036895739743e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8569683223962784, + "num_tokens": 120227295.0, + "step": 99990 + }, + { + "entropy": 1.8500340178608894, + "epoch": 0.30999125049695475, + "grad_norm": 4.226168632507324, + "learning_rate": 4.5438096859424714e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8484739258885383, + "num_tokens": 120240549.0, + "step": 100000 + }, + { + "entropy": 1.7947644501924516, + "epoch": 0.3100222496220044, + "grad_norm": 4.175483703613281, + "learning_rate": 4.543582510224454e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8510774120688438, + "num_tokens": 120253357.0, + "step": 100010 + }, + { + "entropy": 1.8611189171671867, + "epoch": 0.3100532487470541, + "grad_norm": 11.313339233398438, + "learning_rate": 4.543355368577173e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8596706509590148, + "num_tokens": 120265081.0, + "step": 100020 + }, + { + "entropy": 1.8015659287571908, + "epoch": 0.3100842478721038, + "grad_norm": 8.804397583007812, + "learning_rate": 4.543128260992112e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8543227300047874, + "num_tokens": 120278300.0, + "step": 100030 + }, + { + "entropy": 1.8594049900770186, + "epoch": 0.3101152469971535, + "grad_norm": 9.17402458190918, + "learning_rate": 4.54290118746076e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8479228600859642, + "num_tokens": 120289822.0, + "step": 100040 + }, + { + "entropy": 1.808043359220028, + "epoch": 0.3101462461222032, + "grad_norm": 7.376701831817627, + "learning_rate": 4.542674147974606e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8577449038624764, + "num_tokens": 120302594.0, + "step": 100050 + }, + { + "entropy": 1.9016132436692714, + "epoch": 0.3101772452472529, + "grad_norm": 8.157571792602539, + "learning_rate": 4.5424471425251435e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8419206649065017, + "num_tokens": 120314205.0, + "step": 100060 + }, + { + "entropy": 1.8275883719325066, + "epoch": 0.3102082443723026, + "grad_norm": 8.187117576599121, + "learning_rate": 4.542220171103871e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8554164975881576, + "num_tokens": 120327120.0, + "step": 100070 + }, + { + "entropy": 1.8765954077243805, + "epoch": 0.31023924349735227, + "grad_norm": 2.51336932182312, + "learning_rate": 4.541993233702286e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8441577076911926, + "num_tokens": 120338971.0, + "step": 100080 + }, + { + "entropy": 1.8348333820700646, + "epoch": 0.310270242622402, + "grad_norm": 9.080048561096191, + "learning_rate": 4.541766330311893e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8454143956303597, + "num_tokens": 120351609.0, + "step": 100090 + }, + { + "entropy": 1.8745562806725502, + "epoch": 0.31030124174745166, + "grad_norm": 7.322804927825928, + "learning_rate": 4.541539460924194e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8521630093455315, + "num_tokens": 120363483.0, + "step": 100100 + }, + { + "entropy": 1.899325506389141, + "epoch": 0.3103322408725014, + "grad_norm": 9.89995002746582, + "learning_rate": 4.541312625530701e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8446615591645241, + "num_tokens": 120374672.0, + "step": 100110 + }, + { + "entropy": 1.8418832674622536, + "epoch": 0.31036323999755105, + "grad_norm": 7.776636123657227, + "learning_rate": 4.541085824122922e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8611134603619576, + "num_tokens": 120386805.0, + "step": 100120 + }, + { + "entropy": 1.908632791042328, + "epoch": 0.3103942391226008, + "grad_norm": 9.52039909362793, + "learning_rate": 4.540859056692375e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.8384132295846939, + "num_tokens": 120398539.0, + "step": 100130 + }, + { + "entropy": 1.9329130694270134, + "epoch": 0.31042523824765045, + "grad_norm": 8.478260040283203, + "learning_rate": 4.540632323230573e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.8484991729259491, + "num_tokens": 120409458.0, + "step": 100140 + }, + { + "entropy": 1.9148776784539223, + "epoch": 0.3104562373727002, + "grad_norm": 8.679959297180176, + "learning_rate": 4.54040562372904e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8426013261079788, + "num_tokens": 120420579.0, + "step": 100150 + }, + { + "entropy": 1.8648797929286958, + "epoch": 0.31048723649774984, + "grad_norm": 4.328078269958496, + "learning_rate": 4.5401789581792985e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.846242044866085, + "num_tokens": 120432678.0, + "step": 100160 + }, + { + "entropy": 1.8050495654344558, + "epoch": 0.31051823562279957, + "grad_norm": 9.263623237609863, + "learning_rate": 4.539952326572873e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8572616815567017, + "num_tokens": 120446167.0, + "step": 100170 + }, + { + "entropy": 2.000722700357437, + "epoch": 0.31054923474784923, + "grad_norm": 7.7509355545043945, + "learning_rate": 4.539725728901292e-06, + "loss": 0.5552, + "mean_token_accuracy": 0.8357238873839379, + "num_tokens": 120457079.0, + "step": 100180 + }, + { + "entropy": 1.8446622014045715, + "epoch": 0.31058023387289896, + "grad_norm": 7.534979820251465, + "learning_rate": 4.539499165156091e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8397464141249656, + "num_tokens": 120469823.0, + "step": 100190 + }, + { + "entropy": 1.8747181564569473, + "epoch": 0.3106112329979486, + "grad_norm": 3.7008886337280273, + "learning_rate": 4.539272635328803e-06, + "loss": 0.472, + "mean_token_accuracy": 0.846717146039009, + "num_tokens": 120482088.0, + "step": 100200 + }, + { + "entropy": 1.8778936117887497, + "epoch": 0.31064223212299835, + "grad_norm": 8.01706314086914, + "learning_rate": 4.539046139410965e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.8487760618329048, + "num_tokens": 120494848.0, + "step": 100210 + }, + { + "entropy": 1.8788290441036224, + "epoch": 0.310673231248048, + "grad_norm": 7.9294915199279785, + "learning_rate": 4.53881967739412e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8471907436847687, + "num_tokens": 120506578.0, + "step": 100220 + }, + { + "entropy": 1.8919280216097831, + "epoch": 0.31070423037309775, + "grad_norm": 4.634057521820068, + "learning_rate": 4.53859324926981e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8390332102775574, + "num_tokens": 120518962.0, + "step": 100230 + }, + { + "entropy": 1.73563092648983, + "epoch": 0.3107352294981474, + "grad_norm": 6.966121196746826, + "learning_rate": 4.538366855029584e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8594012647867203, + "num_tokens": 120532862.0, + "step": 100240 + }, + { + "entropy": 1.8883020922541618, + "epoch": 0.3107662286231971, + "grad_norm": 2.8376030921936035, + "learning_rate": 4.53814049466499e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8497428327798844, + "num_tokens": 120544852.0, + "step": 100250 + }, + { + "entropy": 1.980030670762062, + "epoch": 0.3107972277482468, + "grad_norm": 9.871118545532227, + "learning_rate": 4.537914168167582e-06, + "loss": 0.517, + "mean_token_accuracy": 0.8421619832515717, + "num_tokens": 120555617.0, + "step": 100260 + }, + { + "entropy": 1.9399761855602264, + "epoch": 0.3108282268732965, + "grad_norm": 8.313036918640137, + "learning_rate": 4.5376878755289136e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8462784796953201, + "num_tokens": 120567039.0, + "step": 100270 + }, + { + "entropy": 1.8688796371221543, + "epoch": 0.3108592259983462, + "grad_norm": 8.893397331237793, + "learning_rate": 4.537461616740546e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8576091736555099, + "num_tokens": 120579341.0, + "step": 100280 + }, + { + "entropy": 1.902301235496998, + "epoch": 0.31089022512339587, + "grad_norm": 8.338861465454102, + "learning_rate": 4.53723539179404e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8621632814407348, + "num_tokens": 120590733.0, + "step": 100290 + }, + { + "entropy": 1.9393490612506867, + "epoch": 0.3109212242484456, + "grad_norm": 7.925196170806885, + "learning_rate": 4.53700920068096e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.8450248718261719, + "num_tokens": 120601534.0, + "step": 100300 + }, + { + "entropy": 1.8962682634592056, + "epoch": 0.31095222337349526, + "grad_norm": 7.99591064453125, + "learning_rate": 4.536783043392873e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8482768073678016, + "num_tokens": 120612997.0, + "step": 100310 + }, + { + "entropy": 1.9093605667352676, + "epoch": 0.310983222498545, + "grad_norm": 8.260348320007324, + "learning_rate": 4.536556919921349e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8399380281567573, + "num_tokens": 120624613.0, + "step": 100320 + }, + { + "entropy": 1.8416539490222932, + "epoch": 0.31101422162359466, + "grad_norm": 4.14152193069458, + "learning_rate": 4.536330830257964e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8592947632074356, + "num_tokens": 120637123.0, + "step": 100330 + }, + { + "entropy": 1.9081314831972123, + "epoch": 0.3110452207486444, + "grad_norm": 10.202445030212402, + "learning_rate": 4.536104774394291e-06, + "loss": 0.5284, + "mean_token_accuracy": 0.8314228966832161, + "num_tokens": 120648950.0, + "step": 100340 + }, + { + "entropy": 1.8932959333062171, + "epoch": 0.31107621987369405, + "grad_norm": 9.009939193725586, + "learning_rate": 4.5358787523219115e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8498683959245682, + "num_tokens": 120660896.0, + "step": 100350 + }, + { + "entropy": 1.8493494719266892, + "epoch": 0.3111072189987438, + "grad_norm": 3.8888235092163086, + "learning_rate": 4.535652764032407e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8555730670690537, + "num_tokens": 120673291.0, + "step": 100360 + }, + { + "entropy": 1.8772372379899025, + "epoch": 0.31113821812379344, + "grad_norm": 8.41147232055664, + "learning_rate": 4.535426809517363e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.844378887116909, + "num_tokens": 120685691.0, + "step": 100370 + }, + { + "entropy": 1.86507987678051, + "epoch": 0.31116921724884317, + "grad_norm": 8.958087921142578, + "learning_rate": 4.535200888768366e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8503421053290368, + "num_tokens": 120697935.0, + "step": 100380 + }, + { + "entropy": 1.849946916103363, + "epoch": 0.31120021637389284, + "grad_norm": 7.368825435638428, + "learning_rate": 4.534975001777008e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8594206288456917, + "num_tokens": 120709752.0, + "step": 100390 + }, + { + "entropy": 1.859765648841858, + "epoch": 0.31123121549894256, + "grad_norm": 7.649475574493408, + "learning_rate": 4.5347491485348835e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8555939674377442, + "num_tokens": 120721721.0, + "step": 100400 + }, + { + "entropy": 1.877078004181385, + "epoch": 0.31126221462399223, + "grad_norm": 8.297224998474121, + "learning_rate": 4.534523329033589e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8504548445343971, + "num_tokens": 120732890.0, + "step": 100410 + }, + { + "entropy": 1.8714374005794525, + "epoch": 0.31129321374904195, + "grad_norm": 4.207152366638184, + "learning_rate": 4.534297543264725e-06, + "loss": 0.5621, + "mean_token_accuracy": 0.8266729831695556, + "num_tokens": 120745682.0, + "step": 100420 + }, + { + "entropy": 1.8621924549341202, + "epoch": 0.3113242128740916, + "grad_norm": 8.956070899963379, + "learning_rate": 4.534071791219892e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8492157995700836, + "num_tokens": 120758011.0, + "step": 100430 + }, + { + "entropy": 1.9551179885864258, + "epoch": 0.31135521199914135, + "grad_norm": 8.54804515838623, + "learning_rate": 4.533846072890697e-06, + "loss": 0.5251, + "mean_token_accuracy": 0.83407664000988, + "num_tokens": 120768890.0, + "step": 100440 + }, + { + "entropy": 1.8986855819821358, + "epoch": 0.311386211124191, + "grad_norm": 8.727336883544922, + "learning_rate": 4.533620388268749e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8469956681132317, + "num_tokens": 120780472.0, + "step": 100450 + }, + { + "entropy": 1.8609646454453468, + "epoch": 0.31141721024924074, + "grad_norm": 7.309536457061768, + "learning_rate": 4.533394737345659e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8551263570785522, + "num_tokens": 120792295.0, + "step": 100460 + }, + { + "entropy": 1.8796255081892013, + "epoch": 0.3114482093742904, + "grad_norm": 10.224238395690918, + "learning_rate": 4.5331691201130415e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8438882201910018, + "num_tokens": 120804242.0, + "step": 100470 + }, + { + "entropy": 1.907512204349041, + "epoch": 0.31147920849934013, + "grad_norm": 10.103514671325684, + "learning_rate": 4.532943536562514e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8437295973300933, + "num_tokens": 120815319.0, + "step": 100480 + }, + { + "entropy": 1.8957229569554328, + "epoch": 0.3115102076243898, + "grad_norm": 5.280575752258301, + "learning_rate": 4.5327179866856965e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8484357133507728, + "num_tokens": 120827094.0, + "step": 100490 + }, + { + "entropy": 1.856425480544567, + "epoch": 0.31154120674943947, + "grad_norm": 8.965953826904297, + "learning_rate": 4.532492470474212e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8441311463713645, + "num_tokens": 120839247.0, + "step": 100500 + }, + { + "entropy": 1.8147829815745353, + "epoch": 0.3115722058744892, + "grad_norm": 9.263935089111328, + "learning_rate": 4.532266987919687e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8509012356400489, + "num_tokens": 120852066.0, + "step": 100510 + }, + { + "entropy": 1.9309598222374915, + "epoch": 0.31160320499953886, + "grad_norm": 7.9816975593566895, + "learning_rate": 4.53204153901375e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8446980267763138, + "num_tokens": 120863149.0, + "step": 100520 + }, + { + "entropy": 1.8563687920570373, + "epoch": 0.3116342041245886, + "grad_norm": 8.456686973571777, + "learning_rate": 4.531816123748033e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8485777124762535, + "num_tokens": 120875622.0, + "step": 100530 + }, + { + "entropy": 1.9105036780238152, + "epoch": 0.31166520324963826, + "grad_norm": 8.747064590454102, + "learning_rate": 4.531590742114171e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8524237051606178, + "num_tokens": 120886828.0, + "step": 100540 + }, + { + "entropy": 1.9014281533658504, + "epoch": 0.311696202374688, + "grad_norm": 10.515109062194824, + "learning_rate": 4.531365394103802e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8461366832256317, + "num_tokens": 120898889.0, + "step": 100550 + }, + { + "entropy": 1.7405464336276055, + "epoch": 0.31172720149973765, + "grad_norm": 4.015621662139893, + "learning_rate": 4.531140079708566e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8626249134540558, + "num_tokens": 120913012.0, + "step": 100560 + }, + { + "entropy": 1.875110612809658, + "epoch": 0.3117582006247874, + "grad_norm": 9.846309661865234, + "learning_rate": 4.530914798920107e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8495632246136665, + "num_tokens": 120924428.0, + "step": 100570 + }, + { + "entropy": 1.7907123163342475, + "epoch": 0.31178919974983704, + "grad_norm": 8.938268661499023, + "learning_rate": 4.530689551730072e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8574684172868728, + "num_tokens": 120937451.0, + "step": 100580 + }, + { + "entropy": 1.9089242190122604, + "epoch": 0.31182019887488677, + "grad_norm": 10.029963493347168, + "learning_rate": 4.5304643381301094e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8454339489340782, + "num_tokens": 120948860.0, + "step": 100590 + }, + { + "entropy": 1.9048386812210083, + "epoch": 0.31185119799993644, + "grad_norm": 8.10409927368164, + "learning_rate": 4.530239158111872e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8487098172307015, + "num_tokens": 120960368.0, + "step": 100600 + }, + { + "entropy": 1.7758634328842162, + "epoch": 0.31188219712498616, + "grad_norm": 3.4512832164764404, + "learning_rate": 4.530014011667015e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8484735369682312, + "num_tokens": 120973621.0, + "step": 100610 + }, + { + "entropy": 1.9328245520591736, + "epoch": 0.31191319625003583, + "grad_norm": 9.077108383178711, + "learning_rate": 4.5297888987871956e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8441737592220306, + "num_tokens": 120984223.0, + "step": 100620 + }, + { + "entropy": 1.8256781995296478, + "epoch": 0.31194419537508555, + "grad_norm": 7.908056735992432, + "learning_rate": 4.529563819464075e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8539151340723038, + "num_tokens": 120997133.0, + "step": 100630 + }, + { + "entropy": 1.8427986733615398, + "epoch": 0.3119751945001352, + "grad_norm": 9.019706726074219, + "learning_rate": 4.529338773689319e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8452917739748955, + "num_tokens": 121009965.0, + "step": 100640 + }, + { + "entropy": 1.912809681892395, + "epoch": 0.31200619362518495, + "grad_norm": 8.718803405761719, + "learning_rate": 4.529113761454591e-06, + "loss": 0.5344, + "mean_token_accuracy": 0.8374342992901802, + "num_tokens": 121021538.0, + "step": 100650 + }, + { + "entropy": 1.8587751030921935, + "epoch": 0.3120371927502346, + "grad_norm": 8.5054931640625, + "learning_rate": 4.528888782751565e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8497788742184639, + "num_tokens": 121033842.0, + "step": 100660 + }, + { + "entropy": 1.8983284369111062, + "epoch": 0.31206819187528434, + "grad_norm": 8.823607444763184, + "learning_rate": 4.52866383757191e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8428234368562698, + "num_tokens": 121045774.0, + "step": 100670 + }, + { + "entropy": 1.9349606305360794, + "epoch": 0.312099191000334, + "grad_norm": 8.035542488098145, + "learning_rate": 4.528438925907303e-06, + "loss": 0.5277, + "mean_token_accuracy": 0.8406255826354027, + "num_tokens": 121057292.0, + "step": 100680 + }, + { + "entropy": 1.8452520370483398, + "epoch": 0.31213019012538373, + "grad_norm": 9.03587818145752, + "learning_rate": 4.528214047749422e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8514409214258194, + "num_tokens": 121070931.0, + "step": 100690 + }, + { + "entropy": 1.7869605004787446, + "epoch": 0.3121611892504334, + "grad_norm": 10.495682716369629, + "learning_rate": 4.5279892030899485e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8594531282782555, + "num_tokens": 121083578.0, + "step": 100700 + }, + { + "entropy": 1.8690698593854904, + "epoch": 0.3121921883754831, + "grad_norm": 8.482170104980469, + "learning_rate": 4.527764391920566e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8500708416104317, + "num_tokens": 121095247.0, + "step": 100710 + }, + { + "entropy": 1.9164055377244948, + "epoch": 0.3122231875005328, + "grad_norm": 8.222436904907227, + "learning_rate": 4.527539614232962e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8398829951882363, + "num_tokens": 121107205.0, + "step": 100720 + }, + { + "entropy": 1.8966273233294486, + "epoch": 0.3122541866255825, + "grad_norm": 10.007433891296387, + "learning_rate": 4.527314870018826e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8499047040939331, + "num_tokens": 121118189.0, + "step": 100730 + }, + { + "entropy": 1.9177552998065948, + "epoch": 0.3122851857506322, + "grad_norm": 9.334546089172363, + "learning_rate": 4.527090159269853e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8446200251579284, + "num_tokens": 121129883.0, + "step": 100740 + }, + { + "entropy": 1.951483702659607, + "epoch": 0.31231618487568186, + "grad_norm": 9.152246475219727, + "learning_rate": 4.5268654819777355e-06, + "loss": 0.5432, + "mean_token_accuracy": 0.8399815455079078, + "num_tokens": 121140945.0, + "step": 100750 + }, + { + "entropy": 1.843865168094635, + "epoch": 0.3123471840007316, + "grad_norm": 6.65300178527832, + "learning_rate": 4.5266408381341735e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.846330837905407, + "num_tokens": 121153921.0, + "step": 100760 + }, + { + "entropy": 1.9063566982746125, + "epoch": 0.31237818312578125, + "grad_norm": 7.834247589111328, + "learning_rate": 4.526416227730868e-06, + "loss": 0.549, + "mean_token_accuracy": 0.8455073088407516, + "num_tokens": 121165017.0, + "step": 100770 + }, + { + "entropy": 1.8952314227819442, + "epoch": 0.312409182250831, + "grad_norm": 8.009584426879883, + "learning_rate": 4.526191650759525e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.846976974606514, + "num_tokens": 121176827.0, + "step": 100780 + }, + { + "entropy": 1.9277762562036513, + "epoch": 0.31244018137588064, + "grad_norm": 8.662957191467285, + "learning_rate": 4.52596710721185e-06, + "loss": 0.5201, + "mean_token_accuracy": 0.8444670990109444, + "num_tokens": 121187950.0, + "step": 100790 + }, + { + "entropy": 1.943731963634491, + "epoch": 0.31247118050093037, + "grad_norm": 7.965757846832275, + "learning_rate": 4.525742597079554e-06, + "loss": 0.529, + "mean_token_accuracy": 0.8364956364035606, + "num_tokens": 121199219.0, + "step": 100800 + }, + { + "entropy": 1.9287627547979356, + "epoch": 0.31250217962598004, + "grad_norm": 6.632516384124756, + "learning_rate": 4.52551812035435e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8509150311350823, + "num_tokens": 121210268.0, + "step": 100810 + }, + { + "entropy": 1.9134871885180473, + "epoch": 0.31253317875102976, + "grad_norm": 4.123570919036865, + "learning_rate": 4.525293677027954e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8477164819836617, + "num_tokens": 121222196.0, + "step": 100820 + }, + { + "entropy": 1.916480553150177, + "epoch": 0.31256417787607943, + "grad_norm": 10.33231258392334, + "learning_rate": 4.525069267092083e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.8356222257018089, + "num_tokens": 121233821.0, + "step": 100830 + }, + { + "entropy": 1.8253281712532043, + "epoch": 0.31259517700112915, + "grad_norm": 8.607940673828125, + "learning_rate": 4.52484489053846e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8448203921318054, + "num_tokens": 121246155.0, + "step": 100840 + }, + { + "entropy": 1.9012310445308684, + "epoch": 0.3126261761261788, + "grad_norm": 7.7749247550964355, + "learning_rate": 4.524620547358811e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8430942222476006, + "num_tokens": 121258155.0, + "step": 100850 + }, + { + "entropy": 1.9564151272177697, + "epoch": 0.31265717525122855, + "grad_norm": 8.548589706420898, + "learning_rate": 4.52439623754486e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8461882799863816, + "num_tokens": 121269354.0, + "step": 100860 + }, + { + "entropy": 1.8889728412032127, + "epoch": 0.3126881743762782, + "grad_norm": 5.758060455322266, + "learning_rate": 4.524171961088339e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.857310351729393, + "num_tokens": 121281304.0, + "step": 100870 + }, + { + "entropy": 1.930538135766983, + "epoch": 0.31271917350132794, + "grad_norm": 8.554981231689453, + "learning_rate": 4.523947717980982e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8413084149360657, + "num_tokens": 121293664.0, + "step": 100880 + }, + { + "entropy": 1.950375673174858, + "epoch": 0.3127501726263776, + "grad_norm": 8.186065673828125, + "learning_rate": 4.5237235082145235e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.849441209435463, + "num_tokens": 121304745.0, + "step": 100890 + }, + { + "entropy": 1.9061787739396094, + "epoch": 0.31278117175142733, + "grad_norm": 3.3713457584381104, + "learning_rate": 4.523499331780703e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8370168238878251, + "num_tokens": 121316310.0, + "step": 100900 + }, + { + "entropy": 1.830712193250656, + "epoch": 0.312812170876477, + "grad_norm": 4.071151256561279, + "learning_rate": 4.5232751886712615e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8477034792304039, + "num_tokens": 121328915.0, + "step": 100910 + }, + { + "entropy": 1.9675110548734664, + "epoch": 0.3128431700015267, + "grad_norm": 8.07413101196289, + "learning_rate": 4.523051078877946e-06, + "loss": 0.5416, + "mean_token_accuracy": 0.8389467343688011, + "num_tokens": 121339986.0, + "step": 100920 + }, + { + "entropy": 1.8929841727018357, + "epoch": 0.3128741691265764, + "grad_norm": 8.73824691772461, + "learning_rate": 4.5228270023925e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8490571796894073, + "num_tokens": 121352037.0, + "step": 100930 + }, + { + "entropy": 1.8892315909266473, + "epoch": 0.3129051682516261, + "grad_norm": 4.492147922515869, + "learning_rate": 4.522602959206678e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8552650704979896, + "num_tokens": 121364144.0, + "step": 100940 + }, + { + "entropy": 1.923583671450615, + "epoch": 0.3129361673766758, + "grad_norm": 10.050043106079102, + "learning_rate": 4.52237894931223e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.844260835647583, + "num_tokens": 121375808.0, + "step": 100950 + }, + { + "entropy": 1.9433955758810044, + "epoch": 0.3129671665017255, + "grad_norm": 9.592854499816895, + "learning_rate": 4.522154972700912e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8481174424290657, + "num_tokens": 121387011.0, + "step": 100960 + }, + { + "entropy": 1.847121460735798, + "epoch": 0.3129981656267752, + "grad_norm": 8.493393898010254, + "learning_rate": 4.5219310293644856e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8667575031518936, + "num_tokens": 121399217.0, + "step": 100970 + }, + { + "entropy": 1.895681183040142, + "epoch": 0.3130291647518249, + "grad_norm": 8.458040237426758, + "learning_rate": 4.52170711929471e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8476063832640648, + "num_tokens": 121411380.0, + "step": 100980 + }, + { + "entropy": 1.8396486029028893, + "epoch": 0.3130601638768746, + "grad_norm": 8.87826156616211, + "learning_rate": 4.521483242483351e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8522458449006081, + "num_tokens": 121424542.0, + "step": 100990 + }, + { + "entropy": 1.8425541028380394, + "epoch": 0.31309116300192424, + "grad_norm": 8.997958183288574, + "learning_rate": 4.521259398922175e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8560503959655762, + "num_tokens": 121437235.0, + "step": 101000 + }, + { + "entropy": 1.857857908308506, + "epoch": 0.31312216212697397, + "grad_norm": 9.751863479614258, + "learning_rate": 4.521035588602953e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8419872790575027, + "num_tokens": 121449522.0, + "step": 101010 + }, + { + "entropy": 1.858136995136738, + "epoch": 0.31315316125202364, + "grad_norm": 3.528123617172241, + "learning_rate": 4.520811811517458e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8612496837973594, + "num_tokens": 121462199.0, + "step": 101020 + }, + { + "entropy": 1.907412128150463, + "epoch": 0.31318416037707336, + "grad_norm": 6.3340744972229, + "learning_rate": 4.520588067657467e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8482323855161666, + "num_tokens": 121474247.0, + "step": 101030 + }, + { + "entropy": 1.8920990169048308, + "epoch": 0.31321515950212303, + "grad_norm": 7.975953578948975, + "learning_rate": 4.520364357014758e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8457095667719841, + "num_tokens": 121486893.0, + "step": 101040 + }, + { + "entropy": 1.8772882983088492, + "epoch": 0.31324615862717275, + "grad_norm": 8.792813301086426, + "learning_rate": 4.520140679581111e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8604818403720855, + "num_tokens": 121499181.0, + "step": 101050 + }, + { + "entropy": 1.8207586660981179, + "epoch": 0.3132771577522224, + "grad_norm": 7.4785590171813965, + "learning_rate": 4.519917035348314e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8638281837105751, + "num_tokens": 121511510.0, + "step": 101060 + }, + { + "entropy": 1.847452338039875, + "epoch": 0.31330815687727215, + "grad_norm": 4.0800886154174805, + "learning_rate": 4.519693424308152e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8405447885394096, + "num_tokens": 121524110.0, + "step": 101070 + }, + { + "entropy": 1.9054955273866654, + "epoch": 0.3133391560023218, + "grad_norm": 5.289824485778809, + "learning_rate": 4.519469846452415e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8457769706845284, + "num_tokens": 121535699.0, + "step": 101080 + }, + { + "entropy": 1.915067094564438, + "epoch": 0.31337015512737154, + "grad_norm": 8.302640914916992, + "learning_rate": 4.519246301772896e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.8345979511737823, + "num_tokens": 121547506.0, + "step": 101090 + }, + { + "entropy": 1.8765003278851509, + "epoch": 0.3134011542524212, + "grad_norm": 9.370650291442871, + "learning_rate": 4.519022790261393e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8564626336097717, + "num_tokens": 121560190.0, + "step": 101100 + }, + { + "entropy": 1.6999532222747802, + "epoch": 0.31343215337747093, + "grad_norm": 3.211577892303467, + "learning_rate": 4.5187993119097045e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8762291237711907, + "num_tokens": 121574646.0, + "step": 101110 + }, + { + "entropy": 1.8197277091443538, + "epoch": 0.3134631525025206, + "grad_norm": 7.400459289550781, + "learning_rate": 4.5185758667096295e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8629156649112701, + "num_tokens": 121588419.0, + "step": 101120 + }, + { + "entropy": 1.9415106505155564, + "epoch": 0.3134941516275703, + "grad_norm": 9.660387992858887, + "learning_rate": 4.518352454652974e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.8417216569185257, + "num_tokens": 121599422.0, + "step": 101130 + }, + { + "entropy": 1.9480364575982094, + "epoch": 0.31352515075262, + "grad_norm": 9.710609436035156, + "learning_rate": 4.518129075731546e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8369411841034889, + "num_tokens": 121610421.0, + "step": 101140 + }, + { + "entropy": 1.9201453104615211, + "epoch": 0.3135561498776697, + "grad_norm": 3.905876874923706, + "learning_rate": 4.517905729937153e-06, + "loss": 0.5488, + "mean_token_accuracy": 0.8450462609529495, + "num_tokens": 121622409.0, + "step": 101150 + }, + { + "entropy": 1.9243328019976615, + "epoch": 0.3135871490027194, + "grad_norm": 8.177422523498535, + "learning_rate": 4.517682417261611e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8475325971841812, + "num_tokens": 121634012.0, + "step": 101160 + }, + { + "entropy": 1.8541806608438491, + "epoch": 0.3136181481277691, + "grad_norm": 9.121359825134277, + "learning_rate": 4.517459137696734e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8584785044193268, + "num_tokens": 121646345.0, + "step": 101170 + }, + { + "entropy": 1.8980771958827973, + "epoch": 0.3136491472528188, + "grad_norm": 8.328401565551758, + "learning_rate": 4.517235891234341e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8456721261143685, + "num_tokens": 121657741.0, + "step": 101180 + }, + { + "entropy": 1.8372810363769532, + "epoch": 0.3136801463778685, + "grad_norm": 9.05524730682373, + "learning_rate": 4.517012677866254e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.857391194999218, + "num_tokens": 121670981.0, + "step": 101190 + }, + { + "entropy": 1.8419764667749405, + "epoch": 0.3137111455029182, + "grad_norm": 3.9297235012054443, + "learning_rate": 4.516789497584297e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8536780118942261, + "num_tokens": 121682826.0, + "step": 101200 + }, + { + "entropy": 1.8321501910686493, + "epoch": 0.3137421446279679, + "grad_norm": 4.388725757598877, + "learning_rate": 4.516566350380297e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.852819861471653, + "num_tokens": 121695953.0, + "step": 101210 + }, + { + "entropy": 1.8966490373015403, + "epoch": 0.31377314375301757, + "grad_norm": 4.4960737228393555, + "learning_rate": 4.5163432362460825e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8485043302178383, + "num_tokens": 121707846.0, + "step": 101220 + }, + { + "entropy": 1.8889821529388429, + "epoch": 0.3138041428780673, + "grad_norm": 9.122937202453613, + "learning_rate": 4.516120155173487e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8423930436372757, + "num_tokens": 121719869.0, + "step": 101230 + }, + { + "entropy": 1.832286487519741, + "epoch": 0.31383514200311696, + "grad_norm": 3.832523822784424, + "learning_rate": 4.515897107154348e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8503172323107719, + "num_tokens": 121732133.0, + "step": 101240 + }, + { + "entropy": 1.8415155798196792, + "epoch": 0.31386614112816663, + "grad_norm": 3.9735727310180664, + "learning_rate": 4.515674092180501e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.848311547935009, + "num_tokens": 121744373.0, + "step": 101250 + }, + { + "entropy": 1.9100284710526467, + "epoch": 0.31389714025321636, + "grad_norm": 8.413751602172852, + "learning_rate": 4.51545111024379e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8440921515226364, + "num_tokens": 121755112.0, + "step": 101260 + }, + { + "entropy": 1.8759055025875568, + "epoch": 0.313928139378266, + "grad_norm": 9.331445693969727, + "learning_rate": 4.515228161336056e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8457464009523392, + "num_tokens": 121767583.0, + "step": 101270 + }, + { + "entropy": 1.8825054869055748, + "epoch": 0.31395913850331575, + "grad_norm": 3.7135350704193115, + "learning_rate": 4.515005245449148e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8491608336567879, + "num_tokens": 121779121.0, + "step": 101280 + }, + { + "entropy": 1.8358449339866638, + "epoch": 0.3139901376283654, + "grad_norm": 8.02281379699707, + "learning_rate": 4.514782362574916e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8574068158864975, + "num_tokens": 121791200.0, + "step": 101290 + }, + { + "entropy": 1.7923202894628047, + "epoch": 0.31402113675341514, + "grad_norm": 8.330782890319824, + "learning_rate": 4.514559512705209e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8662643000483513, + "num_tokens": 121804297.0, + "step": 101300 + }, + { + "entropy": 1.9169503509998322, + "epoch": 0.3140521358784648, + "grad_norm": 8.749537467956543, + "learning_rate": 4.514336695831886e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.8455056488513947, + "num_tokens": 121815612.0, + "step": 101310 + }, + { + "entropy": 1.7872198984026908, + "epoch": 0.31408313500351454, + "grad_norm": 7.533987522125244, + "learning_rate": 4.514113911946806e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.858782111108303, + "num_tokens": 121828475.0, + "step": 101320 + }, + { + "entropy": 1.9148533344268799, + "epoch": 0.3141141341285642, + "grad_norm": 8.236104011535645, + "learning_rate": 4.5138911610418245e-06, + "loss": 0.5253, + "mean_token_accuracy": 0.8438421174883842, + "num_tokens": 121838831.0, + "step": 101330 + }, + { + "entropy": 1.9065877795219421, + "epoch": 0.31414513325361393, + "grad_norm": 8.377083778381348, + "learning_rate": 4.51366844310881e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8459710478782654, + "num_tokens": 121850367.0, + "step": 101340 + }, + { + "entropy": 1.9168651878833771, + "epoch": 0.3141761323786636, + "grad_norm": 8.766077995300293, + "learning_rate": 4.513445758139627e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8526653304696084, + "num_tokens": 121860852.0, + "step": 101350 + }, + { + "entropy": 1.8872756123542787, + "epoch": 0.3142071315037133, + "grad_norm": 8.51346492767334, + "learning_rate": 4.513223106126145e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8485476955771446, + "num_tokens": 121871508.0, + "step": 101360 + }, + { + "entropy": 1.8321407943964005, + "epoch": 0.314238130628763, + "grad_norm": 7.802825450897217, + "learning_rate": 4.513000487060237e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8585144847631454, + "num_tokens": 121883191.0, + "step": 101370 + }, + { + "entropy": 1.8902756407856942, + "epoch": 0.3142691297538127, + "grad_norm": 8.383158683776855, + "learning_rate": 4.5127779009337785e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.840720933675766, + "num_tokens": 121894503.0, + "step": 101380 + }, + { + "entropy": 1.9538305789232253, + "epoch": 0.3143001288788624, + "grad_norm": 10.174178123474121, + "learning_rate": 4.5125553477386455e-06, + "loss": 0.5445, + "mean_token_accuracy": 0.8378095537424087, + "num_tokens": 121905568.0, + "step": 101390 + }, + { + "entropy": 1.8209902182221414, + "epoch": 0.3143311280039121, + "grad_norm": 9.949512481689453, + "learning_rate": 4.512332827466718e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8366820439696312, + "num_tokens": 121918020.0, + "step": 101400 + }, + { + "entropy": 1.8949487626552581, + "epoch": 0.3143621271289618, + "grad_norm": 7.5901360511779785, + "learning_rate": 4.5121103401098816e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8482216715812683, + "num_tokens": 121929522.0, + "step": 101410 + }, + { + "entropy": 1.805781890451908, + "epoch": 0.3143931262540115, + "grad_norm": 7.198166370391846, + "learning_rate": 4.5118878856600216e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8508959487080574, + "num_tokens": 121941861.0, + "step": 101420 + }, + { + "entropy": 1.9467833280563354, + "epoch": 0.31442412537906117, + "grad_norm": 7.574606895446777, + "learning_rate": 4.511665464109026e-06, + "loss": 0.531, + "mean_token_accuracy": 0.8413808315992355, + "num_tokens": 121952629.0, + "step": 101430 + }, + { + "entropy": 1.8594104155898095, + "epoch": 0.3144551245041109, + "grad_norm": 8.646973609924316, + "learning_rate": 4.511443075448789e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8487472698092461, + "num_tokens": 121964076.0, + "step": 101440 + }, + { + "entropy": 1.8867372497916222, + "epoch": 0.31448612362916056, + "grad_norm": 9.636038780212402, + "learning_rate": 4.511220719671201e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8549345463514328, + "num_tokens": 121975596.0, + "step": 101450 + }, + { + "entropy": 1.8463930860161781, + "epoch": 0.3145171227542103, + "grad_norm": 4.379640579223633, + "learning_rate": 4.510998396768163e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8542065605521202, + "num_tokens": 121988727.0, + "step": 101460 + }, + { + "entropy": 1.7745927199721336, + "epoch": 0.31454812187925996, + "grad_norm": 8.495243072509766, + "learning_rate": 4.510776106731574e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8536753922700882, + "num_tokens": 122001880.0, + "step": 101470 + }, + { + "entropy": 1.857958073914051, + "epoch": 0.3145791210043097, + "grad_norm": 8.16500473022461, + "learning_rate": 4.510553849553338e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8449077561497689, + "num_tokens": 122013484.0, + "step": 101480 + }, + { + "entropy": 1.894733229279518, + "epoch": 0.31461012012935935, + "grad_norm": 8.241775512695312, + "learning_rate": 4.5103316252253596e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8418209850788116, + "num_tokens": 122024930.0, + "step": 101490 + }, + { + "entropy": 1.8933887377381324, + "epoch": 0.314641119254409, + "grad_norm": 6.502079963684082, + "learning_rate": 4.510109433739546e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.8463633015751839, + "num_tokens": 122036290.0, + "step": 101500 + }, + { + "entropy": 1.7902062192559243, + "epoch": 0.31467211837945874, + "grad_norm": 7.463756084442139, + "learning_rate": 4.5098872750878105e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8511275082826615, + "num_tokens": 122048789.0, + "step": 101510 + }, + { + "entropy": 1.8736846506595612, + "epoch": 0.3147031175045084, + "grad_norm": 8.760028839111328, + "learning_rate": 4.509665149262067e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.8344025030732155, + "num_tokens": 122060583.0, + "step": 101520 + }, + { + "entropy": 1.863266195356846, + "epoch": 0.31473411662955814, + "grad_norm": 8.545906066894531, + "learning_rate": 4.509443056254233e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8548782065510749, + "num_tokens": 122072828.0, + "step": 101530 + }, + { + "entropy": 1.8797732189297676, + "epoch": 0.3147651157546078, + "grad_norm": 7.313852787017822, + "learning_rate": 4.509220996056225e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8434521064162255, + "num_tokens": 122084579.0, + "step": 101540 + }, + { + "entropy": 1.823886838555336, + "epoch": 0.31479611487965753, + "grad_norm": 5.079513072967529, + "learning_rate": 4.508998968659968e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8580957293510437, + "num_tokens": 122097466.0, + "step": 101550 + }, + { + "entropy": 1.8132152885198594, + "epoch": 0.3148271140047072, + "grad_norm": 10.782577514648438, + "learning_rate": 4.508776974057388e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.838719479739666, + "num_tokens": 122109870.0, + "step": 101560 + }, + { + "entropy": 1.8953714028000832, + "epoch": 0.3148581131297569, + "grad_norm": 8.248565673828125, + "learning_rate": 4.508555012240411e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.8457905784249306, + "num_tokens": 122121740.0, + "step": 101570 + }, + { + "entropy": 1.8286928310990334, + "epoch": 0.3148891122548066, + "grad_norm": 9.213640213012695, + "learning_rate": 4.50833308320097e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8573044881224632, + "num_tokens": 122133522.0, + "step": 101580 + }, + { + "entropy": 1.8590153113007546, + "epoch": 0.3149201113798563, + "grad_norm": 5.629488945007324, + "learning_rate": 4.508111186930996e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8378587946295738, + "num_tokens": 122145368.0, + "step": 101590 + }, + { + "entropy": 1.8506905019283295, + "epoch": 0.314951110504906, + "grad_norm": 7.348735332489014, + "learning_rate": 4.507889323422427e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8469028800725937, + "num_tokens": 122157779.0, + "step": 101600 + }, + { + "entropy": 1.8291833832859994, + "epoch": 0.3149821096299557, + "grad_norm": 2.6711909770965576, + "learning_rate": 4.507667492667202e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8578299075365067, + "num_tokens": 122170157.0, + "step": 101610 + }, + { + "entropy": 1.9278810530900956, + "epoch": 0.3150131087550054, + "grad_norm": 6.753495693206787, + "learning_rate": 4.507445694657263e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.8417115181684494, + "num_tokens": 122181568.0, + "step": 101620 + }, + { + "entropy": 1.8500063866376877, + "epoch": 0.3150441078800551, + "grad_norm": 9.120920181274414, + "learning_rate": 4.507223929384555e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8450711652636528, + "num_tokens": 122193790.0, + "step": 101630 + }, + { + "entropy": 1.8878964528441429, + "epoch": 0.31507510700510477, + "grad_norm": 7.948580265045166, + "learning_rate": 4.507002196841023e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8558190748095512, + "num_tokens": 122205835.0, + "step": 101640 + }, + { + "entropy": 1.870565366744995, + "epoch": 0.3151061061301545, + "grad_norm": 9.229273796081543, + "learning_rate": 4.506780497018622e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8505363553762436, + "num_tokens": 122217561.0, + "step": 101650 + }, + { + "entropy": 1.8509984895586968, + "epoch": 0.31513710525520416, + "grad_norm": 9.244194984436035, + "learning_rate": 4.506558829909301e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8476543664932251, + "num_tokens": 122230248.0, + "step": 101660 + }, + { + "entropy": 1.7851860001683235, + "epoch": 0.3151681043802539, + "grad_norm": 8.21959114074707, + "learning_rate": 4.506337195505015e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8607925057411194, + "num_tokens": 122243301.0, + "step": 101670 + }, + { + "entropy": 1.843540082871914, + "epoch": 0.31519910350530356, + "grad_norm": 8.222967147827148, + "learning_rate": 4.506115593797727e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8591127768158913, + "num_tokens": 122255771.0, + "step": 101680 + }, + { + "entropy": 1.8111511290073394, + "epoch": 0.3152301026303533, + "grad_norm": 4.0410075187683105, + "learning_rate": 4.5058940247793955e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8595302075147628, + "num_tokens": 122268473.0, + "step": 101690 + }, + { + "entropy": 1.849461354315281, + "epoch": 0.31526110175540295, + "grad_norm": 7.890182971954346, + "learning_rate": 4.505672488441985e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8466949865221978, + "num_tokens": 122280554.0, + "step": 101700 + }, + { + "entropy": 1.8682439640164374, + "epoch": 0.3152921008804527, + "grad_norm": 3.920172691345215, + "learning_rate": 4.505450984777461e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8509997203946114, + "num_tokens": 122293129.0, + "step": 101710 + }, + { + "entropy": 1.8806020855903625, + "epoch": 0.31532310000550234, + "grad_norm": 9.121551513671875, + "learning_rate": 4.505229513777795e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8400007635354996, + "num_tokens": 122304732.0, + "step": 101720 + }, + { + "entropy": 1.902934204041958, + "epoch": 0.315354099130552, + "grad_norm": 8.14372730255127, + "learning_rate": 4.50500807543496e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8440614491701126, + "num_tokens": 122315872.0, + "step": 101730 + }, + { + "entropy": 1.8729471534490585, + "epoch": 0.31538509825560174, + "grad_norm": 7.441555023193359, + "learning_rate": 4.50478666974093e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8485934257507324, + "num_tokens": 122327035.0, + "step": 101740 + }, + { + "entropy": 1.9011955708265305, + "epoch": 0.3154160973806514, + "grad_norm": 8.446456909179688, + "learning_rate": 4.50456529668768e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8409095421433449, + "num_tokens": 122338445.0, + "step": 101750 + }, + { + "entropy": 1.7948980048298835, + "epoch": 0.31544709650570113, + "grad_norm": 10.517394065856934, + "learning_rate": 4.5043439562671966e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8519067704677582, + "num_tokens": 122351396.0, + "step": 101760 + }, + { + "entropy": 1.7647177547216415, + "epoch": 0.3154780956307508, + "grad_norm": 7.822119235992432, + "learning_rate": 4.504122648471458e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8645659565925599, + "num_tokens": 122364593.0, + "step": 101770 + }, + { + "entropy": 1.912806712090969, + "epoch": 0.3155090947558005, + "grad_norm": 8.647019386291504, + "learning_rate": 4.503901373292454e-06, + "loss": 0.5724, + "mean_token_accuracy": 0.8309219375252723, + "num_tokens": 122376209.0, + "step": 101780 + }, + { + "entropy": 1.8549558535218238, + "epoch": 0.3155400938808502, + "grad_norm": 7.488748073577881, + "learning_rate": 4.503680130722171e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8498001918196678, + "num_tokens": 122388129.0, + "step": 101790 + }, + { + "entropy": 1.9467090874910356, + "epoch": 0.3155710930058999, + "grad_norm": 7.977104663848877, + "learning_rate": 4.5034589207526026e-06, + "loss": 0.5692, + "mean_token_accuracy": 0.8243438869714736, + "num_tokens": 122399489.0, + "step": 101800 + }, + { + "entropy": 1.8753120481967926, + "epoch": 0.3156020921309496, + "grad_norm": 7.6365275382995605, + "learning_rate": 4.503237743375743e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.8401636779308319, + "num_tokens": 122411610.0, + "step": 101810 + }, + { + "entropy": 1.8790741473436356, + "epoch": 0.3156330912559993, + "grad_norm": 9.488393783569336, + "learning_rate": 4.503016598583588e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8573433116078377, + "num_tokens": 122423284.0, + "step": 101820 + }, + { + "entropy": 1.9299686640501021, + "epoch": 0.315664090381049, + "grad_norm": 7.945889472961426, + "learning_rate": 4.502795486368138e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.8483487293124199, + "num_tokens": 122434547.0, + "step": 101830 + }, + { + "entropy": 1.778617675602436, + "epoch": 0.3156950895060987, + "grad_norm": 9.670980453491211, + "learning_rate": 4.502574406721396e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8523766458034515, + "num_tokens": 122447628.0, + "step": 101840 + }, + { + "entropy": 1.7951662212610244, + "epoch": 0.31572608863114837, + "grad_norm": 8.881525039672852, + "learning_rate": 4.502353359635368e-06, + "loss": 0.385, + "mean_token_accuracy": 0.861037427186966, + "num_tokens": 122460464.0, + "step": 101850 + }, + { + "entropy": 1.8687596887350082, + "epoch": 0.3157570877561981, + "grad_norm": 7.735236644744873, + "learning_rate": 4.502132345102062e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.853579594194889, + "num_tokens": 122471873.0, + "step": 101860 + }, + { + "entropy": 1.8538660779595375, + "epoch": 0.31578808688124776, + "grad_norm": 9.091965675354004, + "learning_rate": 4.501911363113488e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8424102887511253, + "num_tokens": 122483758.0, + "step": 101870 + }, + { + "entropy": 1.867423267662525, + "epoch": 0.3158190860062975, + "grad_norm": 7.476759433746338, + "learning_rate": 4.50169041366166e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.855827559530735, + "num_tokens": 122495774.0, + "step": 101880 + }, + { + "entropy": 1.9684102550148963, + "epoch": 0.31585008513134716, + "grad_norm": 9.030182838439941, + "learning_rate": 4.501469496738595e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8357870906591416, + "num_tokens": 122507503.0, + "step": 101890 + }, + { + "entropy": 1.8453612983226777, + "epoch": 0.3158810842563969, + "grad_norm": 3.91621732711792, + "learning_rate": 4.501248612336311e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8493709444999695, + "num_tokens": 122519916.0, + "step": 101900 + }, + { + "entropy": 1.8780709460377694, + "epoch": 0.31591208338144655, + "grad_norm": 4.055104732513428, + "learning_rate": 4.501027760446832e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8418286308646202, + "num_tokens": 122532278.0, + "step": 101910 + }, + { + "entropy": 1.8270500496029853, + "epoch": 0.3159430825064963, + "grad_norm": 8.200374603271484, + "learning_rate": 4.500806941062181e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8556545317173004, + "num_tokens": 122544680.0, + "step": 101920 + }, + { + "entropy": 1.8527694910764694, + "epoch": 0.31597408163154594, + "grad_norm": 7.4447526931762695, + "learning_rate": 4.500586154174386e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8587725609540939, + "num_tokens": 122557057.0, + "step": 101930 + }, + { + "entropy": 1.8287393301725388, + "epoch": 0.31600508075659567, + "grad_norm": 8.269747734069824, + "learning_rate": 4.500365399775477e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8537361025810242, + "num_tokens": 122570020.0, + "step": 101940 + }, + { + "entropy": 1.9338515534996987, + "epoch": 0.31603607988164534, + "grad_norm": 4.0301194190979, + "learning_rate": 4.500144677857487e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8461434602737427, + "num_tokens": 122580842.0, + "step": 101950 + }, + { + "entropy": 1.8989423006772994, + "epoch": 0.31606707900669506, + "grad_norm": 9.705557823181152, + "learning_rate": 4.499923988412451e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.844089737534523, + "num_tokens": 122592807.0, + "step": 101960 + }, + { + "entropy": 1.7944003835320472, + "epoch": 0.31609807813174473, + "grad_norm": 8.264009475708008, + "learning_rate": 4.4997033314324076e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.853903503715992, + "num_tokens": 122606158.0, + "step": 101970 + }, + { + "entropy": 1.9085728481411934, + "epoch": 0.3161290772567944, + "grad_norm": 4.879970073699951, + "learning_rate": 4.499482706909398e-06, + "loss": 0.5186, + "mean_token_accuracy": 0.8358641535043716, + "num_tokens": 122618189.0, + "step": 101980 + }, + { + "entropy": 1.9110256865620614, + "epoch": 0.3161600763818441, + "grad_norm": 11.392416954040527, + "learning_rate": 4.4992621148354666e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8411575809121132, + "num_tokens": 122630458.0, + "step": 101990 + }, + { + "entropy": 1.8680866047739983, + "epoch": 0.3161910755068938, + "grad_norm": 8.535940170288086, + "learning_rate": 4.499041555202658e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8537841528654099, + "num_tokens": 122642497.0, + "step": 102000 + }, + { + "entropy": 1.9318107053637505, + "epoch": 0.3162220746319435, + "grad_norm": 7.174762725830078, + "learning_rate": 4.498821028003023e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8535374835133552, + "num_tokens": 122654391.0, + "step": 102010 + }, + { + "entropy": 1.8076833948493003, + "epoch": 0.3162530737569932, + "grad_norm": 8.442280769348145, + "learning_rate": 4.498600533228614e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8697671309113503, + "num_tokens": 122667692.0, + "step": 102020 + }, + { + "entropy": 1.9444027364253997, + "epoch": 0.3162840728820429, + "grad_norm": 8.13665771484375, + "learning_rate": 4.498380070871485e-06, + "loss": 0.5283, + "mean_token_accuracy": 0.8343800812959671, + "num_tokens": 122679172.0, + "step": 102030 + }, + { + "entropy": 1.8926381140947341, + "epoch": 0.3163150720070926, + "grad_norm": 4.520792484283447, + "learning_rate": 4.498159640923693e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8546594932675362, + "num_tokens": 122691105.0, + "step": 102040 + }, + { + "entropy": 1.9276057749986648, + "epoch": 0.3163460711321423, + "grad_norm": 9.296843528747559, + "learning_rate": 4.497939243377298e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8524391040205955, + "num_tokens": 122702895.0, + "step": 102050 + }, + { + "entropy": 1.9363893195986748, + "epoch": 0.316377070257192, + "grad_norm": 6.961995601654053, + "learning_rate": 4.497718878224365e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.8385184094309807, + "num_tokens": 122714385.0, + "step": 102060 + }, + { + "entropy": 1.9360941782593728, + "epoch": 0.3164080693822417, + "grad_norm": 8.984434127807617, + "learning_rate": 4.497498545456957e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8492272660136223, + "num_tokens": 122725974.0, + "step": 102070 + }, + { + "entropy": 1.88591488301754, + "epoch": 0.31643906850729137, + "grad_norm": 9.188456535339355, + "learning_rate": 4.497278245067143e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8486263528466225, + "num_tokens": 122737041.0, + "step": 102080 + }, + { + "entropy": 1.9170858517289162, + "epoch": 0.3164700676323411, + "grad_norm": 8.99303913116455, + "learning_rate": 4.497057977046996e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8467420935630798, + "num_tokens": 122748309.0, + "step": 102090 + }, + { + "entropy": 1.950069211423397, + "epoch": 0.31650106675739076, + "grad_norm": 8.753754615783691, + "learning_rate": 4.4968377413885885e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.849167463183403, + "num_tokens": 122759466.0, + "step": 102100 + }, + { + "entropy": 1.9460075750947, + "epoch": 0.3165320658824405, + "grad_norm": 8.198152542114258, + "learning_rate": 4.496617538083995e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.847400327026844, + "num_tokens": 122770569.0, + "step": 102110 + }, + { + "entropy": 1.9162710309028625, + "epoch": 0.31656306500749015, + "grad_norm": 8.986804008483887, + "learning_rate": 4.496397367125297e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8529955089092255, + "num_tokens": 122782469.0, + "step": 102120 + }, + { + "entropy": 1.9209685817360878, + "epoch": 0.3165940641325399, + "grad_norm": 9.081794738769531, + "learning_rate": 4.496177228504574e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8467201054096222, + "num_tokens": 122794654.0, + "step": 102130 + }, + { + "entropy": 1.976951664686203, + "epoch": 0.31662506325758955, + "grad_norm": 7.217496395111084, + "learning_rate": 4.495957122213915e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.852759413421154, + "num_tokens": 122805399.0, + "step": 102140 + }, + { + "entropy": 1.7019195973873138, + "epoch": 0.31665606238263927, + "grad_norm": 8.643301963806152, + "learning_rate": 4.495737048245404e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8538576439023018, + "num_tokens": 122819420.0, + "step": 102150 + }, + { + "entropy": 1.8965346455574035, + "epoch": 0.31668706150768894, + "grad_norm": 8.723462104797363, + "learning_rate": 4.495517006591132e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8469094544649124, + "num_tokens": 122831159.0, + "step": 102160 + }, + { + "entropy": 1.8087441340088843, + "epoch": 0.31671806063273866, + "grad_norm": 7.827014923095703, + "learning_rate": 4.495296997243191e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8492914751172066, + "num_tokens": 122844637.0, + "step": 102170 + }, + { + "entropy": 1.8186364084482194, + "epoch": 0.31674905975778833, + "grad_norm": 8.309804916381836, + "learning_rate": 4.495077020193676e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8481403559446334, + "num_tokens": 122857719.0, + "step": 102180 + }, + { + "entropy": 1.924719001352787, + "epoch": 0.31678005888283806, + "grad_norm": 8.645857810974121, + "learning_rate": 4.494857075434688e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8571926310658455, + "num_tokens": 122869193.0, + "step": 102190 + }, + { + "entropy": 1.9453157484531403, + "epoch": 0.3168110580078877, + "grad_norm": 8.812455177307129, + "learning_rate": 4.494637162958325e-06, + "loss": 0.5329, + "mean_token_accuracy": 0.8361015647649765, + "num_tokens": 122879724.0, + "step": 102200 + }, + { + "entropy": 1.8693825080990791, + "epoch": 0.31684205713293745, + "grad_norm": 10.073441505432129, + "learning_rate": 4.494417282756691e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8441578835248947, + "num_tokens": 122892200.0, + "step": 102210 + }, + { + "entropy": 1.8365435376763344, + "epoch": 0.3168730562579871, + "grad_norm": 3.941197633743286, + "learning_rate": 4.494197434821895e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8510231733322143, + "num_tokens": 122904637.0, + "step": 102220 + }, + { + "entropy": 1.8284949362277985, + "epoch": 0.3169040553830368, + "grad_norm": 7.839810371398926, + "learning_rate": 4.493977619146042e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8462954074144363, + "num_tokens": 122917337.0, + "step": 102230 + }, + { + "entropy": 1.8114649429917336, + "epoch": 0.3169350545080865, + "grad_norm": 7.352813243865967, + "learning_rate": 4.493757835721245e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8442055761814118, + "num_tokens": 122930610.0, + "step": 102240 + }, + { + "entropy": 1.9509634003043175, + "epoch": 0.3169660536331362, + "grad_norm": 8.08341121673584, + "learning_rate": 4.49353808453962e-06, + "loss": 0.5304, + "mean_token_accuracy": 0.8404506236314774, + "num_tokens": 122941825.0, + "step": 102250 + }, + { + "entropy": 1.9052265360951424, + "epoch": 0.3169970527581859, + "grad_norm": 8.447335243225098, + "learning_rate": 4.4933183655932825e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8546632289886474, + "num_tokens": 122953086.0, + "step": 102260 + }, + { + "entropy": 1.8697161242365836, + "epoch": 0.3170280518832356, + "grad_norm": 7.899079322814941, + "learning_rate": 4.493098678874353e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8598993286490441, + "num_tokens": 122965302.0, + "step": 102270 + }, + { + "entropy": 1.9763395830988884, + "epoch": 0.3170590510082853, + "grad_norm": 7.561324119567871, + "learning_rate": 4.4928790243749535e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8493476256728172, + "num_tokens": 122976430.0, + "step": 102280 + }, + { + "entropy": 1.9137044921517372, + "epoch": 0.31709005013333497, + "grad_norm": 8.271001815795898, + "learning_rate": 4.49265940208721e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8426155328750611, + "num_tokens": 122987244.0, + "step": 102290 + }, + { + "entropy": 1.8825251743197442, + "epoch": 0.3171210492583847, + "grad_norm": 9.43757152557373, + "learning_rate": 4.4924398120032505e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.851975978910923, + "num_tokens": 122998662.0, + "step": 102300 + }, + { + "entropy": 1.8874466240406036, + "epoch": 0.31715204838343436, + "grad_norm": 8.041158676147461, + "learning_rate": 4.492220254115204e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8433241844177246, + "num_tokens": 123010762.0, + "step": 102310 + }, + { + "entropy": 1.9816171437501908, + "epoch": 0.3171830475084841, + "grad_norm": 9.10623836517334, + "learning_rate": 4.492000728415204e-06, + "loss": 0.5231, + "mean_token_accuracy": 0.8387098371982574, + "num_tokens": 123021337.0, + "step": 102320 + }, + { + "entropy": 1.9616368293762207, + "epoch": 0.31721404663353375, + "grad_norm": 9.265618324279785, + "learning_rate": 4.491781234895389e-06, + "loss": 0.5216, + "mean_token_accuracy": 0.8416030526161193, + "num_tokens": 123032883.0, + "step": 102330 + }, + { + "entropy": 1.8963989228010179, + "epoch": 0.3172450457585835, + "grad_norm": 8.671114921569824, + "learning_rate": 4.4915617735478936e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8574667409062385, + "num_tokens": 123044915.0, + "step": 102340 + }, + { + "entropy": 1.8731760695576667, + "epoch": 0.31727604488363315, + "grad_norm": 7.5588297843933105, + "learning_rate": 4.491342344364861e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8465674906969071, + "num_tokens": 123056529.0, + "step": 102350 + }, + { + "entropy": 1.9712413370609283, + "epoch": 0.31730704400868287, + "grad_norm": 8.025903701782227, + "learning_rate": 4.491122947338437e-06, + "loss": 0.5524, + "mean_token_accuracy": 0.8367073446512222, + "num_tokens": 123067329.0, + "step": 102360 + }, + { + "entropy": 1.849116511642933, + "epoch": 0.31733804313373254, + "grad_norm": 7.625123023986816, + "learning_rate": 4.490903582460766e-06, + "loss": 0.419, + "mean_token_accuracy": 0.847317686676979, + "num_tokens": 123080562.0, + "step": 102370 + }, + { + "entropy": 1.980668443441391, + "epoch": 0.31736904225878226, + "grad_norm": 7.4852705001831055, + "learning_rate": 4.490684249724e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8438632428646088, + "num_tokens": 123092615.0, + "step": 102380 + }, + { + "entropy": 1.9390667513012887, + "epoch": 0.31740004138383193, + "grad_norm": 8.869697570800781, + "learning_rate": 4.4904649491202866e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8526061177253723, + "num_tokens": 123104152.0, + "step": 102390 + }, + { + "entropy": 1.866631529480219, + "epoch": 0.31743104050888166, + "grad_norm": 7.319535255432129, + "learning_rate": 4.490245680641784e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8547472521662712, + "num_tokens": 123116432.0, + "step": 102400 + }, + { + "entropy": 1.9240225657820702, + "epoch": 0.3174620396339313, + "grad_norm": 7.42906379699707, + "learning_rate": 4.490026444280649e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8577197745442391, + "num_tokens": 123128281.0, + "step": 102410 + }, + { + "entropy": 1.8641801089048387, + "epoch": 0.31749303875898105, + "grad_norm": 8.197351455688477, + "learning_rate": 4.48980724002904e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8553032830357552, + "num_tokens": 123140851.0, + "step": 102420 + }, + { + "entropy": 1.8485140323638916, + "epoch": 0.3175240378840307, + "grad_norm": 11.487910270690918, + "learning_rate": 4.489588067879123e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8603411689400673, + "num_tokens": 123153326.0, + "step": 102430 + }, + { + "entropy": 1.93117448836565, + "epoch": 0.31755503700908044, + "grad_norm": 9.506771087646484, + "learning_rate": 4.489368927823061e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8498334422707557, + "num_tokens": 123165165.0, + "step": 102440 + }, + { + "entropy": 1.958940689265728, + "epoch": 0.3175860361341301, + "grad_norm": 7.967811107635498, + "learning_rate": 4.489149819853024e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.847984354197979, + "num_tokens": 123176618.0, + "step": 102450 + }, + { + "entropy": 1.9103962182998657, + "epoch": 0.31761703525917984, + "grad_norm": 8.108328819274902, + "learning_rate": 4.4889307439611805e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8437720760703087, + "num_tokens": 123188100.0, + "step": 102460 + }, + { + "entropy": 1.916041937470436, + "epoch": 0.3176480343842295, + "grad_norm": 8.520224571228027, + "learning_rate": 4.488711700139705e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8483468234539032, + "num_tokens": 123199271.0, + "step": 102470 + }, + { + "entropy": 1.8879058375954627, + "epoch": 0.3176790335092792, + "grad_norm": 6.797135829925537, + "learning_rate": 4.488492688380775e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8475539967417717, + "num_tokens": 123211068.0, + "step": 102480 + }, + { + "entropy": 1.9228038251399995, + "epoch": 0.3177100326343289, + "grad_norm": 8.574908256530762, + "learning_rate": 4.488273708676567e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8467731967568397, + "num_tokens": 123222860.0, + "step": 102490 + }, + { + "entropy": 1.9041634023189544, + "epoch": 0.31774103175937857, + "grad_norm": 3.588517427444458, + "learning_rate": 4.488054761019265e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8531323969364166, + "num_tokens": 123234902.0, + "step": 102500 + }, + { + "entropy": 1.880505882203579, + "epoch": 0.3177720308844283, + "grad_norm": 7.573834419250488, + "learning_rate": 4.487835845401051e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8521980285644531, + "num_tokens": 123247483.0, + "step": 102510 + }, + { + "entropy": 1.9097674801945685, + "epoch": 0.31780303000947796, + "grad_norm": 8.0872163772583, + "learning_rate": 4.487616961814113e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8391719311475754, + "num_tokens": 123259632.0, + "step": 102520 + }, + { + "entropy": 1.9394026383757592, + "epoch": 0.3178340291345277, + "grad_norm": 7.864894390106201, + "learning_rate": 4.48739811025064e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8454336494207382, + "num_tokens": 123270948.0, + "step": 102530 + }, + { + "entropy": 1.8619687780737877, + "epoch": 0.31786502825957735, + "grad_norm": 3.913823127746582, + "learning_rate": 4.487179290702825e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8360814332962037, + "num_tokens": 123283604.0, + "step": 102540 + }, + { + "entropy": 1.934596875309944, + "epoch": 0.3178960273846271, + "grad_norm": 8.981165885925293, + "learning_rate": 4.486960503162861e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8490733042359352, + "num_tokens": 123294899.0, + "step": 102550 + }, + { + "entropy": 1.9189150497317313, + "epoch": 0.31792702650967675, + "grad_norm": 7.386570453643799, + "learning_rate": 4.4867417476229475e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8401806384325028, + "num_tokens": 123306606.0, + "step": 102560 + }, + { + "entropy": 1.746788166463375, + "epoch": 0.31795802563472647, + "grad_norm": 8.746540069580078, + "learning_rate": 4.486523024075284e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8678254216909409, + "num_tokens": 123319627.0, + "step": 102570 + }, + { + "entropy": 1.8892858445644378, + "epoch": 0.31798902475977614, + "grad_norm": 9.26871109008789, + "learning_rate": 4.486304332512073e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8421612724661827, + "num_tokens": 123331981.0, + "step": 102580 + }, + { + "entropy": 1.847158958017826, + "epoch": 0.31802002388482586, + "grad_norm": 8.670111656188965, + "learning_rate": 4.48608567292552e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8566896244883537, + "num_tokens": 123344136.0, + "step": 102590 + }, + { + "entropy": 1.839782963693142, + "epoch": 0.31805102300987553, + "grad_norm": 3.6876060962677, + "learning_rate": 4.485867045307833e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8545563265681266, + "num_tokens": 123356337.0, + "step": 102600 + }, + { + "entropy": 1.8948510199785233, + "epoch": 0.31808202213492526, + "grad_norm": 8.7213773727417, + "learning_rate": 4.485648449651225e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8595595166087151, + "num_tokens": 123367945.0, + "step": 102610 + }, + { + "entropy": 1.8524553552269936, + "epoch": 0.3181130212599749, + "grad_norm": 8.925561904907227, + "learning_rate": 4.485429885947906e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8614932551980019, + "num_tokens": 123380029.0, + "step": 102620 + }, + { + "entropy": 1.8003907322883606, + "epoch": 0.31814402038502465, + "grad_norm": 7.866354465484619, + "learning_rate": 4.485211354190095e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8583444371819496, + "num_tokens": 123392498.0, + "step": 102630 + }, + { + "entropy": 1.8503638431429863, + "epoch": 0.3181750195100743, + "grad_norm": 9.890091896057129, + "learning_rate": 4.4849928543700085e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8414298683404923, + "num_tokens": 123405679.0, + "step": 102640 + }, + { + "entropy": 1.8876867666840553, + "epoch": 0.31820601863512404, + "grad_norm": 4.612701416015625, + "learning_rate": 4.4847743864798694e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8404382139444351, + "num_tokens": 123417818.0, + "step": 102650 + }, + { + "entropy": 1.8589577794075012, + "epoch": 0.3182370177601737, + "grad_norm": 8.45789909362793, + "learning_rate": 4.4845559505119026e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8587053641676903, + "num_tokens": 123430054.0, + "step": 102660 + }, + { + "entropy": 1.8961587071418762, + "epoch": 0.31826801688522344, + "grad_norm": 7.61850118637085, + "learning_rate": 4.484337546458332e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8480044439435005, + "num_tokens": 123442303.0, + "step": 102670 + }, + { + "entropy": 1.8941489323973655, + "epoch": 0.3182990160102731, + "grad_norm": 7.859434604644775, + "learning_rate": 4.484119174311389e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8440102905035018, + "num_tokens": 123454240.0, + "step": 102680 + }, + { + "entropy": 1.8841777712106704, + "epoch": 0.31833001513532283, + "grad_norm": 3.9713032245635986, + "learning_rate": 4.483900834063305e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8437553867697716, + "num_tokens": 123465668.0, + "step": 102690 + }, + { + "entropy": 1.8325778350234032, + "epoch": 0.3183610142603725, + "grad_norm": 7.278379917144775, + "learning_rate": 4.483682525706316e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8592413857579231, + "num_tokens": 123478018.0, + "step": 102700 + }, + { + "entropy": 1.868157622218132, + "epoch": 0.3183920133854222, + "grad_norm": 4.052938461303711, + "learning_rate": 4.483464249232657e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8553399801254272, + "num_tokens": 123490636.0, + "step": 102710 + }, + { + "entropy": 1.8868106931447983, + "epoch": 0.3184230125104719, + "grad_norm": 7.89746618270874, + "learning_rate": 4.483246004634569e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8540831059217453, + "num_tokens": 123502078.0, + "step": 102720 + }, + { + "entropy": 1.9166793465614318, + "epoch": 0.31845401163552156, + "grad_norm": 8.033954620361328, + "learning_rate": 4.4830277919042956e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8464692577719688, + "num_tokens": 123513541.0, + "step": 102730 + }, + { + "entropy": 1.8216803684830665, + "epoch": 0.3184850107605713, + "grad_norm": 7.996546745300293, + "learning_rate": 4.482809611034082e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8364669859409333, + "num_tokens": 123526355.0, + "step": 102740 + }, + { + "entropy": 1.8570113122463225, + "epoch": 0.31851600988562095, + "grad_norm": 7.581510066986084, + "learning_rate": 4.482591462016174e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8555145755410194, + "num_tokens": 123538211.0, + "step": 102750 + }, + { + "entropy": 1.812189969420433, + "epoch": 0.3185470090106707, + "grad_norm": 9.858771324157715, + "learning_rate": 4.482373344842824e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8614937171339989, + "num_tokens": 123550270.0, + "step": 102760 + }, + { + "entropy": 1.8564748376607896, + "epoch": 0.31857800813572035, + "grad_norm": 6.510000228881836, + "learning_rate": 4.482155259506284e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8492413744330406, + "num_tokens": 123561940.0, + "step": 102770 + }, + { + "entropy": 1.9163195461034774, + "epoch": 0.31860900726077007, + "grad_norm": 7.715924263000488, + "learning_rate": 4.4819372059988115e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8402003988623619, + "num_tokens": 123573352.0, + "step": 102780 + }, + { + "entropy": 1.851955994963646, + "epoch": 0.31864000638581974, + "grad_norm": 8.36665153503418, + "learning_rate": 4.4817191843126635e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8535103261470794, + "num_tokens": 123584588.0, + "step": 102790 + }, + { + "entropy": 1.9031839981675147, + "epoch": 0.31867100551086947, + "grad_norm": 8.757781982421875, + "learning_rate": 4.4815011944401015e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8555735006928444, + "num_tokens": 123596035.0, + "step": 102800 + }, + { + "entropy": 1.8560205325484276, + "epoch": 0.31870200463591913, + "grad_norm": 8.463615417480469, + "learning_rate": 4.4812832363733894e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8499815389513969, + "num_tokens": 123607275.0, + "step": 102810 + }, + { + "entropy": 1.848600834608078, + "epoch": 0.31873300376096886, + "grad_norm": 7.636923313140869, + "learning_rate": 4.481065310104793e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8392280101776123, + "num_tokens": 123619498.0, + "step": 102820 + }, + { + "entropy": 1.9192082822322845, + "epoch": 0.3187640028860185, + "grad_norm": 7.930289268493652, + "learning_rate": 4.480847415626582e-06, + "loss": 0.5126, + "mean_token_accuracy": 0.8376297026872634, + "num_tokens": 123631251.0, + "step": 102830 + }, + { + "entropy": 1.897643305361271, + "epoch": 0.31879500201106825, + "grad_norm": 8.173277854919434, + "learning_rate": 4.480629552931028e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8563371822237968, + "num_tokens": 123642062.0, + "step": 102840 + }, + { + "entropy": 1.88473329693079, + "epoch": 0.3188260011361179, + "grad_norm": 3.9585182666778564, + "learning_rate": 4.480411722010404e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8535011231899261, + "num_tokens": 123652956.0, + "step": 102850 + }, + { + "entropy": 1.8646925508975982, + "epoch": 0.31885700026116764, + "grad_norm": 9.49219799041748, + "learning_rate": 4.4801939228569895e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8506576329469681, + "num_tokens": 123665406.0, + "step": 102860 + }, + { + "entropy": 1.8895646423101424, + "epoch": 0.3188879993862173, + "grad_norm": 8.293597221374512, + "learning_rate": 4.4799761554630605e-06, + "loss": 0.5429, + "mean_token_accuracy": 0.8343157604336738, + "num_tokens": 123677366.0, + "step": 102870 + }, + { + "entropy": 1.8557443410158156, + "epoch": 0.31891899851126704, + "grad_norm": 3.470597505569458, + "learning_rate": 4.479758419820902e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8372866466641427, + "num_tokens": 123689278.0, + "step": 102880 + }, + { + "entropy": 1.8852709233760834, + "epoch": 0.3189499976363167, + "grad_norm": 8.924466133117676, + "learning_rate": 4.479540715922798e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8414081260561943, + "num_tokens": 123701095.0, + "step": 102890 + }, + { + "entropy": 1.8459098264575005, + "epoch": 0.31898099676136643, + "grad_norm": 9.664191246032715, + "learning_rate": 4.479323043761035e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8502274408936501, + "num_tokens": 123713345.0, + "step": 102900 + }, + { + "entropy": 1.8116124227643013, + "epoch": 0.3190119958864161, + "grad_norm": 8.264068603515625, + "learning_rate": 4.479105403327904e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8466699972748757, + "num_tokens": 123725722.0, + "step": 102910 + }, + { + "entropy": 1.8975732818245887, + "epoch": 0.3190429950114658, + "grad_norm": 8.010908126831055, + "learning_rate": 4.478887794615696e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.8428432583808899, + "num_tokens": 123736798.0, + "step": 102920 + }, + { + "entropy": 1.836000031232834, + "epoch": 0.3190739941365155, + "grad_norm": 10.32380485534668, + "learning_rate": 4.4786702176167084e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8488903164863586, + "num_tokens": 123748662.0, + "step": 102930 + }, + { + "entropy": 1.8443504258990289, + "epoch": 0.3191049932615652, + "grad_norm": 7.753139019012451, + "learning_rate": 4.478452672323238e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8580980643630027, + "num_tokens": 123761662.0, + "step": 102940 + }, + { + "entropy": 1.8338020756840705, + "epoch": 0.3191359923866149, + "grad_norm": 9.386250495910645, + "learning_rate": 4.4782351587275865e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.844118581712246, + "num_tokens": 123773805.0, + "step": 102950 + }, + { + "entropy": 1.8451603963971137, + "epoch": 0.3191669915116646, + "grad_norm": 8.358642578125, + "learning_rate": 4.478017676822054e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8548496574163437, + "num_tokens": 123786092.0, + "step": 102960 + }, + { + "entropy": 1.9037387266755104, + "epoch": 0.3191979906367143, + "grad_norm": 10.784769058227539, + "learning_rate": 4.47780022659895e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8458883672952652, + "num_tokens": 123798134.0, + "step": 102970 + }, + { + "entropy": 1.9118751406669616, + "epoch": 0.31922898976176395, + "grad_norm": 8.400568962097168, + "learning_rate": 4.47758280805058e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8501716613769531, + "num_tokens": 123809256.0, + "step": 102980 + }, + { + "entropy": 1.9268585234880446, + "epoch": 0.3192599888868137, + "grad_norm": 9.43397045135498, + "learning_rate": 4.477365421169256e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.8423597529530525, + "num_tokens": 123819919.0, + "step": 102990 + }, + { + "entropy": 1.7849035665392876, + "epoch": 0.31929098801186334, + "grad_norm": 8.134185791015625, + "learning_rate": 4.477148065947293e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8558384835720062, + "num_tokens": 123832624.0, + "step": 103000 + }, + { + "entropy": 1.8854929059743881, + "epoch": 0.31932198713691307, + "grad_norm": 7.981983661651611, + "learning_rate": 4.476930742377004e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8419500634074211, + "num_tokens": 123843833.0, + "step": 103010 + }, + { + "entropy": 1.910216248035431, + "epoch": 0.31935298626196273, + "grad_norm": 8.01833438873291, + "learning_rate": 4.47671345045071e-06, + "loss": 0.527, + "mean_token_accuracy": 0.8421685129404068, + "num_tokens": 123854736.0, + "step": 103020 + }, + { + "entropy": 1.869119329750538, + "epoch": 0.31938398538701246, + "grad_norm": 9.197733879089355, + "learning_rate": 4.4764961901607315e-06, + "loss": 0.5199, + "mean_token_accuracy": 0.8363765180110931, + "num_tokens": 123866473.0, + "step": 103030 + }, + { + "entropy": 1.9347961366176605, + "epoch": 0.31941498451206213, + "grad_norm": 7.009976863861084, + "learning_rate": 4.476278961499394e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8408634766936303, + "num_tokens": 123877914.0, + "step": 103040 + }, + { + "entropy": 1.765951743721962, + "epoch": 0.31944598363711185, + "grad_norm": 6.8173933029174805, + "learning_rate": 4.4760617644590216e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8628764286637306, + "num_tokens": 123891728.0, + "step": 103050 + }, + { + "entropy": 1.8665249332785607, + "epoch": 0.3194769827621615, + "grad_norm": 7.756616592407227, + "learning_rate": 4.475844599031945e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8461050614714622, + "num_tokens": 123904519.0, + "step": 103060 + }, + { + "entropy": 1.9601498633623122, + "epoch": 0.31950798188721125, + "grad_norm": 9.403843879699707, + "learning_rate": 4.475627465210497e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8456299170851708, + "num_tokens": 123915934.0, + "step": 103070 + }, + { + "entropy": 1.872973631322384, + "epoch": 0.3195389810122609, + "grad_norm": 8.8914155960083, + "learning_rate": 4.475410362987011e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.837957626581192, + "num_tokens": 123928155.0, + "step": 103080 + }, + { + "entropy": 1.909055621922016, + "epoch": 0.31956998013731064, + "grad_norm": 7.7289509773254395, + "learning_rate": 4.475193292353822e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.848746582865715, + "num_tokens": 123939530.0, + "step": 103090 + }, + { + "entropy": 1.9184820994734764, + "epoch": 0.3196009792623603, + "grad_norm": 8.20902156829834, + "learning_rate": 4.474976253303274e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.843792799115181, + "num_tokens": 123950860.0, + "step": 103100 + }, + { + "entropy": 1.8916692659258842, + "epoch": 0.31963197838741003, + "grad_norm": 11.049240112304688, + "learning_rate": 4.4747592458277056e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.8373655050992965, + "num_tokens": 123962960.0, + "step": 103110 + }, + { + "entropy": 1.9099874019622802, + "epoch": 0.3196629775124597, + "grad_norm": 7.249629497528076, + "learning_rate": 4.474542269919464e-06, + "loss": 0.5163, + "mean_token_accuracy": 0.8369929790496826, + "num_tokens": 123974112.0, + "step": 103120 + }, + { + "entropy": 1.8539279848337173, + "epoch": 0.3196939766375094, + "grad_norm": 8.467607498168945, + "learning_rate": 4.474325325570893e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8553314670920372, + "num_tokens": 123986201.0, + "step": 103130 + }, + { + "entropy": 1.9297155156731605, + "epoch": 0.3197249757625591, + "grad_norm": 11.687642097473145, + "learning_rate": 4.474108412774347e-06, + "loss": 0.5208, + "mean_token_accuracy": 0.8387605383992195, + "num_tokens": 123997260.0, + "step": 103140 + }, + { + "entropy": 1.8051437750458716, + "epoch": 0.3197559748876088, + "grad_norm": 7.733287334442139, + "learning_rate": 4.473891531522177e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8608478337526322, + "num_tokens": 124009391.0, + "step": 103150 + }, + { + "entropy": 1.7467816695570946, + "epoch": 0.3197869740126585, + "grad_norm": 2.378082036972046, + "learning_rate": 4.473674681806737e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.869289082288742, + "num_tokens": 124022647.0, + "step": 103160 + }, + { + "entropy": 1.876607745885849, + "epoch": 0.3198179731377082, + "grad_norm": 9.72205924987793, + "learning_rate": 4.473457863620386e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8447935312986374, + "num_tokens": 124034377.0, + "step": 103170 + }, + { + "entropy": 1.8972087427973747, + "epoch": 0.3198489722627579, + "grad_norm": 7.99284553527832, + "learning_rate": 4.473241076955484e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.847452299296856, + "num_tokens": 124045591.0, + "step": 103180 + }, + { + "entropy": 1.8817680388689042, + "epoch": 0.3198799713878076, + "grad_norm": 8.3921480178833, + "learning_rate": 4.473024321804395e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8453219920396805, + "num_tokens": 124056874.0, + "step": 103190 + }, + { + "entropy": 1.845687435567379, + "epoch": 0.3199109705128573, + "grad_norm": 8.912002563476562, + "learning_rate": 4.472807598159483e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.845789223909378, + "num_tokens": 124068631.0, + "step": 103200 + }, + { + "entropy": 1.7905006155371666, + "epoch": 0.319941969637907, + "grad_norm": 9.591071128845215, + "learning_rate": 4.472590906013117e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8493612468242645, + "num_tokens": 124081286.0, + "step": 103210 + }, + { + "entropy": 1.9089469194412232, + "epoch": 0.31997296876295667, + "grad_norm": 7.169071197509766, + "learning_rate": 4.4723742453576675e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8382734417915344, + "num_tokens": 124092376.0, + "step": 103220 + }, + { + "entropy": 1.8337425097823143, + "epoch": 0.32000396788800634, + "grad_norm": 8.232246398925781, + "learning_rate": 4.472157616185508e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8578509896993637, + "num_tokens": 124104861.0, + "step": 103230 + }, + { + "entropy": 1.9013035476207734, + "epoch": 0.32003496701305606, + "grad_norm": 8.12004280090332, + "learning_rate": 4.471941018489015e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8492400407791137, + "num_tokens": 124116168.0, + "step": 103240 + }, + { + "entropy": 1.8799728155136108, + "epoch": 0.32006596613810573, + "grad_norm": 7.341602325439453, + "learning_rate": 4.471724452260566e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8415581986308098, + "num_tokens": 124127690.0, + "step": 103250 + }, + { + "entropy": 1.8120905488729477, + "epoch": 0.32009696526315545, + "grad_norm": 9.098899841308594, + "learning_rate": 4.471507917492542e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8570401355624199, + "num_tokens": 124140339.0, + "step": 103260 + }, + { + "entropy": 1.8739472836256028, + "epoch": 0.3201279643882051, + "grad_norm": 9.830232620239258, + "learning_rate": 4.471291414177328e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8453883707523346, + "num_tokens": 124151956.0, + "step": 103270 + }, + { + "entropy": 1.84071436971426, + "epoch": 0.32015896351325485, + "grad_norm": 8.632100105285645, + "learning_rate": 4.47107494230731e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8345205932855606, + "num_tokens": 124165412.0, + "step": 103280 + }, + { + "entropy": 1.8590612232685089, + "epoch": 0.3201899626383045, + "grad_norm": 4.4268975257873535, + "learning_rate": 4.470858501874875e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8525667667388916, + "num_tokens": 124177344.0, + "step": 103290 + }, + { + "entropy": 1.887212759256363, + "epoch": 0.32022096176335424, + "grad_norm": 7.400831699371338, + "learning_rate": 4.470642092872416e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8470708400011062, + "num_tokens": 124188119.0, + "step": 103300 + }, + { + "entropy": 1.8698201969265937, + "epoch": 0.3202519608884039, + "grad_norm": 9.03722858428955, + "learning_rate": 4.470425715292328e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8450605183839798, + "num_tokens": 124199654.0, + "step": 103310 + }, + { + "entropy": 1.9616728246212005, + "epoch": 0.32028296001345363, + "grad_norm": 7.2699503898620605, + "learning_rate": 4.4702093691270045e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8387251660227776, + "num_tokens": 124210832.0, + "step": 103320 + }, + { + "entropy": 1.888064543902874, + "epoch": 0.3203139591385033, + "grad_norm": 9.468146324157715, + "learning_rate": 4.469993054368849e-06, + "loss": 0.52, + "mean_token_accuracy": 0.8365824237465859, + "num_tokens": 124223111.0, + "step": 103330 + }, + { + "entropy": 1.8957604005932809, + "epoch": 0.320344958263553, + "grad_norm": 4.3390302658081055, + "learning_rate": 4.469776771010258e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8410006389021873, + "num_tokens": 124234226.0, + "step": 103340 + }, + { + "entropy": 1.9063668623566628, + "epoch": 0.3203759573886027, + "grad_norm": 8.515616416931152, + "learning_rate": 4.46956051904364e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8457826554775238, + "num_tokens": 124245737.0, + "step": 103350 + }, + { + "entropy": 1.8583616137504577, + "epoch": 0.3204069565136524, + "grad_norm": 8.301240921020508, + "learning_rate": 4.469344298461399e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8528946593403817, + "num_tokens": 124258363.0, + "step": 103360 + }, + { + "entropy": 1.8560489758849144, + "epoch": 0.3204379556387021, + "grad_norm": 7.1716532707214355, + "learning_rate": 4.4691281092559474e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8615340426564216, + "num_tokens": 124270052.0, + "step": 103370 + }, + { + "entropy": 1.762592874467373, + "epoch": 0.3204689547637518, + "grad_norm": 3.3581652641296387, + "learning_rate": 4.468911951419696e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8641716584563255, + "num_tokens": 124283377.0, + "step": 103380 + }, + { + "entropy": 1.8988282978534698, + "epoch": 0.3204999538888015, + "grad_norm": 6.756303310394287, + "learning_rate": 4.468695824945058e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8476831942796708, + "num_tokens": 124294752.0, + "step": 103390 + }, + { + "entropy": 1.846705262362957, + "epoch": 0.3205309530138512, + "grad_norm": 3.833975315093994, + "learning_rate": 4.46847972982445e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8555555865168571, + "num_tokens": 124306566.0, + "step": 103400 + }, + { + "entropy": 1.8886190831661225, + "epoch": 0.3205619521389009, + "grad_norm": 7.964195728302002, + "learning_rate": 4.4682636660502945e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8455360010266304, + "num_tokens": 124318282.0, + "step": 103410 + }, + { + "entropy": 1.9076228067278862, + "epoch": 0.3205929512639506, + "grad_norm": 7.338057994842529, + "learning_rate": 4.468047633615013e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8368429213762283, + "num_tokens": 124329917.0, + "step": 103420 + }, + { + "entropy": 1.9275338113307954, + "epoch": 0.32062395038900027, + "grad_norm": 10.642918586730957, + "learning_rate": 4.46783163251103e-06, + "loss": 0.5179, + "mean_token_accuracy": 0.8408781468868256, + "num_tokens": 124341468.0, + "step": 103430 + }, + { + "entropy": 1.967161425948143, + "epoch": 0.32065494951405, + "grad_norm": 7.011558532714844, + "learning_rate": 4.467615662730772e-06, + "loss": 0.535, + "mean_token_accuracy": 0.8357431918382645, + "num_tokens": 124352683.0, + "step": 103440 + }, + { + "entropy": 1.8656065806746482, + "epoch": 0.32068594863909966, + "grad_norm": 7.173338413238525, + "learning_rate": 4.467399724266671e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8562691912055016, + "num_tokens": 124364944.0, + "step": 103450 + }, + { + "entropy": 1.7879148676991463, + "epoch": 0.32071694776414933, + "grad_norm": 8.152206420898438, + "learning_rate": 4.467183817111157e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8556606069207191, + "num_tokens": 124378575.0, + "step": 103460 + }, + { + "entropy": 1.9068648159503936, + "epoch": 0.32074794688919905, + "grad_norm": 10.881712913513184, + "learning_rate": 4.466967941256668e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8376988887786865, + "num_tokens": 124390206.0, + "step": 103470 + }, + { + "entropy": 1.9137423366308213, + "epoch": 0.3207789460142487, + "grad_norm": 7.468229293823242, + "learning_rate": 4.4667520966956385e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.8356907978653908, + "num_tokens": 124401455.0, + "step": 103480 + }, + { + "entropy": 1.9200310036540031, + "epoch": 0.32080994513929845, + "grad_norm": 6.459151744842529, + "learning_rate": 4.466536283420511e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8393821954727173, + "num_tokens": 124412634.0, + "step": 103490 + }, + { + "entropy": 1.9078100383281709, + "epoch": 0.3208409442643481, + "grad_norm": 6.7956013679504395, + "learning_rate": 4.466320501423726e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8611381649971008, + "num_tokens": 124423788.0, + "step": 103500 + }, + { + "entropy": 1.9619006097316742, + "epoch": 0.32087194338939784, + "grad_norm": 9.467850685119629, + "learning_rate": 4.466104750697733e-06, + "loss": 0.5486, + "mean_token_accuracy": 0.8296520456671714, + "num_tokens": 124434439.0, + "step": 103510 + }, + { + "entropy": 1.8933661192655564, + "epoch": 0.3209029425144475, + "grad_norm": 4.713155746459961, + "learning_rate": 4.465889031234975e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8469473794102669, + "num_tokens": 124446667.0, + "step": 103520 + }, + { + "entropy": 1.853841246664524, + "epoch": 0.32093394163949723, + "grad_norm": 8.347504615783691, + "learning_rate": 4.465673343027906e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.844866256415844, + "num_tokens": 124459684.0, + "step": 103530 + }, + { + "entropy": 1.8124510049819946, + "epoch": 0.3209649407645469, + "grad_norm": 4.092881202697754, + "learning_rate": 4.465457686068977e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8626367405056954, + "num_tokens": 124472022.0, + "step": 103540 + }, + { + "entropy": 1.8816181853413583, + "epoch": 0.3209959398895966, + "grad_norm": 7.426637172698975, + "learning_rate": 4.465242060350643e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.844549834728241, + "num_tokens": 124484006.0, + "step": 103550 + }, + { + "entropy": 1.8122664958238601, + "epoch": 0.3210269390146463, + "grad_norm": 3.879347562789917, + "learning_rate": 4.4650264658653655e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8467624381184577, + "num_tokens": 124496792.0, + "step": 103560 + }, + { + "entropy": 1.8967302456498145, + "epoch": 0.321057938139696, + "grad_norm": 5.540412902832031, + "learning_rate": 4.464810902605601e-06, + "loss": 0.5402, + "mean_token_accuracy": 0.8287664383649826, + "num_tokens": 124508254.0, + "step": 103570 + }, + { + "entropy": 1.8487597212195397, + "epoch": 0.3210889372647457, + "grad_norm": 4.171758651733398, + "learning_rate": 4.464595370563815e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8530851736664772, + "num_tokens": 124520515.0, + "step": 103580 + }, + { + "entropy": 1.8957301035523415, + "epoch": 0.3211199363897954, + "grad_norm": 8.205263137817383, + "learning_rate": 4.464379869732473e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8478555142879486, + "num_tokens": 124532156.0, + "step": 103590 + }, + { + "entropy": 1.8724404126405716, + "epoch": 0.3211509355148451, + "grad_norm": 8.49742317199707, + "learning_rate": 4.464164400104043e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8556983798742295, + "num_tokens": 124543672.0, + "step": 103600 + }, + { + "entropy": 1.8402127534151078, + "epoch": 0.3211819346398948, + "grad_norm": 8.611581802368164, + "learning_rate": 4.4639489616709956e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8543668672442436, + "num_tokens": 124555679.0, + "step": 103610 + }, + { + "entropy": 1.8802747756242753, + "epoch": 0.3212129337649445, + "grad_norm": 4.206470012664795, + "learning_rate": 4.463733554425804e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8467269361019134, + "num_tokens": 124567696.0, + "step": 103620 + }, + { + "entropy": 1.864666721224785, + "epoch": 0.3212439328899942, + "grad_norm": 10.374842643737793, + "learning_rate": 4.4635181783609455e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8502185359597206, + "num_tokens": 124580191.0, + "step": 103630 + }, + { + "entropy": 1.839669795334339, + "epoch": 0.32127493201504387, + "grad_norm": 3.81731915473938, + "learning_rate": 4.463302833468897e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8490176141262055, + "num_tokens": 124592216.0, + "step": 103640 + }, + { + "entropy": 1.8732964277267456, + "epoch": 0.3213059311400936, + "grad_norm": 7.773388385772705, + "learning_rate": 4.463087519742139e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8358982741832733, + "num_tokens": 124604399.0, + "step": 103650 + }, + { + "entropy": 1.8805906429886818, + "epoch": 0.32133693026514326, + "grad_norm": 8.990577697753906, + "learning_rate": 4.462872237173157e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8515235692262649, + "num_tokens": 124615677.0, + "step": 103660 + }, + { + "entropy": 1.8140804409980773, + "epoch": 0.321367929390193, + "grad_norm": 8.409754753112793, + "learning_rate": 4.462656985754436e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8694860532879829, + "num_tokens": 124628163.0, + "step": 103670 + }, + { + "entropy": 1.8692081153392792, + "epoch": 0.32139892851524265, + "grad_norm": 8.463071823120117, + "learning_rate": 4.462441765478465e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8528854250907898, + "num_tokens": 124640372.0, + "step": 103680 + }, + { + "entropy": 1.8829544261097908, + "epoch": 0.3214299276402924, + "grad_norm": 9.008349418640137, + "learning_rate": 4.462226576337735e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8522943645715714, + "num_tokens": 124652104.0, + "step": 103690 + }, + { + "entropy": 1.908381125330925, + "epoch": 0.32146092676534205, + "grad_norm": 8.958301544189453, + "learning_rate": 4.462011418324738e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8482077926397323, + "num_tokens": 124663567.0, + "step": 103700 + }, + { + "entropy": 1.8642259582877159, + "epoch": 0.3214919258903917, + "grad_norm": 8.490578651428223, + "learning_rate": 4.461796291431973e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8547911241650581, + "num_tokens": 124675305.0, + "step": 103710 + }, + { + "entropy": 1.8607799611985683, + "epoch": 0.32152292501544144, + "grad_norm": 7.851955413818359, + "learning_rate": 4.461581195651937e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8484767660498619, + "num_tokens": 124688192.0, + "step": 103720 + }, + { + "entropy": 1.930846455693245, + "epoch": 0.3215539241404911, + "grad_norm": 3.855354070663452, + "learning_rate": 4.461366130977132e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8470954403281212, + "num_tokens": 124699818.0, + "step": 103730 + }, + { + "entropy": 1.7706571273505687, + "epoch": 0.32158492326554083, + "grad_norm": 4.847051620483398, + "learning_rate": 4.461151097400059e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8597544595599175, + "num_tokens": 124714446.0, + "step": 103740 + }, + { + "entropy": 1.9413534983992577, + "epoch": 0.3216159223905905, + "grad_norm": 7.653467655181885, + "learning_rate": 4.460936094913229e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8460107401013375, + "num_tokens": 124725683.0, + "step": 103750 + }, + { + "entropy": 1.899087278544903, + "epoch": 0.3216469215156402, + "grad_norm": 9.189424514770508, + "learning_rate": 4.460721123509149e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8444273769855499, + "num_tokens": 124738312.0, + "step": 103760 + }, + { + "entropy": 1.951054508984089, + "epoch": 0.3216779206406899, + "grad_norm": 8.07810115814209, + "learning_rate": 4.460506183180329e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.841863676905632, + "num_tokens": 124749524.0, + "step": 103770 + }, + { + "entropy": 1.9111177667975425, + "epoch": 0.3217089197657396, + "grad_norm": 8.68226146697998, + "learning_rate": 4.4602912739192835e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8471680104732513, + "num_tokens": 124760976.0, + "step": 103780 + }, + { + "entropy": 1.8448022976517677, + "epoch": 0.3217399188907893, + "grad_norm": 7.906839847564697, + "learning_rate": 4.4600763957185295e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8532932892441749, + "num_tokens": 124773367.0, + "step": 103790 + }, + { + "entropy": 1.9335693135857581, + "epoch": 0.321770918015839, + "grad_norm": 8.961040496826172, + "learning_rate": 4.459861548570586e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8489025041460991, + "num_tokens": 124784721.0, + "step": 103800 + }, + { + "entropy": 1.8875031411647796, + "epoch": 0.3218019171408887, + "grad_norm": 3.5718820095062256, + "learning_rate": 4.459646732467974e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8439980506896972, + "num_tokens": 124797599.0, + "step": 103810 + }, + { + "entropy": 1.8897089153528213, + "epoch": 0.3218329162659384, + "grad_norm": 9.0376558303833, + "learning_rate": 4.459431947403218e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8548488318920135, + "num_tokens": 124808832.0, + "step": 103820 + }, + { + "entropy": 1.8961857289075852, + "epoch": 0.3218639153909881, + "grad_norm": 7.7893829345703125, + "learning_rate": 4.4592171933688435e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8496623903512954, + "num_tokens": 124821106.0, + "step": 103830 + }, + { + "entropy": 1.8757227182388305, + "epoch": 0.3218949145160378, + "grad_norm": 4.173471927642822, + "learning_rate": 4.45900247035738e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8542967364192009, + "num_tokens": 124833050.0, + "step": 103840 + }, + { + "entropy": 1.9881081491708756, + "epoch": 0.32192591364108747, + "grad_norm": 7.131890296936035, + "learning_rate": 4.458787778361361e-06, + "loss": 0.5528, + "mean_token_accuracy": 0.8397052332758903, + "num_tokens": 124844238.0, + "step": 103850 + }, + { + "entropy": 1.8623345792293549, + "epoch": 0.3219569127661372, + "grad_norm": 3.7538177967071533, + "learning_rate": 4.458573117373317e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8614621505141258, + "num_tokens": 124856830.0, + "step": 103860 + }, + { + "entropy": 1.8851841911673546, + "epoch": 0.32198791189118686, + "grad_norm": 4.068458557128906, + "learning_rate": 4.458358487385787e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8413153782486915, + "num_tokens": 124869280.0, + "step": 103870 + }, + { + "entropy": 1.8985486201941968, + "epoch": 0.3220189110162366, + "grad_norm": 4.279972076416016, + "learning_rate": 4.458143888391309e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8557350024580955, + "num_tokens": 124880870.0, + "step": 103880 + }, + { + "entropy": 1.834354992210865, + "epoch": 0.32204991014128626, + "grad_norm": 3.6236605644226074, + "learning_rate": 4.457929320382427e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8552977159619332, + "num_tokens": 124893391.0, + "step": 103890 + }, + { + "entropy": 1.9463069021701813, + "epoch": 0.322080909266336, + "grad_norm": 8.719969749450684, + "learning_rate": 4.457714783351681e-06, + "loss": 0.5187, + "mean_token_accuracy": 0.8463471621274948, + "num_tokens": 124904359.0, + "step": 103900 + }, + { + "entropy": 1.9136415734887122, + "epoch": 0.32211190839138565, + "grad_norm": 8.018022537231445, + "learning_rate": 4.457500277291621e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8480435863137246, + "num_tokens": 124916133.0, + "step": 103910 + }, + { + "entropy": 1.9327465161681174, + "epoch": 0.3221429075164354, + "grad_norm": 9.761283874511719, + "learning_rate": 4.457285802194794e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.8388472273945808, + "num_tokens": 124927380.0, + "step": 103920 + }, + { + "entropy": 1.9528162240982057, + "epoch": 0.32217390664148504, + "grad_norm": 8.009254455566406, + "learning_rate": 4.457071358053754e-06, + "loss": 0.5264, + "mean_token_accuracy": 0.8379970923066139, + "num_tokens": 124937778.0, + "step": 103930 + }, + { + "entropy": 1.9495197027921676, + "epoch": 0.32220490576653477, + "grad_norm": 8.856219291687012, + "learning_rate": 4.456856944861052e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.8489312395453453, + "num_tokens": 124948572.0, + "step": 103940 + }, + { + "entropy": 1.8287577375769615, + "epoch": 0.32223590489158443, + "grad_norm": 4.316620349884033, + "learning_rate": 4.4566425626092495e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8560908138751984, + "num_tokens": 124961049.0, + "step": 103950 + }, + { + "entropy": 1.8360350668430327, + "epoch": 0.3222669040166341, + "grad_norm": 8.45445442199707, + "learning_rate": 4.456428211290899e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.843099394440651, + "num_tokens": 124973591.0, + "step": 103960 + }, + { + "entropy": 1.829276867955923, + "epoch": 0.32229790314168383, + "grad_norm": 2.9788012504577637, + "learning_rate": 4.456213890898567e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8461241707205772, + "num_tokens": 124986733.0, + "step": 103970 + }, + { + "entropy": 1.951300072669983, + "epoch": 0.3223289022667335, + "grad_norm": 7.77920389175415, + "learning_rate": 4.455999601424818e-06, + "loss": 0.5354, + "mean_token_accuracy": 0.8368278592824936, + "num_tokens": 124998066.0, + "step": 103980 + }, + { + "entropy": 1.8031010538339616, + "epoch": 0.3223599013917832, + "grad_norm": 4.566207408905029, + "learning_rate": 4.455785342862216e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8428297311067581, + "num_tokens": 125011581.0, + "step": 103990 + }, + { + "entropy": 1.8515203520655632, + "epoch": 0.3223909005168329, + "grad_norm": 7.359809875488281, + "learning_rate": 4.4555711152033325e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.852726761996746, + "num_tokens": 125024051.0, + "step": 104000 + }, + { + "entropy": 1.8627462826669217, + "epoch": 0.3224218996418826, + "grad_norm": 3.8170571327209473, + "learning_rate": 4.455356918440736e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8479026794433594, + "num_tokens": 125037097.0, + "step": 104010 + }, + { + "entropy": 1.8624150231480598, + "epoch": 0.3224528987669323, + "grad_norm": 8.229700088500977, + "learning_rate": 4.455142752567004e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8403912633657455, + "num_tokens": 125049190.0, + "step": 104020 + }, + { + "entropy": 1.8673087805509567, + "epoch": 0.322483897891982, + "grad_norm": 8.417292594909668, + "learning_rate": 4.454928617574712e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8480266436934472, + "num_tokens": 125061183.0, + "step": 104030 + }, + { + "entropy": 1.83387650847435, + "epoch": 0.3225148970170317, + "grad_norm": 7.1093220710754395, + "learning_rate": 4.4547145134564384e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8557756185531616, + "num_tokens": 125074081.0, + "step": 104040 + }, + { + "entropy": 1.8816956907510758, + "epoch": 0.3225458961420814, + "grad_norm": 8.284878730773926, + "learning_rate": 4.454500440204765e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8433906123042106, + "num_tokens": 125086116.0, + "step": 104050 + }, + { + "entropy": 1.9563316345214843, + "epoch": 0.32257689526713107, + "grad_norm": 7.569207668304443, + "learning_rate": 4.454286397812278e-06, + "loss": 0.5523, + "mean_token_accuracy": 0.82982589751482, + "num_tokens": 125097380.0, + "step": 104060 + }, + { + "entropy": 1.941270676255226, + "epoch": 0.3226078943921808, + "grad_norm": 9.317317008972168, + "learning_rate": 4.454072386271562e-06, + "loss": 0.5532, + "mean_token_accuracy": 0.8411487266421318, + "num_tokens": 125108379.0, + "step": 104070 + }, + { + "entropy": 1.920466449856758, + "epoch": 0.32263889351723046, + "grad_norm": 9.6175537109375, + "learning_rate": 4.453858405575206e-06, + "loss": 0.5543, + "mean_token_accuracy": 0.8352476447820664, + "num_tokens": 125119711.0, + "step": 104080 + }, + { + "entropy": 1.8606573291122914, + "epoch": 0.3226698926422802, + "grad_norm": 3.547621488571167, + "learning_rate": 4.453644455715805e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8499340802431107, + "num_tokens": 125131618.0, + "step": 104090 + }, + { + "entropy": 1.9070662692189218, + "epoch": 0.32270089176732986, + "grad_norm": 4.856088638305664, + "learning_rate": 4.453430536685948e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8461712196469307, + "num_tokens": 125143282.0, + "step": 104100 + }, + { + "entropy": 1.8635522559285165, + "epoch": 0.3227318908923796, + "grad_norm": 8.272976875305176, + "learning_rate": 4.453216648478236e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.85389723777771, + "num_tokens": 125155283.0, + "step": 104110 + }, + { + "entropy": 1.8606613449752332, + "epoch": 0.32276289001742925, + "grad_norm": 7.335941314697266, + "learning_rate": 4.453002791085265e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8420170918107033, + "num_tokens": 125168312.0, + "step": 104120 + }, + { + "entropy": 1.869354782998562, + "epoch": 0.322793889142479, + "grad_norm": 8.720457077026367, + "learning_rate": 4.452788964499638e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8450367793440818, + "num_tokens": 125180831.0, + "step": 104130 + }, + { + "entropy": 1.886098426580429, + "epoch": 0.32282488826752864, + "grad_norm": 7.61613130569458, + "learning_rate": 4.452575168713959e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8547211870551109, + "num_tokens": 125191572.0, + "step": 104140 + }, + { + "entropy": 1.8242208793759347, + "epoch": 0.32285588739257837, + "grad_norm": 8.236608505249023, + "learning_rate": 4.452361403720835e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8449318900704383, + "num_tokens": 125204559.0, + "step": 104150 + }, + { + "entropy": 1.9878936111927032, + "epoch": 0.32288688651762804, + "grad_norm": 7.960468769073486, + "learning_rate": 4.452147669512874e-06, + "loss": 0.5418, + "mean_token_accuracy": 0.8334811016917228, + "num_tokens": 125215145.0, + "step": 104160 + }, + { + "entropy": 1.8423168882727623, + "epoch": 0.32291788564267776, + "grad_norm": 8.506884574890137, + "learning_rate": 4.451933966082689e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8522772178053856, + "num_tokens": 125227069.0, + "step": 104170 + }, + { + "entropy": 1.9097376301884652, + "epoch": 0.32294888476772743, + "grad_norm": 9.220355033874512, + "learning_rate": 4.451720293422894e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.8438367456197738, + "num_tokens": 125238513.0, + "step": 104180 + }, + { + "entropy": 1.8769617334008217, + "epoch": 0.32297988389277715, + "grad_norm": 8.432296752929688, + "learning_rate": 4.451506651526103e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8506760001182556, + "num_tokens": 125249982.0, + "step": 104190 + }, + { + "entropy": 1.8073933839797973, + "epoch": 0.3230108830178268, + "grad_norm": 7.3089985847473145, + "learning_rate": 4.451293040384938e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8688766479492187, + "num_tokens": 125263337.0, + "step": 104200 + }, + { + "entropy": 1.8363840654492378, + "epoch": 0.3230418821428765, + "grad_norm": 4.247748851776123, + "learning_rate": 4.4510794599920185e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8401136696338654, + "num_tokens": 125275329.0, + "step": 104210 + }, + { + "entropy": 1.8861171647906303, + "epoch": 0.3230728812679262, + "grad_norm": 8.15084457397461, + "learning_rate": 4.45086591033997e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8480441614985466, + "num_tokens": 125287001.0, + "step": 104220 + }, + { + "entropy": 1.862444320321083, + "epoch": 0.3231038803929759, + "grad_norm": 9.114710807800293, + "learning_rate": 4.450652391421417e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8597635760903358, + "num_tokens": 125299228.0, + "step": 104230 + }, + { + "entropy": 1.8157359957695007, + "epoch": 0.3231348795180256, + "grad_norm": 10.471318244934082, + "learning_rate": 4.45043890322899e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8404217720031738, + "num_tokens": 125312489.0, + "step": 104240 + }, + { + "entropy": 1.8435872822999955, + "epoch": 0.3231658786430753, + "grad_norm": 6.866489410400391, + "learning_rate": 4.45022544575532e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8526515334844589, + "num_tokens": 125325340.0, + "step": 104250 + }, + { + "entropy": 1.7846007272601128, + "epoch": 0.323196877768125, + "grad_norm": 8.732168197631836, + "learning_rate": 4.450012018993041e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8618106812238693, + "num_tokens": 125338525.0, + "step": 104260 + }, + { + "entropy": 1.9067389905452727, + "epoch": 0.32322787689317467, + "grad_norm": 9.090784072875977, + "learning_rate": 4.4497986229347886e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8495014622807503, + "num_tokens": 125349264.0, + "step": 104270 + }, + { + "entropy": 1.8359978944063187, + "epoch": 0.3232588760182244, + "grad_norm": 8.766422271728516, + "learning_rate": 4.449585257573202e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8507504045963288, + "num_tokens": 125361176.0, + "step": 104280 + }, + { + "entropy": 1.860073594748974, + "epoch": 0.32328987514327406, + "grad_norm": 8.850909233093262, + "learning_rate": 4.4493719229009234e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8514194667339325, + "num_tokens": 125372673.0, + "step": 104290 + }, + { + "entropy": 1.8551115795969964, + "epoch": 0.3233208742683238, + "grad_norm": 4.093299388885498, + "learning_rate": 4.449158618910594e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8481899574398994, + "num_tokens": 125385018.0, + "step": 104300 + }, + { + "entropy": 1.9149028778076171, + "epoch": 0.32335187339337346, + "grad_norm": 8.859583854675293, + "learning_rate": 4.448945345594864e-06, + "loss": 0.5378, + "mean_token_accuracy": 0.8369590178132057, + "num_tokens": 125396234.0, + "step": 104310 + }, + { + "entropy": 1.9485036730766296, + "epoch": 0.3233828725184232, + "grad_norm": 8.875260353088379, + "learning_rate": 4.448732102946378e-06, + "loss": 0.5439, + "mean_token_accuracy": 0.83717480301857, + "num_tokens": 125407354.0, + "step": 104320 + }, + { + "entropy": 1.8549368754029274, + "epoch": 0.32341387164347285, + "grad_norm": 3.1946167945861816, + "learning_rate": 4.448518890957789e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8576024889945983, + "num_tokens": 125419710.0, + "step": 104330 + }, + { + "entropy": 1.8528822794556619, + "epoch": 0.3234448707685226, + "grad_norm": 5.784121036529541, + "learning_rate": 4.44830570962175e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8541821107268334, + "num_tokens": 125432442.0, + "step": 104340 + }, + { + "entropy": 1.9138613402843476, + "epoch": 0.32347586989357224, + "grad_norm": 3.181318998336792, + "learning_rate": 4.448092558930918e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8456951528787613, + "num_tokens": 125444223.0, + "step": 104350 + }, + { + "entropy": 1.893858689069748, + "epoch": 0.32350686901862197, + "grad_norm": 7.411118030548096, + "learning_rate": 4.447879438877952e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.8454016864299774, + "num_tokens": 125455851.0, + "step": 104360 + }, + { + "entropy": 1.9555244013667106, + "epoch": 0.32353786814367164, + "grad_norm": 8.233355522155762, + "learning_rate": 4.447666349455512e-06, + "loss": 0.5283, + "mean_token_accuracy": 0.8410166382789612, + "num_tokens": 125466870.0, + "step": 104370 + }, + { + "entropy": 1.9233867451548576, + "epoch": 0.32356886726872136, + "grad_norm": 8.369714736938477, + "learning_rate": 4.44745329065626e-06, + "loss": 0.5215, + "mean_token_accuracy": 0.8294639691710473, + "num_tokens": 125477884.0, + "step": 104380 + }, + { + "entropy": 1.842393586039543, + "epoch": 0.32359986639377103, + "grad_norm": 8.982789993286133, + "learning_rate": 4.447240262472865e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8471063405275345, + "num_tokens": 125490187.0, + "step": 104390 + }, + { + "entropy": 1.695570257306099, + "epoch": 0.32363086551882075, + "grad_norm": 8.46627426147461, + "learning_rate": 4.447027264897993e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8552982524037361, + "num_tokens": 125504771.0, + "step": 104400 + }, + { + "entropy": 1.790937101840973, + "epoch": 0.3236618646438704, + "grad_norm": 7.850381374359131, + "learning_rate": 4.446814297924315e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8472680017352104, + "num_tokens": 125517462.0, + "step": 104410 + }, + { + "entropy": 1.9535409420728684, + "epoch": 0.32369286376892015, + "grad_norm": 8.090740203857422, + "learning_rate": 4.446601361544507e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8454528123140335, + "num_tokens": 125528256.0, + "step": 104420 + }, + { + "entropy": 1.8886640325188637, + "epoch": 0.3237238628939698, + "grad_norm": 4.096344470977783, + "learning_rate": 4.44638845575124e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8485276773571968, + "num_tokens": 125540038.0, + "step": 104430 + }, + { + "entropy": 1.8927355989813806, + "epoch": 0.32375486201901954, + "grad_norm": 8.601818084716797, + "learning_rate": 4.446175580537197e-06, + "loss": 0.5351, + "mean_token_accuracy": 0.8406930550932884, + "num_tokens": 125551783.0, + "step": 104440 + }, + { + "entropy": 1.933559662103653, + "epoch": 0.3237858611440692, + "grad_norm": 7.8219475746154785, + "learning_rate": 4.445962735895055e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.8485094651579856, + "num_tokens": 125562679.0, + "step": 104450 + }, + { + "entropy": 1.8612630635499954, + "epoch": 0.3238168602691189, + "grad_norm": 8.244562149047852, + "learning_rate": 4.445749921817498e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8419337302446366, + "num_tokens": 125574193.0, + "step": 104460 + }, + { + "entropy": 1.8549982547760009, + "epoch": 0.3238478593941686, + "grad_norm": 9.33348274230957, + "learning_rate": 4.445537138297214e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8472777247428894, + "num_tokens": 125585671.0, + "step": 104470 + }, + { + "entropy": 1.8255490154027938, + "epoch": 0.32387885851921827, + "grad_norm": 7.582674503326416, + "learning_rate": 4.445324385326889e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8529926791787148, + "num_tokens": 125599259.0, + "step": 104480 + }, + { + "entropy": 1.8867202073335647, + "epoch": 0.323909857644268, + "grad_norm": 7.966434478759766, + "learning_rate": 4.445111662899213e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8491407498717308, + "num_tokens": 125611435.0, + "step": 104490 + }, + { + "entropy": 1.8497378468513488, + "epoch": 0.32394085676931766, + "grad_norm": 5.026673793792725, + "learning_rate": 4.444898971006879e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8529240384697914, + "num_tokens": 125623829.0, + "step": 104500 + }, + { + "entropy": 1.8570104882121086, + "epoch": 0.3239718558943674, + "grad_norm": 3.9983022212982178, + "learning_rate": 4.444686309642584e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8561410129070282, + "num_tokens": 125636076.0, + "step": 104510 + }, + { + "entropy": 1.9286787793040276, + "epoch": 0.32400285501941706, + "grad_norm": 9.436726570129395, + "learning_rate": 4.444473678799025e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.8409220159053803, + "num_tokens": 125647410.0, + "step": 104520 + }, + { + "entropy": 1.790224689245224, + "epoch": 0.3240338541444668, + "grad_norm": 7.947468280792236, + "learning_rate": 4.444261078468901e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8576122537255287, + "num_tokens": 125660817.0, + "step": 104530 + }, + { + "entropy": 1.979808408021927, + "epoch": 0.32406485326951645, + "grad_norm": 8.681998252868652, + "learning_rate": 4.444048508644915e-06, + "loss": 0.5421, + "mean_token_accuracy": 0.8427085399627685, + "num_tokens": 125671400.0, + "step": 104540 + }, + { + "entropy": 1.8585129737854005, + "epoch": 0.3240958523945662, + "grad_norm": 8.156949043273926, + "learning_rate": 4.443835969319773e-06, + "loss": 0.5264, + "mean_token_accuracy": 0.8462090104818344, + "num_tokens": 125684225.0, + "step": 104550 + }, + { + "entropy": 1.8992905005812646, + "epoch": 0.32412685151961584, + "grad_norm": 10.150556564331055, + "learning_rate": 4.443623460486183e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.838335144519806, + "num_tokens": 125696461.0, + "step": 104560 + }, + { + "entropy": 1.8747254326939582, + "epoch": 0.32415785064466557, + "grad_norm": 8.261116981506348, + "learning_rate": 4.443410982136853e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8448391363024712, + "num_tokens": 125708703.0, + "step": 104570 + }, + { + "entropy": 1.812967798113823, + "epoch": 0.32418884976971524, + "grad_norm": 9.305374145507812, + "learning_rate": 4.443198534264497e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.853294064104557, + "num_tokens": 125721322.0, + "step": 104580 + }, + { + "entropy": 1.8749259188771248, + "epoch": 0.32421984889476496, + "grad_norm": 8.172529220581055, + "learning_rate": 4.442986116861831e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.86143379509449, + "num_tokens": 125733217.0, + "step": 104590 + }, + { + "entropy": 1.9058199599385262, + "epoch": 0.32425084801981463, + "grad_norm": 4.497680187225342, + "learning_rate": 4.442773729921569e-06, + "loss": 0.5266, + "mean_token_accuracy": 0.832387951016426, + "num_tokens": 125744499.0, + "step": 104600 + }, + { + "entropy": 1.9057392075657844, + "epoch": 0.32428184714486435, + "grad_norm": 9.339095115661621, + "learning_rate": 4.4425613734364346e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8399216368794441, + "num_tokens": 125755625.0, + "step": 104610 + }, + { + "entropy": 1.887702539563179, + "epoch": 0.324312846269914, + "grad_norm": 9.061288833618164, + "learning_rate": 4.442349047399148e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.8430167078971863, + "num_tokens": 125767881.0, + "step": 104620 + }, + { + "entropy": 1.8627898827195168, + "epoch": 0.32434384539496375, + "grad_norm": 7.091198921203613, + "learning_rate": 4.442136751802433e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8567918986082077, + "num_tokens": 125779283.0, + "step": 104630 + }, + { + "entropy": 1.8987063318490982, + "epoch": 0.3243748445200134, + "grad_norm": 7.691847324371338, + "learning_rate": 4.441924486639018e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8585168570280075, + "num_tokens": 125790750.0, + "step": 104640 + }, + { + "entropy": 1.8565407916903496, + "epoch": 0.32440584364506314, + "grad_norm": 8.805460929870605, + "learning_rate": 4.441712251901632e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8448611825704575, + "num_tokens": 125803070.0, + "step": 104650 + }, + { + "entropy": 1.8467104405164718, + "epoch": 0.3244368427701128, + "grad_norm": 7.713294982910156, + "learning_rate": 4.4415000475830064e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.85001719892025, + "num_tokens": 125814740.0, + "step": 104660 + }, + { + "entropy": 1.8211202561855315, + "epoch": 0.32446784189516253, + "grad_norm": 4.372882843017578, + "learning_rate": 4.441287873675877e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8514401495456696, + "num_tokens": 125826554.0, + "step": 104670 + }, + { + "entropy": 1.7906322300434112, + "epoch": 0.3244988410202122, + "grad_norm": 7.335409641265869, + "learning_rate": 4.44107573017298e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8584406465291977, + "num_tokens": 125839095.0, + "step": 104680 + }, + { + "entropy": 1.887892808020115, + "epoch": 0.3245298401452619, + "grad_norm": 9.185553550720215, + "learning_rate": 4.4408636170670526e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.839701421558857, + "num_tokens": 125850893.0, + "step": 104690 + }, + { + "entropy": 1.9207829117774964, + "epoch": 0.3245608392703116, + "grad_norm": 10.154474258422852, + "learning_rate": 4.4406515343508405e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8411696791648865, + "num_tokens": 125862536.0, + "step": 104700 + }, + { + "entropy": 1.9450317040085792, + "epoch": 0.32459183839536127, + "grad_norm": 10.254415512084961, + "learning_rate": 4.440439482017084e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8435680419206619, + "num_tokens": 125873937.0, + "step": 104710 + }, + { + "entropy": 1.9040479019284249, + "epoch": 0.324622837520411, + "grad_norm": 7.563967704772949, + "learning_rate": 4.440227460058531e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8446063458919525, + "num_tokens": 125885785.0, + "step": 104720 + }, + { + "entropy": 1.9021693363785743, + "epoch": 0.32465383664546066, + "grad_norm": 8.840468406677246, + "learning_rate": 4.440015468467932e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8490529716014862, + "num_tokens": 125897435.0, + "step": 104730 + }, + { + "entropy": 1.9175179213285447, + "epoch": 0.3246848357705104, + "grad_norm": 3.9579575061798096, + "learning_rate": 4.439803507238037e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8449933081865311, + "num_tokens": 125908914.0, + "step": 104740 + }, + { + "entropy": 1.8500194996595383, + "epoch": 0.32471583489556005, + "grad_norm": 8.99378490447998, + "learning_rate": 4.439591576361599e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8460746437311173, + "num_tokens": 125921013.0, + "step": 104750 + }, + { + "entropy": 1.8048820734024047, + "epoch": 0.3247468340206098, + "grad_norm": 2.689271926879883, + "learning_rate": 4.439379675831374e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8416306912899018, + "num_tokens": 125934812.0, + "step": 104760 + }, + { + "entropy": 1.8943803861737252, + "epoch": 0.32477783314565944, + "grad_norm": 9.720478057861328, + "learning_rate": 4.439167805640121e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8504800125956535, + "num_tokens": 125946337.0, + "step": 104770 + }, + { + "entropy": 1.8520411089062692, + "epoch": 0.32480883227070917, + "grad_norm": 7.526730060577393, + "learning_rate": 4.438955965780603e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8448957860469818, + "num_tokens": 125958824.0, + "step": 104780 + }, + { + "entropy": 1.9352994859218597, + "epoch": 0.32483983139575884, + "grad_norm": 9.028653144836426, + "learning_rate": 4.438744156245582e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8472319498658181, + "num_tokens": 125970046.0, + "step": 104790 + }, + { + "entropy": 1.8998494163155555, + "epoch": 0.32487083052080856, + "grad_norm": 7.81031608581543, + "learning_rate": 4.438532377027824e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.847889693081379, + "num_tokens": 125981669.0, + "step": 104800 + }, + { + "entropy": 1.8678305372595787, + "epoch": 0.32490182964585823, + "grad_norm": 6.961506366729736, + "learning_rate": 4.438320628120095e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8497982889413833, + "num_tokens": 125994098.0, + "step": 104810 + }, + { + "entropy": 1.8759206786751748, + "epoch": 0.32493282877090796, + "grad_norm": 7.766717433929443, + "learning_rate": 4.43810890951517e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.8452546551823616, + "num_tokens": 126006395.0, + "step": 104820 + }, + { + "entropy": 1.9110927432775497, + "epoch": 0.3249638278959576, + "grad_norm": 9.222915649414062, + "learning_rate": 4.437897221205818e-06, + "loss": 0.5202, + "mean_token_accuracy": 0.8430734008550644, + "num_tokens": 126017620.0, + "step": 104830 + }, + { + "entropy": 1.81925430893898, + "epoch": 0.32499482702100735, + "grad_norm": 8.071293830871582, + "learning_rate": 4.4376855631848185e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8518892988562584, + "num_tokens": 126030236.0, + "step": 104840 + }, + { + "entropy": 1.8442829206585885, + "epoch": 0.325025826146057, + "grad_norm": 4.356393337249756, + "learning_rate": 4.437473935444945e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8437263607978821, + "num_tokens": 126043043.0, + "step": 104850 + }, + { + "entropy": 1.8087534308433533, + "epoch": 0.32505682527110674, + "grad_norm": 8.250895500183105, + "learning_rate": 4.437262337978981e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8567241251468658, + "num_tokens": 126055544.0, + "step": 104860 + }, + { + "entropy": 1.866674281656742, + "epoch": 0.3250878243961564, + "grad_norm": 7.741235733032227, + "learning_rate": 4.437050770779709e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8507555142045021, + "num_tokens": 126067524.0, + "step": 104870 + }, + { + "entropy": 1.9428170025348663, + "epoch": 0.32511882352120614, + "grad_norm": 7.691112995147705, + "learning_rate": 4.436839233839913e-06, + "loss": 0.5591, + "mean_token_accuracy": 0.833220262825489, + "num_tokens": 126078929.0, + "step": 104880 + }, + { + "entropy": 1.8663004338741302, + "epoch": 0.3251498226462558, + "grad_norm": 6.292846202850342, + "learning_rate": 4.436627727152381e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8499148935079575, + "num_tokens": 126091005.0, + "step": 104890 + }, + { + "entropy": 1.9085410133004188, + "epoch": 0.32518082177130553, + "grad_norm": 8.354232788085938, + "learning_rate": 4.436416250709903e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8494470819830895, + "num_tokens": 126102258.0, + "step": 104900 + }, + { + "entropy": 1.9257890343666078, + "epoch": 0.3252118208963552, + "grad_norm": 8.245911598205566, + "learning_rate": 4.436204804505272e-06, + "loss": 0.5215, + "mean_token_accuracy": 0.8361289039254188, + "num_tokens": 126112823.0, + "step": 104910 + }, + { + "entropy": 1.8960036650300025, + "epoch": 0.3252428200214049, + "grad_norm": 7.979340553283691, + "learning_rate": 4.435993388531282e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8566076219081878, + "num_tokens": 126123821.0, + "step": 104920 + }, + { + "entropy": 1.7642639800906181, + "epoch": 0.3252738191464546, + "grad_norm": 4.4087324142456055, + "learning_rate": 4.435782002780731e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8499569103121758, + "num_tokens": 126136484.0, + "step": 104930 + }, + { + "entropy": 1.8943352848291397, + "epoch": 0.32530481827150426, + "grad_norm": 7.900808334350586, + "learning_rate": 4.435570647246417e-06, + "loss": 0.5372, + "mean_token_accuracy": 0.8378649175167083, + "num_tokens": 126147568.0, + "step": 104940 + }, + { + "entropy": 1.7571329042315482, + "epoch": 0.325335817396554, + "grad_norm": 5.164160251617432, + "learning_rate": 4.435359321921144e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8640245348215103, + "num_tokens": 126161243.0, + "step": 104950 + }, + { + "entropy": 1.8740806519985198, + "epoch": 0.32536681652160365, + "grad_norm": 7.368401050567627, + "learning_rate": 4.435148026797714e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8456128895282745, + "num_tokens": 126173067.0, + "step": 104960 + }, + { + "entropy": 1.852415107935667, + "epoch": 0.3253978156466534, + "grad_norm": 8.494685173034668, + "learning_rate": 4.434936761868937e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8431513279676437, + "num_tokens": 126185590.0, + "step": 104970 + }, + { + "entropy": 1.872846459597349, + "epoch": 0.32542881477170305, + "grad_norm": 2.7987353801727295, + "learning_rate": 4.434725527127619e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8551968723535538, + "num_tokens": 126197812.0, + "step": 104980 + }, + { + "entropy": 1.8627401351928712, + "epoch": 0.32545981389675277, + "grad_norm": 4.279714584350586, + "learning_rate": 4.434514322566573e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8442763239145279, + "num_tokens": 126210066.0, + "step": 104990 + }, + { + "entropy": 1.88872180134058, + "epoch": 0.32549081302180244, + "grad_norm": 7.514123439788818, + "learning_rate": 4.434303148178613e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8437312439084053, + "num_tokens": 126222842.0, + "step": 105000 + }, + { + "entropy": 1.9054294735193253, + "epoch": 0.32552181214685216, + "grad_norm": 7.55472993850708, + "learning_rate": 4.434092003956556e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8568596675992012, + "num_tokens": 126233568.0, + "step": 105010 + }, + { + "entropy": 1.9124674081802369, + "epoch": 0.32555281127190183, + "grad_norm": 10.53081226348877, + "learning_rate": 4.4338808898932204e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.84600650370121, + "num_tokens": 126245643.0, + "step": 105020 + }, + { + "entropy": 1.8103389233350753, + "epoch": 0.32558381039695156, + "grad_norm": 3.0053796768188477, + "learning_rate": 4.433669805981426e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8554301127791405, + "num_tokens": 126257601.0, + "step": 105030 + }, + { + "entropy": 1.9138575494289398, + "epoch": 0.3256148095220012, + "grad_norm": 8.38183879852295, + "learning_rate": 4.433458752213998e-06, + "loss": 0.5186, + "mean_token_accuracy": 0.838289988040924, + "num_tokens": 126269101.0, + "step": 105040 + }, + { + "entropy": 1.8735656663775444, + "epoch": 0.32564580864705095, + "grad_norm": 8.337263107299805, + "learning_rate": 4.433247728583761e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8512678787112236, + "num_tokens": 126279995.0, + "step": 105050 + }, + { + "entropy": 1.8865850910544395, + "epoch": 0.3256768077721006, + "grad_norm": 10.122349739074707, + "learning_rate": 4.433036735083546e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8520815536379814, + "num_tokens": 126291441.0, + "step": 105060 + }, + { + "entropy": 1.8525515541434288, + "epoch": 0.32570780689715034, + "grad_norm": 9.650886535644531, + "learning_rate": 4.43282577170618e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8546164348721504, + "num_tokens": 126303505.0, + "step": 105070 + }, + { + "entropy": 1.813267020881176, + "epoch": 0.3257388060222, + "grad_norm": 8.041412353515625, + "learning_rate": 4.432614838444499e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8510597214102745, + "num_tokens": 126316240.0, + "step": 105080 + }, + { + "entropy": 1.837722858786583, + "epoch": 0.32576980514724974, + "grad_norm": 7.805492401123047, + "learning_rate": 4.432403935291336e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8534246236085892, + "num_tokens": 126328521.0, + "step": 105090 + }, + { + "entropy": 1.916102121770382, + "epoch": 0.3258008042722994, + "grad_norm": 8.766565322875977, + "learning_rate": 4.432193062239532e-06, + "loss": 0.5365, + "mean_token_accuracy": 0.8374799504876137, + "num_tokens": 126340386.0, + "step": 105100 + }, + { + "entropy": 1.894291676580906, + "epoch": 0.32583180339734913, + "grad_norm": 7.835315227508545, + "learning_rate": 4.431982219281925e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8560818731784821, + "num_tokens": 126351585.0, + "step": 105110 + }, + { + "entropy": 1.9061674028635025, + "epoch": 0.3258628025223988, + "grad_norm": 8.130203247070312, + "learning_rate": 4.431771406411358e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8496122330427169, + "num_tokens": 126362872.0, + "step": 105120 + }, + { + "entropy": 1.980933591723442, + "epoch": 0.3258938016474485, + "grad_norm": 8.286996841430664, + "learning_rate": 4.431560623620675e-06, + "loss": 0.5517, + "mean_token_accuracy": 0.8369853526353837, + "num_tokens": 126374021.0, + "step": 105130 + }, + { + "entropy": 1.8085208341479302, + "epoch": 0.3259248007724982, + "grad_norm": 6.969970226287842, + "learning_rate": 4.431349870902727e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8551606863737107, + "num_tokens": 126387654.0, + "step": 105140 + }, + { + "entropy": 1.9031474754214286, + "epoch": 0.3259557998975479, + "grad_norm": 7.189389705657959, + "learning_rate": 4.431139148250362e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8574759498238563, + "num_tokens": 126398864.0, + "step": 105150 + }, + { + "entropy": 1.7899470388889314, + "epoch": 0.3259867990225976, + "grad_norm": 7.750818252563477, + "learning_rate": 4.430928455656429e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8657332420349121, + "num_tokens": 126412367.0, + "step": 105160 + }, + { + "entropy": 1.812622857093811, + "epoch": 0.3260177981476473, + "grad_norm": 6.185579776763916, + "learning_rate": 4.4307177931137864e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8634996026754379, + "num_tokens": 126424902.0, + "step": 105170 + }, + { + "entropy": 1.9225602239370345, + "epoch": 0.326048797272697, + "grad_norm": 9.488457679748535, + "learning_rate": 4.4305071606152906e-06, + "loss": 0.5349, + "mean_token_accuracy": 0.8386317417025566, + "num_tokens": 126436598.0, + "step": 105180 + }, + { + "entropy": 1.8649600803852082, + "epoch": 0.32607979639774665, + "grad_norm": 9.695698738098145, + "learning_rate": 4.4302965581538e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8493688553571701, + "num_tokens": 126447796.0, + "step": 105190 + }, + { + "entropy": 1.8659547924995423, + "epoch": 0.32611079552279637, + "grad_norm": 8.478187561035156, + "learning_rate": 4.4300859857221765e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8515862852334977, + "num_tokens": 126460081.0, + "step": 105200 + }, + { + "entropy": 1.8974830508232117, + "epoch": 0.32614179464784604, + "grad_norm": 4.3100457191467285, + "learning_rate": 4.429875443313283e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8492521673440934, + "num_tokens": 126471858.0, + "step": 105210 + }, + { + "entropy": 1.8549850180745124, + "epoch": 0.32617279377289576, + "grad_norm": 8.894652366638184, + "learning_rate": 4.429664930919989e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8504776194691658, + "num_tokens": 126483795.0, + "step": 105220 + }, + { + "entropy": 1.8823614329099656, + "epoch": 0.32620379289794543, + "grad_norm": 8.619571685791016, + "learning_rate": 4.429454448535162e-06, + "loss": 0.5536, + "mean_token_accuracy": 0.8416207283735275, + "num_tokens": 126495379.0, + "step": 105230 + }, + { + "entropy": 1.8804425790905952, + "epoch": 0.32623479202299516, + "grad_norm": 8.832228660583496, + "learning_rate": 4.429243996151671e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8424814388155937, + "num_tokens": 126507418.0, + "step": 105240 + }, + { + "entropy": 1.9131366968154908, + "epoch": 0.3262657911480448, + "grad_norm": 9.161233901977539, + "learning_rate": 4.4290335737623915e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8429681926965713, + "num_tokens": 126519465.0, + "step": 105250 + }, + { + "entropy": 1.8811295062303544, + "epoch": 0.32629679027309455, + "grad_norm": 7.556185245513916, + "learning_rate": 4.4288231813602e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8429519325494766, + "num_tokens": 126531139.0, + "step": 105260 + }, + { + "entropy": 1.9287168130278587, + "epoch": 0.3263277893981442, + "grad_norm": 8.719147682189941, + "learning_rate": 4.428612818937974e-06, + "loss": 0.5497, + "mean_token_accuracy": 0.8285974323749542, + "num_tokens": 126542914.0, + "step": 105270 + }, + { + "entropy": 1.8551797360181808, + "epoch": 0.32635878852319394, + "grad_norm": 3.8900067806243896, + "learning_rate": 4.428402486488593e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8506741672754288, + "num_tokens": 126554599.0, + "step": 105280 + }, + { + "entropy": 1.9169548124074935, + "epoch": 0.3263897876482436, + "grad_norm": 7.38561487197876, + "learning_rate": 4.428192184004942e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8483023956418038, + "num_tokens": 126565301.0, + "step": 105290 + }, + { + "entropy": 1.9362420484423637, + "epoch": 0.32642078677329334, + "grad_norm": 8.075181007385254, + "learning_rate": 4.427981911479907e-06, + "loss": 0.5269, + "mean_token_accuracy": 0.837260690331459, + "num_tokens": 126576660.0, + "step": 105300 + }, + { + "entropy": 1.9895741552114488, + "epoch": 0.326451785898343, + "grad_norm": 8.598695755004883, + "learning_rate": 4.427771668906373e-06, + "loss": 0.5641, + "mean_token_accuracy": 0.8313561111688614, + "num_tokens": 126587261.0, + "step": 105310 + }, + { + "entropy": 1.9371146634221077, + "epoch": 0.32648278502339273, + "grad_norm": 4.246029376983643, + "learning_rate": 4.427561456277231e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8425526946783066, + "num_tokens": 126598325.0, + "step": 105320 + }, + { + "entropy": 1.897105310857296, + "epoch": 0.3265137841484424, + "grad_norm": 4.175163269042969, + "learning_rate": 4.427351273585373e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8386947169899941, + "num_tokens": 126610456.0, + "step": 105330 + }, + { + "entropy": 1.9232771515846252, + "epoch": 0.3265447832734921, + "grad_norm": 7.9354071617126465, + "learning_rate": 4.427141120823697e-06, + "loss": 0.5314, + "mean_token_accuracy": 0.8433753445744514, + "num_tokens": 126621448.0, + "step": 105340 + }, + { + "entropy": 1.9362381488084792, + "epoch": 0.3265757823985418, + "grad_norm": 8.758517265319824, + "learning_rate": 4.426930997985096e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8483623921871185, + "num_tokens": 126632162.0, + "step": 105350 + }, + { + "entropy": 1.8089591830968856, + "epoch": 0.3266067815235915, + "grad_norm": 8.597465515136719, + "learning_rate": 4.426720905062472e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.851777009665966, + "num_tokens": 126645035.0, + "step": 105360 + }, + { + "entropy": 1.9275547727942466, + "epoch": 0.3266377806486412, + "grad_norm": 7.728219509124756, + "learning_rate": 4.426510842048728e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.8419729739427566, + "num_tokens": 126656366.0, + "step": 105370 + }, + { + "entropy": 1.947744831442833, + "epoch": 0.3266687797736909, + "grad_norm": 8.231205940246582, + "learning_rate": 4.426300808936765e-06, + "loss": 0.5361, + "mean_token_accuracy": 0.8379727810621261, + "num_tokens": 126667203.0, + "step": 105380 + }, + { + "entropy": 1.89700628221035, + "epoch": 0.3266997788987406, + "grad_norm": 8.626848220825195, + "learning_rate": 4.426090805719492e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8507444426417351, + "num_tokens": 126679228.0, + "step": 105390 + }, + { + "entropy": 1.7944568783044814, + "epoch": 0.3267307780237903, + "grad_norm": 7.826314926147461, + "learning_rate": 4.4258808323898175e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8589142486453056, + "num_tokens": 126692017.0, + "step": 105400 + }, + { + "entropy": 1.8504990950226783, + "epoch": 0.32676177714883997, + "grad_norm": 8.685942649841309, + "learning_rate": 4.425670888940653e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8543372765183449, + "num_tokens": 126704537.0, + "step": 105410 + }, + { + "entropy": 1.774632167816162, + "epoch": 0.3267927762738897, + "grad_norm": 8.145844459533691, + "learning_rate": 4.425460975364912e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.853350467979908, + "num_tokens": 126717875.0, + "step": 105420 + }, + { + "entropy": 1.878597044944763, + "epoch": 0.32682377539893936, + "grad_norm": 8.8296480178833, + "learning_rate": 4.425251091655509e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8486263751983643, + "num_tokens": 126729109.0, + "step": 105430 + }, + { + "entropy": 1.8818610459566116, + "epoch": 0.32685477452398903, + "grad_norm": 7.772421836853027, + "learning_rate": 4.425041237805365e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8495032295584679, + "num_tokens": 126741332.0, + "step": 105440 + }, + { + "entropy": 1.8752919748425483, + "epoch": 0.32688577364903876, + "grad_norm": 7.812198162078857, + "learning_rate": 4.4248314138074e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8592524334788323, + "num_tokens": 126753407.0, + "step": 105450 + }, + { + "entropy": 1.8762979090213776, + "epoch": 0.3269167727740884, + "grad_norm": 3.573888063430786, + "learning_rate": 4.424621619654536e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8430203974246979, + "num_tokens": 126764669.0, + "step": 105460 + }, + { + "entropy": 1.9314564675092698, + "epoch": 0.32694777189913815, + "grad_norm": 8.204230308532715, + "learning_rate": 4.4244118553397e-06, + "loss": 0.5152, + "mean_token_accuracy": 0.8395652711391449, + "num_tokens": 126775741.0, + "step": 105470 + }, + { + "entropy": 1.8367981255054473, + "epoch": 0.3269787710241878, + "grad_norm": 8.883301734924316, + "learning_rate": 4.424202120855818e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8451298654079438, + "num_tokens": 126788018.0, + "step": 105480 + }, + { + "entropy": 1.9303217083215714, + "epoch": 0.32700977014923754, + "grad_norm": 7.516885757446289, + "learning_rate": 4.423992416195822e-06, + "loss": 0.524, + "mean_token_accuracy": 0.8447012811899185, + "num_tokens": 126798946.0, + "step": 105490 + }, + { + "entropy": 1.8869639337062836, + "epoch": 0.3270407692742872, + "grad_norm": 7.857840538024902, + "learning_rate": 4.4237827413526425e-06, + "loss": 0.5241, + "mean_token_accuracy": 0.8342716425657273, + "num_tokens": 126810950.0, + "step": 105500 + }, + { + "entropy": 1.8075002878904343, + "epoch": 0.32707176839933694, + "grad_norm": 3.914418935775757, + "learning_rate": 4.423573096319217e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8586590677499771, + "num_tokens": 126823453.0, + "step": 105510 + }, + { + "entropy": 1.898749789595604, + "epoch": 0.3271027675243866, + "grad_norm": 8.268956184387207, + "learning_rate": 4.423363481088481e-06, + "loss": 0.5246, + "mean_token_accuracy": 0.8393109634518623, + "num_tokens": 126835289.0, + "step": 105520 + }, + { + "entropy": 1.7726179018616677, + "epoch": 0.32713376664943633, + "grad_norm": 3.696336030960083, + "learning_rate": 4.423153895653373e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8711417764425278, + "num_tokens": 126847849.0, + "step": 105530 + }, + { + "entropy": 1.7812189802527427, + "epoch": 0.327164765774486, + "grad_norm": 7.559261798858643, + "learning_rate": 4.422944340006837e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8589012801647187, + "num_tokens": 126860795.0, + "step": 105540 + }, + { + "entropy": 1.869552193582058, + "epoch": 0.3271957648995357, + "grad_norm": 7.7391252517700195, + "learning_rate": 4.4227348141418165e-06, + "loss": 0.5163, + "mean_token_accuracy": 0.8421175703406334, + "num_tokens": 126872641.0, + "step": 105550 + }, + { + "entropy": 1.847855243086815, + "epoch": 0.3272267640245854, + "grad_norm": 8.301652908325195, + "learning_rate": 4.422525318051257e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8523299112915993, + "num_tokens": 126884824.0, + "step": 105560 + }, + { + "entropy": 1.8750713467597961, + "epoch": 0.3272577631496351, + "grad_norm": 9.099956512451172, + "learning_rate": 4.422315851728109e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.848430560529232, + "num_tokens": 126896206.0, + "step": 105570 + }, + { + "entropy": 1.8710100874304771, + "epoch": 0.3272887622746848, + "grad_norm": 7.930703163146973, + "learning_rate": 4.422106415165322e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8564017862081528, + "num_tokens": 126907868.0, + "step": 105580 + }, + { + "entropy": 1.9006529331207276, + "epoch": 0.3273197613997345, + "grad_norm": 3.9899120330810547, + "learning_rate": 4.4218970083558505e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.8440683215856553, + "num_tokens": 126919807.0, + "step": 105590 + }, + { + "entropy": 1.8714349627494813, + "epoch": 0.3273507605247842, + "grad_norm": 8.205395698547363, + "learning_rate": 4.421687631292651e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.8427043676376342, + "num_tokens": 126931174.0, + "step": 105600 + }, + { + "entropy": 1.8943498641252519, + "epoch": 0.3273817596498339, + "grad_norm": 5.008574485778809, + "learning_rate": 4.4214782839686805e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8429151371121406, + "num_tokens": 126942843.0, + "step": 105610 + }, + { + "entropy": 1.8473748803138732, + "epoch": 0.32741275877488357, + "grad_norm": 8.37645435333252, + "learning_rate": 4.421268966376901e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8443385511636734, + "num_tokens": 126954725.0, + "step": 105620 + }, + { + "entropy": 1.7291858568787575, + "epoch": 0.3274437578999333, + "grad_norm": 3.739603042602539, + "learning_rate": 4.421059678510274e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8665387943387032, + "num_tokens": 126968620.0, + "step": 105630 + }, + { + "entropy": 1.7893247798085212, + "epoch": 0.32747475702498297, + "grad_norm": 4.73966121673584, + "learning_rate": 4.420850420361765e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8631637006998062, + "num_tokens": 126981603.0, + "step": 105640 + }, + { + "entropy": 1.8417418763041495, + "epoch": 0.3275057561500327, + "grad_norm": 3.766573190689087, + "learning_rate": 4.420641191924342e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8520455986261368, + "num_tokens": 126994028.0, + "step": 105650 + }, + { + "entropy": 1.8928628534078598, + "epoch": 0.32753675527508236, + "grad_norm": 10.196480751037598, + "learning_rate": 4.420431993190975e-06, + "loss": 0.5382, + "mean_token_accuracy": 0.8365366339683533, + "num_tokens": 127005092.0, + "step": 105660 + }, + { + "entropy": 1.8437321752309799, + "epoch": 0.3275677544001321, + "grad_norm": 3.234116792678833, + "learning_rate": 4.4202228241546354e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8564884319901467, + "num_tokens": 127017233.0, + "step": 105670 + }, + { + "entropy": 1.8227815330028534, + "epoch": 0.32759875352518175, + "grad_norm": 6.8498921394348145, + "learning_rate": 4.420013684808299e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8651960000395775, + "num_tokens": 127028817.0, + "step": 105680 + }, + { + "entropy": 1.8443077996373176, + "epoch": 0.3276297526502314, + "grad_norm": 5.660989761352539, + "learning_rate": 4.419804575144942e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8490277513861656, + "num_tokens": 127041425.0, + "step": 105690 + }, + { + "entropy": 1.8642980486154557, + "epoch": 0.32766075177528114, + "grad_norm": 7.390862464904785, + "learning_rate": 4.419595495157543e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8533131554722786, + "num_tokens": 127053051.0, + "step": 105700 + }, + { + "entropy": 1.906336459517479, + "epoch": 0.3276917509003308, + "grad_norm": 7.918664455413818, + "learning_rate": 4.419386444839084e-06, + "loss": 0.536, + "mean_token_accuracy": 0.8386664062738418, + "num_tokens": 127064707.0, + "step": 105710 + }, + { + "entropy": 1.8976897805929185, + "epoch": 0.32772275002538054, + "grad_norm": 8.025975227355957, + "learning_rate": 4.419177424182549e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8574541077017784, + "num_tokens": 127075139.0, + "step": 105720 + }, + { + "entropy": 1.7558037593960762, + "epoch": 0.3277537491504302, + "grad_norm": 8.160879135131836, + "learning_rate": 4.418968433180924e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8632358491420746, + "num_tokens": 127088887.0, + "step": 105730 + }, + { + "entropy": 1.7168943211436272, + "epoch": 0.32778474827547993, + "grad_norm": 4.3874359130859375, + "learning_rate": 4.418759471827199e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.861461877822876, + "num_tokens": 127103106.0, + "step": 105740 + }, + { + "entropy": 1.84348274320364, + "epoch": 0.3278157474005296, + "grad_norm": 4.105869770050049, + "learning_rate": 4.418550540114362e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8481574520468712, + "num_tokens": 127115076.0, + "step": 105750 + }, + { + "entropy": 1.803061343729496, + "epoch": 0.3278467465255793, + "grad_norm": 4.1507368087768555, + "learning_rate": 4.418341638035409e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8516246452927589, + "num_tokens": 127127443.0, + "step": 105760 + }, + { + "entropy": 1.821767383813858, + "epoch": 0.327877745650629, + "grad_norm": 3.866901159286499, + "learning_rate": 4.4181327655833315e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8582574099302291, + "num_tokens": 127139967.0, + "step": 105770 + }, + { + "entropy": 1.8919250801205636, + "epoch": 0.3279087447756787, + "grad_norm": 7.636270999908447, + "learning_rate": 4.417923922751132e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8451293349266052, + "num_tokens": 127151560.0, + "step": 105780 + }, + { + "entropy": 1.8248420521616935, + "epoch": 0.3279397439007284, + "grad_norm": 3.9394943714141846, + "learning_rate": 4.417715109531807e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8457361742854118, + "num_tokens": 127163965.0, + "step": 105790 + }, + { + "entropy": 1.8081183552742004, + "epoch": 0.3279707430257781, + "grad_norm": 9.723323822021484, + "learning_rate": 4.41750632591836e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.856559830904007, + "num_tokens": 127176304.0, + "step": 105800 + }, + { + "entropy": 1.9597806930541992, + "epoch": 0.3280017421508278, + "grad_norm": 9.514747619628906, + "learning_rate": 4.417297571903797e-06, + "loss": 0.5702, + "mean_token_accuracy": 0.823980063199997, + "num_tokens": 127187023.0, + "step": 105810 + }, + { + "entropy": 1.8679911822080613, + "epoch": 0.3280327412758775, + "grad_norm": 8.901237487792969, + "learning_rate": 4.4170888474811235e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8576633468270302, + "num_tokens": 127199443.0, + "step": 105820 + }, + { + "entropy": 1.7843319460749627, + "epoch": 0.3280637404009272, + "grad_norm": 7.45582914352417, + "learning_rate": 4.4168801526433495e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8751977890729904, + "num_tokens": 127212000.0, + "step": 105830 + }, + { + "entropy": 1.8773673444986343, + "epoch": 0.3280947395259769, + "grad_norm": 7.825649261474609, + "learning_rate": 4.416671487383486e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8506253302097321, + "num_tokens": 127222912.0, + "step": 105840 + }, + { + "entropy": 1.9522982880473136, + "epoch": 0.32812573865102657, + "grad_norm": 11.103384017944336, + "learning_rate": 4.416462851694547e-06, + "loss": 0.5427, + "mean_token_accuracy": 0.8257913753390312, + "num_tokens": 127234149.0, + "step": 105850 + }, + { + "entropy": 1.9224087953567506, + "epoch": 0.3281567377760763, + "grad_norm": 8.676215171813965, + "learning_rate": 4.4162542455695495e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8379293918609619, + "num_tokens": 127245446.0, + "step": 105860 + }, + { + "entropy": 1.8061835870146752, + "epoch": 0.32818773690112596, + "grad_norm": 3.818633794784546, + "learning_rate": 4.416045669001512e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8506641060113906, + "num_tokens": 127258271.0, + "step": 105870 + }, + { + "entropy": 1.891014301776886, + "epoch": 0.3282187360261757, + "grad_norm": 8.472275733947754, + "learning_rate": 4.415837121983454e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.8417830243706703, + "num_tokens": 127269244.0, + "step": 105880 + }, + { + "entropy": 1.8804811149835587, + "epoch": 0.32824973515122535, + "grad_norm": 7.921799659729004, + "learning_rate": 4.415628604508402e-06, + "loss": 0.503, + "mean_token_accuracy": 0.843516594171524, + "num_tokens": 127281119.0, + "step": 105890 + }, + { + "entropy": 1.8675379946827888, + "epoch": 0.3282807342762751, + "grad_norm": 7.780767917633057, + "learning_rate": 4.415420116569378e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8497234523296356, + "num_tokens": 127292982.0, + "step": 105900 + }, + { + "entropy": 1.8299174696207046, + "epoch": 0.32831173340132475, + "grad_norm": 8.369937896728516, + "learning_rate": 4.41521165815941e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8549463152885437, + "num_tokens": 127305007.0, + "step": 105910 + }, + { + "entropy": 1.902072112262249, + "epoch": 0.32834273252637447, + "grad_norm": 6.59075403213501, + "learning_rate": 4.4150032292715315e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.84717957675457, + "num_tokens": 127316897.0, + "step": 105920 + }, + { + "entropy": 1.9103587403893472, + "epoch": 0.32837373165142414, + "grad_norm": 9.852415084838867, + "learning_rate": 4.414794829898772e-06, + "loss": 0.518, + "mean_token_accuracy": 0.8442836970090866, + "num_tokens": 127328580.0, + "step": 105930 + }, + { + "entropy": 1.8694602742791175, + "epoch": 0.3284047307764738, + "grad_norm": 9.172961235046387, + "learning_rate": 4.4145864600341656e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8534001842141151, + "num_tokens": 127339695.0, + "step": 105940 + }, + { + "entropy": 1.8509365767240524, + "epoch": 0.32843572990152353, + "grad_norm": 7.372804641723633, + "learning_rate": 4.414378119670751e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.85338544100523, + "num_tokens": 127350996.0, + "step": 105950 + }, + { + "entropy": 1.9594619423151016, + "epoch": 0.3284667290265732, + "grad_norm": 9.022714614868164, + "learning_rate": 4.414169808801567e-06, + "loss": 0.5748, + "mean_token_accuracy": 0.8271567165851593, + "num_tokens": 127361650.0, + "step": 105960 + }, + { + "entropy": 1.788424487411976, + "epoch": 0.3284977281516229, + "grad_norm": 8.611357688903809, + "learning_rate": 4.413961527419656e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8564797580242157, + "num_tokens": 127375584.0, + "step": 105970 + }, + { + "entropy": 1.8492983341217042, + "epoch": 0.3285287272766726, + "grad_norm": 6.395881175994873, + "learning_rate": 4.4137532755180615e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8596012353897095, + "num_tokens": 127388011.0, + "step": 105980 + }, + { + "entropy": 1.9068158611655235, + "epoch": 0.3285597264017223, + "grad_norm": 9.72159481048584, + "learning_rate": 4.4135450530898296e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.8352061986923218, + "num_tokens": 127399562.0, + "step": 105990 + }, + { + "entropy": 1.8263181149959564, + "epoch": 0.328590725526772, + "grad_norm": 9.199592590332031, + "learning_rate": 4.413336860128008e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.856217360496521, + "num_tokens": 127412251.0, + "step": 106000 + }, + { + "entropy": 1.7998489513993263, + "epoch": 0.3286217246518217, + "grad_norm": 7.553385257720947, + "learning_rate": 4.413128696625648e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8605973869562149, + "num_tokens": 127424851.0, + "step": 106010 + }, + { + "entropy": 1.8556075662374496, + "epoch": 0.3286527237768714, + "grad_norm": 9.884352684020996, + "learning_rate": 4.412920562575802e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8411578819155693, + "num_tokens": 127436563.0, + "step": 106020 + }, + { + "entropy": 1.8433560684323311, + "epoch": 0.3286837229019211, + "grad_norm": 9.31692123413086, + "learning_rate": 4.412712457971527e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.85454620718956, + "num_tokens": 127448302.0, + "step": 106030 + }, + { + "entropy": 1.8037659108638764, + "epoch": 0.3287147220269708, + "grad_norm": 4.995815753936768, + "learning_rate": 4.412504382805881e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8571658223867417, + "num_tokens": 127459774.0, + "step": 106040 + }, + { + "entropy": 1.8213044986128808, + "epoch": 0.3287457211520205, + "grad_norm": 8.79011344909668, + "learning_rate": 4.412296337071922e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8526079818606377, + "num_tokens": 127472645.0, + "step": 106050 + }, + { + "entropy": 1.9069361835718155, + "epoch": 0.32877672027707017, + "grad_norm": 9.181472778320312, + "learning_rate": 4.412088320762712e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8438616082072258, + "num_tokens": 127483985.0, + "step": 106060 + }, + { + "entropy": 1.7741859510540963, + "epoch": 0.3288077194021199, + "grad_norm": 4.265276908874512, + "learning_rate": 4.411880333871319e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.85208670347929, + "num_tokens": 127497519.0, + "step": 106070 + }, + { + "entropy": 1.774897436797619, + "epoch": 0.32883871852716956, + "grad_norm": 3.4330549240112305, + "learning_rate": 4.411672376390806e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.862097978591919, + "num_tokens": 127510803.0, + "step": 106080 + }, + { + "entropy": 1.7656168833374977, + "epoch": 0.3288697176522193, + "grad_norm": 7.550422191619873, + "learning_rate": 4.411464448314243e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.859065192937851, + "num_tokens": 127523417.0, + "step": 106090 + }, + { + "entropy": 1.8322931870818138, + "epoch": 0.32890071677726895, + "grad_norm": 8.406190872192383, + "learning_rate": 4.411256549634704e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8471446886658669, + "num_tokens": 127536405.0, + "step": 106100 + }, + { + "entropy": 1.914437648653984, + "epoch": 0.3289317159023187, + "grad_norm": 10.335504531860352, + "learning_rate": 4.411048680345259e-06, + "loss": 0.5209, + "mean_token_accuracy": 0.8394327759742737, + "num_tokens": 127546905.0, + "step": 106110 + }, + { + "entropy": 1.863400039076805, + "epoch": 0.32896271502736835, + "grad_norm": 7.929013252258301, + "learning_rate": 4.410840840438987e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8584112614393234, + "num_tokens": 127558189.0, + "step": 106120 + }, + { + "entropy": 1.8088774591684342, + "epoch": 0.32899371415241807, + "grad_norm": 12.394187927246094, + "learning_rate": 4.410633029908964e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8598813712596893, + "num_tokens": 127570793.0, + "step": 106130 + }, + { + "entropy": 1.8091413155198097, + "epoch": 0.32902471327746774, + "grad_norm": 7.606610298156738, + "learning_rate": 4.4104252487482726e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8585825636982918, + "num_tokens": 127582513.0, + "step": 106140 + }, + { + "entropy": 1.8211368069052696, + "epoch": 0.32905571240251746, + "grad_norm": 3.8526268005371094, + "learning_rate": 4.4102174969499945e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8475786969065666, + "num_tokens": 127595268.0, + "step": 106150 + }, + { + "entropy": 1.8827776312828064, + "epoch": 0.32908671152756713, + "grad_norm": 8.69912052154541, + "learning_rate": 4.410009774507214e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8499000027775765, + "num_tokens": 127606870.0, + "step": 106160 + }, + { + "entropy": 1.9105200260877608, + "epoch": 0.32911771065261686, + "grad_norm": 8.402679443359375, + "learning_rate": 4.40980208141302e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.8454275488853454, + "num_tokens": 127617735.0, + "step": 106170 + }, + { + "entropy": 1.8559418886899948, + "epoch": 0.3291487097776665, + "grad_norm": 10.870802879333496, + "learning_rate": 4.4095944176605015e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8455240190029144, + "num_tokens": 127629429.0, + "step": 106180 + }, + { + "entropy": 1.8259831920266152, + "epoch": 0.3291797089027162, + "grad_norm": 8.05710506439209, + "learning_rate": 4.40938678324275e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8483387231826782, + "num_tokens": 127642106.0, + "step": 106190 + }, + { + "entropy": 1.8355090886354446, + "epoch": 0.3292107080277659, + "grad_norm": 6.109237194061279, + "learning_rate": 4.40917917815286e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8482315883040428, + "num_tokens": 127653739.0, + "step": 106200 + }, + { + "entropy": 1.8437444880604743, + "epoch": 0.3292417071528156, + "grad_norm": 3.581789493560791, + "learning_rate": 4.408971602383929e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8457508951425552, + "num_tokens": 127665387.0, + "step": 106210 + }, + { + "entropy": 1.7818902999162674, + "epoch": 0.3292727062778653, + "grad_norm": 8.96249771118164, + "learning_rate": 4.408764055929055e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8549827590584755, + "num_tokens": 127678366.0, + "step": 106220 + }, + { + "entropy": 1.8756959035992622, + "epoch": 0.329303705402915, + "grad_norm": 8.410722732543945, + "learning_rate": 4.40855653878134e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8546160280704498, + "num_tokens": 127689938.0, + "step": 106230 + }, + { + "entropy": 1.782509133219719, + "epoch": 0.3293347045279647, + "grad_norm": 4.164084434509277, + "learning_rate": 4.408349050933885e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8557716026902199, + "num_tokens": 127703501.0, + "step": 106240 + }, + { + "entropy": 1.9288730174303055, + "epoch": 0.3293657036530144, + "grad_norm": 9.329178810119629, + "learning_rate": 4.408141592379797e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.8449020832777023, + "num_tokens": 127714411.0, + "step": 106250 + }, + { + "entropy": 1.9135432869195939, + "epoch": 0.3293967027780641, + "grad_norm": 7.811339378356934, + "learning_rate": 4.4079341631121846e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.843287567794323, + "num_tokens": 127725960.0, + "step": 106260 + }, + { + "entropy": 1.8159632161259651, + "epoch": 0.32942770190311377, + "grad_norm": 10.536284446716309, + "learning_rate": 4.407726763124156e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8523865416646004, + "num_tokens": 127738519.0, + "step": 106270 + }, + { + "entropy": 1.6889009296894073, + "epoch": 0.3294587010281635, + "grad_norm": 3.854187488555908, + "learning_rate": 4.407519392408826e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8675200149416924, + "num_tokens": 127753002.0, + "step": 106280 + }, + { + "entropy": 1.9123955562710762, + "epoch": 0.32948970015321316, + "grad_norm": 8.895185470581055, + "learning_rate": 4.4073120509593084e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.843984666466713, + "num_tokens": 127764315.0, + "step": 106290 + }, + { + "entropy": 1.8070273652672768, + "epoch": 0.3295206992782629, + "grad_norm": 6.719578266143799, + "learning_rate": 4.4071047387687186e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8462962463498116, + "num_tokens": 127777448.0, + "step": 106300 + }, + { + "entropy": 1.9051207095384597, + "epoch": 0.32955169840331255, + "grad_norm": 6.615823268890381, + "learning_rate": 4.406897455830177e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.8489801079034806, + "num_tokens": 127788724.0, + "step": 106310 + }, + { + "entropy": 1.790643498301506, + "epoch": 0.3295826975283623, + "grad_norm": 4.387211322784424, + "learning_rate": 4.4066902021368055e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8558807000517845, + "num_tokens": 127801391.0, + "step": 106320 + }, + { + "entropy": 1.9066824808716774, + "epoch": 0.32961369665341195, + "grad_norm": 8.245688438415527, + "learning_rate": 4.406482977681727e-06, + "loss": 0.5221, + "mean_token_accuracy": 0.8399421364068985, + "num_tokens": 127812990.0, + "step": 106330 + }, + { + "entropy": 1.7780104890465736, + "epoch": 0.32964469577846167, + "grad_norm": 7.971060276031494, + "learning_rate": 4.406275782458069e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8576152637600899, + "num_tokens": 127825366.0, + "step": 106340 + }, + { + "entropy": 1.7980201825499535, + "epoch": 0.32967569490351134, + "grad_norm": 10.708061218261719, + "learning_rate": 4.406068616458957e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8627904281020164, + "num_tokens": 127837420.0, + "step": 106350 + }, + { + "entropy": 1.788325347006321, + "epoch": 0.32970669402856106, + "grad_norm": 7.981190204620361, + "learning_rate": 4.405861479677525e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8535309046506881, + "num_tokens": 127850016.0, + "step": 106360 + }, + { + "entropy": 1.8833219155669212, + "epoch": 0.32973769315361073, + "grad_norm": 7.901392936706543, + "learning_rate": 4.4056543721069024e-06, + "loss": 0.5564, + "mean_token_accuracy": 0.83796806037426, + "num_tokens": 127861358.0, + "step": 106370 + }, + { + "entropy": 1.9057514756917953, + "epoch": 0.32976869227866046, + "grad_norm": 7.878623962402344, + "learning_rate": 4.405447293740227e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.850669352710247, + "num_tokens": 127872047.0, + "step": 106380 + }, + { + "entropy": 1.8584971472620964, + "epoch": 0.3297996914037101, + "grad_norm": 3.818636894226074, + "learning_rate": 4.405240244570635e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8438824072480202, + "num_tokens": 127884719.0, + "step": 106390 + }, + { + "entropy": 1.9173836246132852, + "epoch": 0.32983069052875985, + "grad_norm": 9.852693557739258, + "learning_rate": 4.405033224591264e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8338365435600281, + "num_tokens": 127896128.0, + "step": 106400 + }, + { + "entropy": 1.8377716928720473, + "epoch": 0.3298616896538095, + "grad_norm": 5.034619331359863, + "learning_rate": 4.404826233795259e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8457611680030823, + "num_tokens": 127908855.0, + "step": 106410 + }, + { + "entropy": 1.7964683637022971, + "epoch": 0.32989268877885924, + "grad_norm": 4.223313808441162, + "learning_rate": 4.4046192721757625e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.847607783973217, + "num_tokens": 127921526.0, + "step": 106420 + }, + { + "entropy": 1.806885550916195, + "epoch": 0.3299236879039089, + "grad_norm": 3.7442190647125244, + "learning_rate": 4.404412339725922e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8525063678622246, + "num_tokens": 127934569.0, + "step": 106430 + }, + { + "entropy": 1.763914243876934, + "epoch": 0.3299546870289586, + "grad_norm": 9.370475769042969, + "learning_rate": 4.404205436438884e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8634126842021942, + "num_tokens": 127947927.0, + "step": 106440 + }, + { + "entropy": 1.8372886329889297, + "epoch": 0.3299856861540083, + "grad_norm": 7.662943363189697, + "learning_rate": 4.403998562307801e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8420701563358307, + "num_tokens": 127960835.0, + "step": 106450 + }, + { + "entropy": 1.8602828592061997, + "epoch": 0.330016685279058, + "grad_norm": 8.085721969604492, + "learning_rate": 4.403791717325825e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8534385710954666, + "num_tokens": 127973028.0, + "step": 106460 + }, + { + "entropy": 1.8481948718428611, + "epoch": 0.3300476844041077, + "grad_norm": 9.864887237548828, + "learning_rate": 4.403584901486113e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8566409692168235, + "num_tokens": 127985321.0, + "step": 106470 + }, + { + "entropy": 1.8503880456089974, + "epoch": 0.33007868352915737, + "grad_norm": 8.877874374389648, + "learning_rate": 4.403378114781821e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8484792679548263, + "num_tokens": 127996333.0, + "step": 106480 + }, + { + "entropy": 1.850000935792923, + "epoch": 0.3301096826542071, + "grad_norm": 8.956764221191406, + "learning_rate": 4.403171357206109e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8442446753382683, + "num_tokens": 128007956.0, + "step": 106490 + }, + { + "entropy": 1.84800376445055, + "epoch": 0.33014068177925676, + "grad_norm": 7.336550712585449, + "learning_rate": 4.402964628752139e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8560839980840683, + "num_tokens": 128019797.0, + "step": 106500 + }, + { + "entropy": 1.887277567386627, + "epoch": 0.3301716809043065, + "grad_norm": 8.57533073425293, + "learning_rate": 4.4027579294130766e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8528825983405113, + "num_tokens": 128030487.0, + "step": 106510 + }, + { + "entropy": 1.8442515268921853, + "epoch": 0.33020268002935615, + "grad_norm": 8.904195785522461, + "learning_rate": 4.402551259182087e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8501736581325531, + "num_tokens": 128042554.0, + "step": 106520 + }, + { + "entropy": 1.8353326499462128, + "epoch": 0.3302336791544059, + "grad_norm": 7.989103317260742, + "learning_rate": 4.402344618052339e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8517720341682434, + "num_tokens": 128054608.0, + "step": 106530 + }, + { + "entropy": 1.8498850226402284, + "epoch": 0.33026467827945555, + "grad_norm": 4.291723251342773, + "learning_rate": 4.402138006017006e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.848425455391407, + "num_tokens": 128066823.0, + "step": 106540 + }, + { + "entropy": 1.8223819851875305, + "epoch": 0.3302956774045053, + "grad_norm": 4.217907428741455, + "learning_rate": 4.401931423069258e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8563816383481025, + "num_tokens": 128078929.0, + "step": 106550 + }, + { + "entropy": 1.943525066971779, + "epoch": 0.33032667652955494, + "grad_norm": 6.72135066986084, + "learning_rate": 4.401724869202272e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8440445035696029, + "num_tokens": 128089603.0, + "step": 106560 + }, + { + "entropy": 1.8807444974780083, + "epoch": 0.33035767565460467, + "grad_norm": 9.296396255493164, + "learning_rate": 4.401518344409226e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8367669865489006, + "num_tokens": 128101462.0, + "step": 106570 + }, + { + "entropy": 1.8421072617173195, + "epoch": 0.33038867477965433, + "grad_norm": 4.264364719390869, + "learning_rate": 4.4013118486833e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8508803278207779, + "num_tokens": 128113035.0, + "step": 106580 + }, + { + "entropy": 1.9045246794819832, + "epoch": 0.33041967390470406, + "grad_norm": 7.94566011428833, + "learning_rate": 4.401105382017675e-06, + "loss": 0.5491, + "mean_token_accuracy": 0.83597691655159, + "num_tokens": 128124609.0, + "step": 106590 + }, + { + "entropy": 1.8478467270731926, + "epoch": 0.3304506730297537, + "grad_norm": 7.042159557342529, + "learning_rate": 4.400898944405538e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8560745418071747, + "num_tokens": 128136516.0, + "step": 106600 + }, + { + "entropy": 1.8195524469017983, + "epoch": 0.33048167215480345, + "grad_norm": 8.00984001159668, + "learning_rate": 4.400692535840075e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8634827017784119, + "num_tokens": 128148763.0, + "step": 106610 + }, + { + "entropy": 1.864709873497486, + "epoch": 0.3305126712798531, + "grad_norm": 7.909924030303955, + "learning_rate": 4.4004861563144735e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8368150681257248, + "num_tokens": 128160851.0, + "step": 106620 + }, + { + "entropy": 1.8297677680850029, + "epoch": 0.33054367040490285, + "grad_norm": 4.41657018661499, + "learning_rate": 4.4002798058219256e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8527476787567139, + "num_tokens": 128173705.0, + "step": 106630 + }, + { + "entropy": 1.78669353723526, + "epoch": 0.3305746695299525, + "grad_norm": 8.336793899536133, + "learning_rate": 4.400073484355625e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8534336164593697, + "num_tokens": 128187455.0, + "step": 106640 + }, + { + "entropy": 1.872094802558422, + "epoch": 0.33060566865500224, + "grad_norm": 6.102949142456055, + "learning_rate": 4.399867191908766e-06, + "loss": 0.5354, + "mean_token_accuracy": 0.8369955107569694, + "num_tokens": 128199573.0, + "step": 106650 + }, + { + "entropy": 1.8278701439499856, + "epoch": 0.3306366677800519, + "grad_norm": 6.1908793449401855, + "learning_rate": 4.399660928474549e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8451722010970115, + "num_tokens": 128212545.0, + "step": 106660 + }, + { + "entropy": 1.80742659419775, + "epoch": 0.3306676669051016, + "grad_norm": 9.044242858886719, + "learning_rate": 4.399454694046172e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8628106907010078, + "num_tokens": 128225069.0, + "step": 106670 + }, + { + "entropy": 1.8953262731432914, + "epoch": 0.3306986660301513, + "grad_norm": 7.473639965057373, + "learning_rate": 4.399248488616839e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.846845431625843, + "num_tokens": 128236139.0, + "step": 106680 + }, + { + "entropy": 1.8824147969484328, + "epoch": 0.33072966515520097, + "grad_norm": 3.392375946044922, + "learning_rate": 4.399042312179753e-06, + "loss": 0.486, + "mean_token_accuracy": 0.847435437142849, + "num_tokens": 128247581.0, + "step": 106690 + }, + { + "entropy": 1.8657734468579292, + "epoch": 0.3307606642802507, + "grad_norm": 8.0246000289917, + "learning_rate": 4.398836164728121e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8519965663552285, + "num_tokens": 128259094.0, + "step": 106700 + }, + { + "entropy": 1.9068785265088082, + "epoch": 0.33079166340530036, + "grad_norm": 10.08130168914795, + "learning_rate": 4.398630046255153e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8427546977996826, + "num_tokens": 128270336.0, + "step": 106710 + }, + { + "entropy": 1.6765178635716438, + "epoch": 0.3308226625303501, + "grad_norm": 3.5709166526794434, + "learning_rate": 4.39842395675406e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8611236557364463, + "num_tokens": 128284344.0, + "step": 106720 + }, + { + "entropy": 1.8389539405703546, + "epoch": 0.33085366165539976, + "grad_norm": 11.363747596740723, + "learning_rate": 4.398217896218056e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8520830184221267, + "num_tokens": 128296693.0, + "step": 106730 + }, + { + "entropy": 1.807772381603718, + "epoch": 0.3308846607804495, + "grad_norm": 4.15220308303833, + "learning_rate": 4.398011864640357e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8365904971957207, + "num_tokens": 128309280.0, + "step": 106740 + }, + { + "entropy": 1.8760442197322846, + "epoch": 0.33091565990549915, + "grad_norm": 8.140005111694336, + "learning_rate": 4.397805862014179e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8473433390259743, + "num_tokens": 128320766.0, + "step": 106750 + }, + { + "entropy": 1.884317898750305, + "epoch": 0.3309466590305489, + "grad_norm": 8.764328956604004, + "learning_rate": 4.397599888332744e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8561802804470062, + "num_tokens": 128332462.0, + "step": 106760 + }, + { + "entropy": 1.790523299574852, + "epoch": 0.33097765815559854, + "grad_norm": 7.192575454711914, + "learning_rate": 4.397393943589274e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.841584998369217, + "num_tokens": 128344386.0, + "step": 106770 + }, + { + "entropy": 1.765225899219513, + "epoch": 0.33100865728064827, + "grad_norm": 4.198668956756592, + "learning_rate": 4.397188027776993e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8681812718510628, + "num_tokens": 128357282.0, + "step": 106780 + }, + { + "entropy": 1.8882387548685073, + "epoch": 0.33103965640569794, + "grad_norm": 8.834793090820312, + "learning_rate": 4.396982140889129e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.8361070469021797, + "num_tokens": 128368511.0, + "step": 106790 + }, + { + "entropy": 1.9408668845891952, + "epoch": 0.33107065553074766, + "grad_norm": 7.283912181854248, + "learning_rate": 4.396776282918909e-06, + "loss": 0.5231, + "mean_token_accuracy": 0.8402132421731949, + "num_tokens": 128379340.0, + "step": 106800 + }, + { + "entropy": 1.9052711009979248, + "epoch": 0.33110165465579733, + "grad_norm": 9.804369926452637, + "learning_rate": 4.396570453859568e-06, + "loss": 0.5315, + "mean_token_accuracy": 0.8372917637228966, + "num_tokens": 128390571.0, + "step": 106810 + }, + { + "entropy": 1.8480284005403518, + "epoch": 0.33113265378084705, + "grad_norm": 8.21649169921875, + "learning_rate": 4.396364653704335e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8489649355411529, + "num_tokens": 128402248.0, + "step": 106820 + }, + { + "entropy": 1.8601251021027565, + "epoch": 0.3311636529058967, + "grad_norm": 3.575690746307373, + "learning_rate": 4.396158882446449e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8458746924996376, + "num_tokens": 128414209.0, + "step": 106830 + }, + { + "entropy": 1.8285237357020379, + "epoch": 0.33119465203094645, + "grad_norm": 9.334534645080566, + "learning_rate": 4.3959531400791465e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8475871294736862, + "num_tokens": 128426736.0, + "step": 106840 + }, + { + "entropy": 1.8750257775187493, + "epoch": 0.3312256511559961, + "grad_norm": 5.553659439086914, + "learning_rate": 4.395747426595669e-06, + "loss": 0.5246, + "mean_token_accuracy": 0.8419804513454437, + "num_tokens": 128438360.0, + "step": 106850 + }, + { + "entropy": 1.870549239218235, + "epoch": 0.33125665028104584, + "grad_norm": 6.454956531524658, + "learning_rate": 4.395541741989258e-06, + "loss": 0.529, + "mean_token_accuracy": 0.8351958334445954, + "num_tokens": 128449971.0, + "step": 106860 + }, + { + "entropy": 1.8797499716281891, + "epoch": 0.3312876494060955, + "grad_norm": 6.935102939605713, + "learning_rate": 4.395336086253158e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8618351891636848, + "num_tokens": 128461856.0, + "step": 106870 + }, + { + "entropy": 1.8809518918395043, + "epoch": 0.33131864853114523, + "grad_norm": 7.828016757965088, + "learning_rate": 4.395130459380615e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.844907422363758, + "num_tokens": 128473604.0, + "step": 106880 + }, + { + "entropy": 1.8513751283288002, + "epoch": 0.3313496476561949, + "grad_norm": 8.020812034606934, + "learning_rate": 4.39492486136488e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8413770824670792, + "num_tokens": 128485181.0, + "step": 106890 + }, + { + "entropy": 1.8325881719589234, + "epoch": 0.3313806467812446, + "grad_norm": 8.994607925415039, + "learning_rate": 4.3947192921992015e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.854704113304615, + "num_tokens": 128497524.0, + "step": 106900 + }, + { + "entropy": 1.7628432139754295, + "epoch": 0.3314116459062943, + "grad_norm": 4.272587299346924, + "learning_rate": 4.394513751876836e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.853622929751873, + "num_tokens": 128510023.0, + "step": 106910 + }, + { + "entropy": 1.8294406726956367, + "epoch": 0.33144264503134396, + "grad_norm": 7.545256614685059, + "learning_rate": 4.394308240391038e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8351798385381699, + "num_tokens": 128522418.0, + "step": 106920 + }, + { + "entropy": 1.765715080499649, + "epoch": 0.3314736441563937, + "grad_norm": 6.0750508308410645, + "learning_rate": 4.394102757735063e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8552381888031959, + "num_tokens": 128535920.0, + "step": 106930 + }, + { + "entropy": 1.8687207847833633, + "epoch": 0.33150464328144336, + "grad_norm": 4.047634124755859, + "learning_rate": 4.393897303902175e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8374577537178993, + "num_tokens": 128548220.0, + "step": 106940 + }, + { + "entropy": 1.7995543777942657, + "epoch": 0.3315356424064931, + "grad_norm": 2.202331781387329, + "learning_rate": 4.3936918788856334e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8424097955226898, + "num_tokens": 128561328.0, + "step": 106950 + }, + { + "entropy": 1.7521587029099464, + "epoch": 0.33156664153154275, + "grad_norm": 2.6816599369049072, + "learning_rate": 4.393486482678704e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8565811112523078, + "num_tokens": 128575383.0, + "step": 106960 + }, + { + "entropy": 1.841699157655239, + "epoch": 0.3315976406565925, + "grad_norm": 7.942765235900879, + "learning_rate": 4.393281115274653e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8441447481513024, + "num_tokens": 128587295.0, + "step": 106970 + }, + { + "entropy": 1.8485121354460716, + "epoch": 0.33162863978164214, + "grad_norm": 3.5550448894500732, + "learning_rate": 4.39307577666675e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8580849900841713, + "num_tokens": 128598970.0, + "step": 106980 + }, + { + "entropy": 1.8623722419142723, + "epoch": 0.33165963890669187, + "grad_norm": 9.27586555480957, + "learning_rate": 4.3928704668482655e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8380335718393326, + "num_tokens": 128611672.0, + "step": 106990 + }, + { + "entropy": 1.8408109113574027, + "epoch": 0.33169063803174154, + "grad_norm": 8.369120597839355, + "learning_rate": 4.392665185812473e-06, + "loss": 0.519, + "mean_token_accuracy": 0.8391941249370575, + "num_tokens": 128623923.0, + "step": 107000 + }, + { + "entropy": 1.9453389286994933, + "epoch": 0.33172163715679126, + "grad_norm": 8.78305435180664, + "learning_rate": 4.392459933552647e-06, + "loss": 0.5335, + "mean_token_accuracy": 0.8356341332197189, + "num_tokens": 128634685.0, + "step": 107010 + }, + { + "entropy": 1.8125431835651398, + "epoch": 0.33175263628184093, + "grad_norm": 3.5980794429779053, + "learning_rate": 4.392254710062065e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.854391148686409, + "num_tokens": 128648003.0, + "step": 107020 + }, + { + "entropy": 1.8818465709686278, + "epoch": 0.33178363540689065, + "grad_norm": 9.284706115722656, + "learning_rate": 4.3920495153340094e-06, + "loss": 0.5199, + "mean_token_accuracy": 0.838673610985279, + "num_tokens": 128659608.0, + "step": 107030 + }, + { + "entropy": 1.8223556622862815, + "epoch": 0.3318146345319403, + "grad_norm": 5.312669277191162, + "learning_rate": 4.39184434936176e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8526388436555863, + "num_tokens": 128672024.0, + "step": 107040 + }, + { + "entropy": 1.8191813245415687, + "epoch": 0.33184563365699005, + "grad_norm": 9.039751052856445, + "learning_rate": 4.3916392121386025e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8523767486214637, + "num_tokens": 128684326.0, + "step": 107050 + }, + { + "entropy": 1.8492727160453797, + "epoch": 0.3318766327820397, + "grad_norm": 4.566840648651123, + "learning_rate": 4.3914341036578225e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8528471812605858, + "num_tokens": 128696669.0, + "step": 107060 + }, + { + "entropy": 1.8731192171573638, + "epoch": 0.33190763190708944, + "grad_norm": 10.486961364746094, + "learning_rate": 4.391229023912708e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8397856906056405, + "num_tokens": 128708878.0, + "step": 107070 + }, + { + "entropy": 1.757045941054821, + "epoch": 0.3319386310321391, + "grad_norm": 4.080169677734375, + "learning_rate": 4.3910239728965514e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8683019027113914, + "num_tokens": 128722019.0, + "step": 107080 + }, + { + "entropy": 1.8372290149331092, + "epoch": 0.33196963015718883, + "grad_norm": 7.9230265617370605, + "learning_rate": 4.390818950602644e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8605737552046776, + "num_tokens": 128733732.0, + "step": 107090 + }, + { + "entropy": 1.7193499386310578, + "epoch": 0.3320006292822385, + "grad_norm": 3.894108533859253, + "learning_rate": 4.390613957024283e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8623716205358505, + "num_tokens": 128747568.0, + "step": 107100 + }, + { + "entropy": 1.8526916906237603, + "epoch": 0.3320316284072882, + "grad_norm": 7.582345008850098, + "learning_rate": 4.390408992154765e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8529738247394562, + "num_tokens": 128758676.0, + "step": 107110 + }, + { + "entropy": 1.7845040023326875, + "epoch": 0.3320626275323379, + "grad_norm": 4.077572822570801, + "learning_rate": 4.390204055987388e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8509064018726349, + "num_tokens": 128770630.0, + "step": 107120 + }, + { + "entropy": 1.9034978345036506, + "epoch": 0.3320936266573876, + "grad_norm": 11.147791862487793, + "learning_rate": 4.389999148515457e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.8462053269147873, + "num_tokens": 128781845.0, + "step": 107130 + }, + { + "entropy": 1.8408338278532028, + "epoch": 0.3321246257824373, + "grad_norm": 5.386666297912598, + "learning_rate": 4.389794269732274e-06, + "loss": 0.5242, + "mean_token_accuracy": 0.8391245663166046, + "num_tokens": 128794017.0, + "step": 107140 + }, + { + "entropy": 1.8361081883311272, + "epoch": 0.332155624907487, + "grad_norm": 7.174749374389648, + "learning_rate": 4.389589419631145e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8466487154364586, + "num_tokens": 128805772.0, + "step": 107150 + }, + { + "entropy": 1.8699164032936095, + "epoch": 0.3321866240325367, + "grad_norm": 9.90065860748291, + "learning_rate": 4.389384598205379e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8560563385486603, + "num_tokens": 128817741.0, + "step": 107160 + }, + { + "entropy": 1.8497094124555589, + "epoch": 0.33221762315758635, + "grad_norm": 8.163619995117188, + "learning_rate": 4.389179805448286e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8564316019415855, + "num_tokens": 128829031.0, + "step": 107170 + }, + { + "entropy": 1.8336839243769645, + "epoch": 0.3322486222826361, + "grad_norm": 8.241865158081055, + "learning_rate": 4.38897504135318e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8471016883850098, + "num_tokens": 128841424.0, + "step": 107180 + }, + { + "entropy": 1.7827384427189827, + "epoch": 0.33227962140768574, + "grad_norm": 4.903016090393066, + "learning_rate": 4.388770305913374e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8514184907078743, + "num_tokens": 128855066.0, + "step": 107190 + }, + { + "entropy": 1.89354208111763, + "epoch": 0.33231062053273547, + "grad_norm": 8.681282043457031, + "learning_rate": 4.388565599122187e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8418336838483811, + "num_tokens": 128866035.0, + "step": 107200 + }, + { + "entropy": 1.8403149604797364, + "epoch": 0.33234161965778514, + "grad_norm": 4.251253604888916, + "learning_rate": 4.3883609209729375e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8545697897672653, + "num_tokens": 128878048.0, + "step": 107210 + }, + { + "entropy": 1.9954572170972824, + "epoch": 0.33237261878283486, + "grad_norm": 8.248795509338379, + "learning_rate": 4.388156271458946e-06, + "loss": 0.5474, + "mean_token_accuracy": 0.8341551095247268, + "num_tokens": 128888607.0, + "step": 107220 + }, + { + "entropy": 1.7177160628139974, + "epoch": 0.33240361790788453, + "grad_norm": 7.692461967468262, + "learning_rate": 4.387951650573538e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8625841051340103, + "num_tokens": 128903267.0, + "step": 107230 + }, + { + "entropy": 1.9298606514930725, + "epoch": 0.33243461703293425, + "grad_norm": 7.782931327819824, + "learning_rate": 4.387747058310038e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8428981378674507, + "num_tokens": 128914211.0, + "step": 107240 + }, + { + "entropy": 1.9102041110396386, + "epoch": 0.3324656161579839, + "grad_norm": 9.960110664367676, + "learning_rate": 4.387542494661775e-06, + "loss": 0.5518, + "mean_token_accuracy": 0.8341942340135574, + "num_tokens": 128925160.0, + "step": 107250 + }, + { + "entropy": 1.7972071468830109, + "epoch": 0.33249661528303365, + "grad_norm": 8.017036437988281, + "learning_rate": 4.3873379596220784e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8529744014143944, + "num_tokens": 128937771.0, + "step": 107260 + }, + { + "entropy": 1.8894135251641273, + "epoch": 0.3325276144080833, + "grad_norm": 8.887062072753906, + "learning_rate": 4.3871334531842805e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8535187065601348, + "num_tokens": 128949419.0, + "step": 107270 + }, + { + "entropy": 1.8428677216172218, + "epoch": 0.33255861353313304, + "grad_norm": 6.1778645515441895, + "learning_rate": 4.386928975341716e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8626132607460022, + "num_tokens": 128960944.0, + "step": 107280 + }, + { + "entropy": 1.8553937733173371, + "epoch": 0.3325896126581827, + "grad_norm": 8.516434669494629, + "learning_rate": 4.386724526087722e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8496822208166123, + "num_tokens": 128972466.0, + "step": 107290 + }, + { + "entropy": 1.8534843027591705, + "epoch": 0.33262061178323243, + "grad_norm": 8.477677345275879, + "learning_rate": 4.386520105415637e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8520261511206627, + "num_tokens": 128984702.0, + "step": 107300 + }, + { + "entropy": 1.8577327355742455, + "epoch": 0.3326516109082821, + "grad_norm": 8.226881980895996, + "learning_rate": 4.386315713318802e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8503885567188263, + "num_tokens": 128996533.0, + "step": 107310 + }, + { + "entropy": 1.766034336388111, + "epoch": 0.3326826100333318, + "grad_norm": 7.096409320831299, + "learning_rate": 4.386111349790562e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8526156499981881, + "num_tokens": 129009556.0, + "step": 107320 + }, + { + "entropy": 1.823246991634369, + "epoch": 0.3327136091583815, + "grad_norm": 8.441973686218262, + "learning_rate": 4.385907014824259e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8562351033091545, + "num_tokens": 129021912.0, + "step": 107330 + }, + { + "entropy": 1.8534847319126129, + "epoch": 0.3327446082834312, + "grad_norm": 4.784743785858154, + "learning_rate": 4.385702708413244e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8512447312474251, + "num_tokens": 129033807.0, + "step": 107340 + }, + { + "entropy": 1.8405146718025207, + "epoch": 0.3327756074084809, + "grad_norm": 3.7886173725128174, + "learning_rate": 4.385498430550864e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8435701325535774, + "num_tokens": 129046120.0, + "step": 107350 + }, + { + "entropy": 1.7437313303351403, + "epoch": 0.3328066065335306, + "grad_norm": 4.013438701629639, + "learning_rate": 4.385294181230472e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8705343827605247, + "num_tokens": 129060159.0, + "step": 107360 + }, + { + "entropy": 1.8268103674054146, + "epoch": 0.3328376056585803, + "grad_norm": 10.785335540771484, + "learning_rate": 4.385089960445422e-06, + "loss": 0.403, + "mean_token_accuracy": 0.853385554254055, + "num_tokens": 129072056.0, + "step": 107370 + }, + { + "entropy": 1.7955839529633522, + "epoch": 0.33286860478363, + "grad_norm": 3.143990993499756, + "learning_rate": 4.384885768189071e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.854556767642498, + "num_tokens": 129084936.0, + "step": 107380 + }, + { + "entropy": 1.9022680431604386, + "epoch": 0.3328996039086797, + "grad_norm": 7.960306167602539, + "learning_rate": 4.384681604454776e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.8433623731136322, + "num_tokens": 129096132.0, + "step": 107390 + }, + { + "entropy": 1.8644019782543182, + "epoch": 0.3329306030337294, + "grad_norm": 9.219409942626953, + "learning_rate": 4.384477469235899e-06, + "loss": 0.5549, + "mean_token_accuracy": 0.8375217631459236, + "num_tokens": 129108399.0, + "step": 107400 + }, + { + "entropy": 1.8267999082803725, + "epoch": 0.33296160215877907, + "grad_norm": 9.918363571166992, + "learning_rate": 4.384273362525801e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8537021040916443, + "num_tokens": 129120240.0, + "step": 107410 + }, + { + "entropy": 1.8040613248944282, + "epoch": 0.33299260128382874, + "grad_norm": 9.216553688049316, + "learning_rate": 4.384069284317849e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8598408699035645, + "num_tokens": 129133042.0, + "step": 107420 + }, + { + "entropy": 1.8831330671906472, + "epoch": 0.33302360040887846, + "grad_norm": 7.262103080749512, + "learning_rate": 4.383865234605409e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8518488049507141, + "num_tokens": 129144475.0, + "step": 107430 + }, + { + "entropy": 1.8981991022825242, + "epoch": 0.33305459953392813, + "grad_norm": 8.177460670471191, + "learning_rate": 4.38366121338185e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8512781083583831, + "num_tokens": 129156011.0, + "step": 107440 + }, + { + "entropy": 1.829132416844368, + "epoch": 0.33308559865897786, + "grad_norm": 9.38082504272461, + "learning_rate": 4.383457220640543e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8530718177556992, + "num_tokens": 129168301.0, + "step": 107450 + }, + { + "entropy": 1.76314737200737, + "epoch": 0.3331165977840275, + "grad_norm": 8.310257911682129, + "learning_rate": 4.383253256374863e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8589179947972297, + "num_tokens": 129181867.0, + "step": 107460 + }, + { + "entropy": 1.7426991537213326, + "epoch": 0.33314759690907725, + "grad_norm": 4.443640232086182, + "learning_rate": 4.383049320578185e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8596662864089012, + "num_tokens": 129194857.0, + "step": 107470 + }, + { + "entropy": 1.8887342095375061, + "epoch": 0.3331785960341269, + "grad_norm": 9.242331504821777, + "learning_rate": 4.382845413243886e-06, + "loss": 0.5217, + "mean_token_accuracy": 0.8357272610068321, + "num_tokens": 129206228.0, + "step": 107480 + }, + { + "entropy": 1.7917708232998848, + "epoch": 0.33320959515917664, + "grad_norm": 7.552341938018799, + "learning_rate": 4.382641534365348e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8635924890637398, + "num_tokens": 129219549.0, + "step": 107490 + }, + { + "entropy": 1.9377831488847732, + "epoch": 0.3332405942842263, + "grad_norm": 8.330388069152832, + "learning_rate": 4.38243768393595e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8412655621767045, + "num_tokens": 129230964.0, + "step": 107500 + }, + { + "entropy": 1.8014700457453727, + "epoch": 0.33327159340927603, + "grad_norm": 9.167746543884277, + "learning_rate": 4.382233861949079e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8664462238550186, + "num_tokens": 129242857.0, + "step": 107510 + }, + { + "entropy": 1.8836747616529466, + "epoch": 0.3333025925343257, + "grad_norm": 8.401406288146973, + "learning_rate": 4.382030068398122e-06, + "loss": 0.48, + "mean_token_accuracy": 0.846863467991352, + "num_tokens": 129254438.0, + "step": 107520 + }, + { + "entropy": 1.8405706137418747, + "epoch": 0.33333359165937543, + "grad_norm": 3.3866302967071533, + "learning_rate": 4.381826303276464e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.849305622279644, + "num_tokens": 129266522.0, + "step": 107530 + }, + { + "entropy": 1.8931007355451583, + "epoch": 0.3333645907844251, + "grad_norm": 9.184843063354492, + "learning_rate": 4.3816225665775e-06, + "loss": 0.5305, + "mean_token_accuracy": 0.8357789561152458, + "num_tokens": 129278660.0, + "step": 107540 + }, + { + "entropy": 1.8619947463274003, + "epoch": 0.3333955899094748, + "grad_norm": 8.761635780334473, + "learning_rate": 4.38141885829462e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8441411420702934, + "num_tokens": 129290127.0, + "step": 107550 + }, + { + "entropy": 1.8187642887234687, + "epoch": 0.3334265890345245, + "grad_norm": 7.852782249450684, + "learning_rate": 4.3812151784212205e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8587213516235351, + "num_tokens": 129302352.0, + "step": 107560 + }, + { + "entropy": 1.8852619290351869, + "epoch": 0.3334575881595742, + "grad_norm": 8.888199806213379, + "learning_rate": 4.3810115269506985e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.850848950445652, + "num_tokens": 129313337.0, + "step": 107570 + }, + { + "entropy": 1.8726006522774696, + "epoch": 0.3334885872846239, + "grad_norm": 7.686779022216797, + "learning_rate": 4.380807903876452e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8449873507022858, + "num_tokens": 129324483.0, + "step": 107580 + }, + { + "entropy": 1.857853028178215, + "epoch": 0.3335195864096736, + "grad_norm": 9.525311470031738, + "learning_rate": 4.3806043091918855e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8560262963175773, + "num_tokens": 129336845.0, + "step": 107590 + }, + { + "entropy": 1.8294879227876664, + "epoch": 0.3335505855347233, + "grad_norm": 3.713810920715332, + "learning_rate": 4.3804007428904e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8501683503389359, + "num_tokens": 129349320.0, + "step": 107600 + }, + { + "entropy": 1.7990604281425475, + "epoch": 0.333581584659773, + "grad_norm": 8.975895881652832, + "learning_rate": 4.380197204965402e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.850322599709034, + "num_tokens": 129361696.0, + "step": 107610 + }, + { + "entropy": 1.8823053747415543, + "epoch": 0.33361258378482267, + "grad_norm": 9.130423545837402, + "learning_rate": 4.3799936954103e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.8529215559363366, + "num_tokens": 129373047.0, + "step": 107620 + }, + { + "entropy": 1.7972995683550834, + "epoch": 0.3336435829098724, + "grad_norm": 6.733953952789307, + "learning_rate": 4.3797902142185034e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8613699153065681, + "num_tokens": 129386076.0, + "step": 107630 + }, + { + "entropy": 1.9032483518123626, + "epoch": 0.33367458203492206, + "grad_norm": 8.466991424560547, + "learning_rate": 4.379586761383426e-06, + "loss": 0.5461, + "mean_token_accuracy": 0.8323118224740028, + "num_tokens": 129397394.0, + "step": 107640 + }, + { + "entropy": 1.8645583361387252, + "epoch": 0.3337055811599718, + "grad_norm": 9.792939186096191, + "learning_rate": 4.379383336898479e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8481997266411782, + "num_tokens": 129408714.0, + "step": 107650 + }, + { + "entropy": 1.9275704205036164, + "epoch": 0.33373658028502146, + "grad_norm": 8.203200340270996, + "learning_rate": 4.3791799407570814e-06, + "loss": 0.5707, + "mean_token_accuracy": 0.8284709110856057, + "num_tokens": 129420176.0, + "step": 107660 + }, + { + "entropy": 1.9315796703100205, + "epoch": 0.3337675794100711, + "grad_norm": 7.3529953956604, + "learning_rate": 4.378976572952653e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8478155717253685, + "num_tokens": 129430827.0, + "step": 107670 + }, + { + "entropy": 1.7806938499212266, + "epoch": 0.33379857853512085, + "grad_norm": 3.7703471183776855, + "learning_rate": 4.378773233478612e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8491846337914467, + "num_tokens": 129443966.0, + "step": 107680 + }, + { + "entropy": 1.7839350268244742, + "epoch": 0.3338295776601705, + "grad_norm": 7.560736179351807, + "learning_rate": 4.378569922328383e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8620262175798417, + "num_tokens": 129457050.0, + "step": 107690 + }, + { + "entropy": 1.8768269151449204, + "epoch": 0.33386057678522024, + "grad_norm": 8.099947929382324, + "learning_rate": 4.37836663949539e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.844311997294426, + "num_tokens": 129468839.0, + "step": 107700 + }, + { + "entropy": 1.8897486120462417, + "epoch": 0.3338915759102699, + "grad_norm": 8.095099449157715, + "learning_rate": 4.3781633849730605e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8418814584612846, + "num_tokens": 129480361.0, + "step": 107710 + }, + { + "entropy": 1.8843684524297715, + "epoch": 0.33392257503531964, + "grad_norm": 5.762594223022461, + "learning_rate": 4.377960158754823e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8446230858564376, + "num_tokens": 129492583.0, + "step": 107720 + }, + { + "entropy": 1.969536703824997, + "epoch": 0.3339535741603693, + "grad_norm": 8.858036041259766, + "learning_rate": 4.377756960834111e-06, + "loss": 0.509, + "mean_token_accuracy": 0.844764456152916, + "num_tokens": 129503430.0, + "step": 107730 + }, + { + "entropy": 1.8859269142150878, + "epoch": 0.33398457328541903, + "grad_norm": 4.305178165435791, + "learning_rate": 4.377553791204358e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8533265367150307, + "num_tokens": 129514786.0, + "step": 107740 + }, + { + "entropy": 1.859623844921589, + "epoch": 0.3340155724104687, + "grad_norm": 7.1102190017700195, + "learning_rate": 4.377350649858999e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8454333543777466, + "num_tokens": 129526854.0, + "step": 107750 + }, + { + "entropy": 1.897015392780304, + "epoch": 0.3340465715355184, + "grad_norm": 8.836033821105957, + "learning_rate": 4.377147536791471e-06, + "loss": 0.5238, + "mean_token_accuracy": 0.8378755912184715, + "num_tokens": 129537781.0, + "step": 107760 + }, + { + "entropy": 1.8058869987726212, + "epoch": 0.3340775706605681, + "grad_norm": 2.6924660205841064, + "learning_rate": 4.376944451995214e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8562029376626015, + "num_tokens": 129550764.0, + "step": 107770 + }, + { + "entropy": 1.8765144675970078, + "epoch": 0.3341085697856178, + "grad_norm": 7.857125759124756, + "learning_rate": 4.376741395463672e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8540433526039124, + "num_tokens": 129562412.0, + "step": 107780 + }, + { + "entropy": 1.8461247250437736, + "epoch": 0.3341395689106675, + "grad_norm": 8.548455238342285, + "learning_rate": 4.376538367190288e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8496983379125596, + "num_tokens": 129573651.0, + "step": 107790 + }, + { + "entropy": 1.8069075807929038, + "epoch": 0.3341705680357172, + "grad_norm": 9.008338928222656, + "learning_rate": 4.37633536716851e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8606833323836327, + "num_tokens": 129586574.0, + "step": 107800 + }, + { + "entropy": 1.8529318809509276, + "epoch": 0.3342015671607669, + "grad_norm": 3.940270185470581, + "learning_rate": 4.376132395391783e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8578683048486709, + "num_tokens": 129598430.0, + "step": 107810 + }, + { + "entropy": 1.8817769691348076, + "epoch": 0.3342325662858166, + "grad_norm": 8.67603588104248, + "learning_rate": 4.375929451853561e-06, + "loss": 0.5161, + "mean_token_accuracy": 0.8386638849973679, + "num_tokens": 129610580.0, + "step": 107820 + }, + { + "entropy": 1.8078914038836955, + "epoch": 0.33426356541086627, + "grad_norm": 10.10114574432373, + "learning_rate": 4.375726536547296e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8479559436440468, + "num_tokens": 129623248.0, + "step": 107830 + }, + { + "entropy": 1.8651557683944702, + "epoch": 0.334294564535916, + "grad_norm": 7.359059810638428, + "learning_rate": 4.375523649466441e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8485989525914193, + "num_tokens": 129635182.0, + "step": 107840 + }, + { + "entropy": 1.744455586373806, + "epoch": 0.33432556366096566, + "grad_norm": 5.441449165344238, + "learning_rate": 4.375320790604457e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8629415735602379, + "num_tokens": 129649244.0, + "step": 107850 + }, + { + "entropy": 1.7221811696887017, + "epoch": 0.3343565627860154, + "grad_norm": 3.5538432598114014, + "learning_rate": 4.375117959954799e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8725274935364723, + "num_tokens": 129662708.0, + "step": 107860 + }, + { + "entropy": 1.7094054415822029, + "epoch": 0.33438756191106506, + "grad_norm": 11.59929370880127, + "learning_rate": 4.37491515751093e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8701628282666206, + "num_tokens": 129676262.0, + "step": 107870 + }, + { + "entropy": 1.7959278047084808, + "epoch": 0.3344185610361148, + "grad_norm": 3.8675200939178467, + "learning_rate": 4.374712383266314e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8563177272677421, + "num_tokens": 129689029.0, + "step": 107880 + }, + { + "entropy": 1.8378399103879928, + "epoch": 0.33444956016116445, + "grad_norm": 7.299082279205322, + "learning_rate": 4.3745096372144166e-06, + "loss": 0.5197, + "mean_token_accuracy": 0.8417981162667274, + "num_tokens": 129701201.0, + "step": 107890 + }, + { + "entropy": 1.8643528178334237, + "epoch": 0.3344805592862142, + "grad_norm": 7.960921287536621, + "learning_rate": 4.374306919348705e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8524877533316613, + "num_tokens": 129712743.0, + "step": 107900 + }, + { + "entropy": 1.900659802556038, + "epoch": 0.33451155841126384, + "grad_norm": 7.667184829711914, + "learning_rate": 4.374104229662648e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8470824211835861, + "num_tokens": 129723529.0, + "step": 107910 + }, + { + "entropy": 1.8972845152020454, + "epoch": 0.3345425575363135, + "grad_norm": 7.8113484382629395, + "learning_rate": 4.3739015681497185e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8441723167896271, + "num_tokens": 129734887.0, + "step": 107920 + }, + { + "entropy": 1.8530030235648156, + "epoch": 0.33457355666136324, + "grad_norm": 7.784739971160889, + "learning_rate": 4.37369893480339e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8602523878216743, + "num_tokens": 129746166.0, + "step": 107930 + }, + { + "entropy": 1.8418569251894952, + "epoch": 0.3346045557864129, + "grad_norm": 3.6897940635681152, + "learning_rate": 4.37349632961714e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8551413804292679, + "num_tokens": 129758785.0, + "step": 107940 + }, + { + "entropy": 1.8382650315761566, + "epoch": 0.33463555491146263, + "grad_norm": 6.814600467681885, + "learning_rate": 4.373293752584445e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8480258256196975, + "num_tokens": 129771240.0, + "step": 107950 + }, + { + "entropy": 1.7906376250088214, + "epoch": 0.3346665540365123, + "grad_norm": 8.821784973144531, + "learning_rate": 4.373091203698785e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8606226801872253, + "num_tokens": 129783410.0, + "step": 107960 + }, + { + "entropy": 1.850863453745842, + "epoch": 0.334697553161562, + "grad_norm": 3.5716798305511475, + "learning_rate": 4.372888682953645e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8479766875505448, + "num_tokens": 129794773.0, + "step": 107970 + }, + { + "entropy": 1.8356069535017014, + "epoch": 0.3347285522866117, + "grad_norm": 8.035232543945312, + "learning_rate": 4.372686190342508e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8509008720517158, + "num_tokens": 129807004.0, + "step": 107980 + }, + { + "entropy": 1.8803606986999513, + "epoch": 0.3347595514116614, + "grad_norm": 7.623292446136475, + "learning_rate": 4.3724837258588594e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.8468181252479553, + "num_tokens": 129818561.0, + "step": 107990 + }, + { + "entropy": 1.8860761135816575, + "epoch": 0.3347905505367111, + "grad_norm": 9.617867469787598, + "learning_rate": 4.37228128949619e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8485965311527253, + "num_tokens": 129830143.0, + "step": 108000 + }, + { + "entropy": 1.912020392715931, + "epoch": 0.3348215496617608, + "grad_norm": 8.843141555786133, + "learning_rate": 4.37207888124799e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8369658708572387, + "num_tokens": 129841114.0, + "step": 108010 + }, + { + "entropy": 1.9048634201288224, + "epoch": 0.3348525487868105, + "grad_norm": 6.282487392425537, + "learning_rate": 4.3718765011077526e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.8487017884850502, + "num_tokens": 129851752.0, + "step": 108020 + }, + { + "entropy": 1.9025760471820832, + "epoch": 0.3348835479118602, + "grad_norm": 9.245036125183105, + "learning_rate": 4.371674149068973e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8425028279423714, + "num_tokens": 129863830.0, + "step": 108030 + }, + { + "entropy": 1.863828182220459, + "epoch": 0.33491454703690987, + "grad_norm": 3.375025510787964, + "learning_rate": 4.3714718251251484e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8575912222266198, + "num_tokens": 129875417.0, + "step": 108040 + }, + { + "entropy": 1.749958410859108, + "epoch": 0.3349455461619596, + "grad_norm": 3.8491711616516113, + "learning_rate": 4.371269529269777e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8608524903655053, + "num_tokens": 129888923.0, + "step": 108050 + }, + { + "entropy": 1.8047884538769723, + "epoch": 0.33497654528700926, + "grad_norm": 4.140578746795654, + "learning_rate": 4.371067261496363e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8508468925952911, + "num_tokens": 129901648.0, + "step": 108060 + }, + { + "entropy": 1.9018609017133712, + "epoch": 0.335007544412059, + "grad_norm": 7.736186504364014, + "learning_rate": 4.3708650217984065e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8536763086915016, + "num_tokens": 129912303.0, + "step": 108070 + }, + { + "entropy": 1.8833556294441223, + "epoch": 0.33503854353710866, + "grad_norm": 8.170893669128418, + "learning_rate": 4.370662810169415e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8535693794488907, + "num_tokens": 129923394.0, + "step": 108080 + }, + { + "entropy": 1.7835390165448188, + "epoch": 0.3350695426621584, + "grad_norm": 8.293116569519043, + "learning_rate": 4.370460626602896e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8520323082804679, + "num_tokens": 129936227.0, + "step": 108090 + }, + { + "entropy": 1.8735915690660476, + "epoch": 0.33510054178720805, + "grad_norm": 8.832682609558105, + "learning_rate": 4.37025847109236e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8542598769068718, + "num_tokens": 129947875.0, + "step": 108100 + }, + { + "entropy": 1.8861595943570137, + "epoch": 0.3351315409122578, + "grad_norm": 4.6345601081848145, + "learning_rate": 4.370056343631318e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8438780650496482, + "num_tokens": 129959918.0, + "step": 108110 + }, + { + "entropy": 1.8636915504932403, + "epoch": 0.33516254003730744, + "grad_norm": 9.650102615356445, + "learning_rate": 4.3698542442132845e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8606543466448784, + "num_tokens": 129970846.0, + "step": 108120 + }, + { + "entropy": 1.911002305150032, + "epoch": 0.33519353916235717, + "grad_norm": 9.570003509521484, + "learning_rate": 4.369652172831775e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.8438562154769897, + "num_tokens": 129981852.0, + "step": 108130 + }, + { + "entropy": 1.8741035476326942, + "epoch": 0.33522453828740684, + "grad_norm": 6.404723644256592, + "learning_rate": 4.3694501294803084e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8497413903474808, + "num_tokens": 129993561.0, + "step": 108140 + }, + { + "entropy": 1.881354147195816, + "epoch": 0.3352555374124565, + "grad_norm": 8.92978286743164, + "learning_rate": 4.369248114152405e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8553690627217293, + "num_tokens": 130005553.0, + "step": 108150 + }, + { + "entropy": 1.9167394667863846, + "epoch": 0.33528653653750623, + "grad_norm": 6.4430341720581055, + "learning_rate": 4.369046126841588e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.848999661207199, + "num_tokens": 130016296.0, + "step": 108160 + }, + { + "entropy": 1.7608971558511257, + "epoch": 0.3353175356625559, + "grad_norm": 11.467843055725098, + "learning_rate": 4.3688441675413805e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8577596142888069, + "num_tokens": 130029257.0, + "step": 108170 + }, + { + "entropy": 1.7902075618505477, + "epoch": 0.3353485347876056, + "grad_norm": 7.754476070404053, + "learning_rate": 4.3686422362453095e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8457227498292923, + "num_tokens": 130042376.0, + "step": 108180 + }, + { + "entropy": 1.80088053047657, + "epoch": 0.3353795339126553, + "grad_norm": 9.529817581176758, + "learning_rate": 4.368440332946905e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8530313909053803, + "num_tokens": 130054996.0, + "step": 108190 + }, + { + "entropy": 1.88881815969944, + "epoch": 0.335410533037705, + "grad_norm": 7.529262542724609, + "learning_rate": 4.368238457639695e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8425343319773674, + "num_tokens": 130066402.0, + "step": 108200 + }, + { + "entropy": 1.9495450556278229, + "epoch": 0.3354415321627547, + "grad_norm": 7.71326208114624, + "learning_rate": 4.368036610317216e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8409694746136666, + "num_tokens": 130077178.0, + "step": 108210 + }, + { + "entropy": 1.898306019604206, + "epoch": 0.3354725312878044, + "grad_norm": 7.798426151275635, + "learning_rate": 4.367834790973002e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8483818009495735, + "num_tokens": 130088667.0, + "step": 108220 + }, + { + "entropy": 1.8723418027162553, + "epoch": 0.3355035304128541, + "grad_norm": 4.362555503845215, + "learning_rate": 4.367632999600588e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8386352211236954, + "num_tokens": 130101166.0, + "step": 108230 + }, + { + "entropy": 1.8666965618729592, + "epoch": 0.3355345295379038, + "grad_norm": 6.942992687225342, + "learning_rate": 4.367431236193516e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8470498412847519, + "num_tokens": 130113744.0, + "step": 108240 + }, + { + "entropy": 1.811617687344551, + "epoch": 0.33556552866295347, + "grad_norm": 10.553791999816895, + "learning_rate": 4.367229500745324e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8510163232684136, + "num_tokens": 130126707.0, + "step": 108250 + }, + { + "entropy": 1.9607309699058533, + "epoch": 0.3355965277880032, + "grad_norm": 8.012662887573242, + "learning_rate": 4.367027793249559e-06, + "loss": 0.5177, + "mean_token_accuracy": 0.8463420748710633, + "num_tokens": 130137777.0, + "step": 108260 + }, + { + "entropy": 1.7869203016161919, + "epoch": 0.33562752691305286, + "grad_norm": 7.589692115783691, + "learning_rate": 4.366826113699764e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8630554795265197, + "num_tokens": 130150461.0, + "step": 108270 + }, + { + "entropy": 1.880982668697834, + "epoch": 0.3356585260381026, + "grad_norm": 7.124267101287842, + "learning_rate": 4.366624462089486e-06, + "loss": 0.5833, + "mean_token_accuracy": 0.827752648293972, + "num_tokens": 130162409.0, + "step": 108280 + }, + { + "entropy": 1.8591894909739495, + "epoch": 0.33568952516315226, + "grad_norm": 7.850539684295654, + "learning_rate": 4.3664228384122775e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8456223160028458, + "num_tokens": 130175026.0, + "step": 108290 + }, + { + "entropy": 1.8133115381002427, + "epoch": 0.335720524288202, + "grad_norm": 3.5843214988708496, + "learning_rate": 4.366221242661688e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8574982568621635, + "num_tokens": 130187161.0, + "step": 108300 + }, + { + "entropy": 1.8298716515302658, + "epoch": 0.33575152341325165, + "grad_norm": 7.22581148147583, + "learning_rate": 4.366019674831272e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8628581598401069, + "num_tokens": 130199254.0, + "step": 108310 + }, + { + "entropy": 1.8459315612912177, + "epoch": 0.3357825225383014, + "grad_norm": 7.195556640625, + "learning_rate": 4.365818134914586e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.855617669224739, + "num_tokens": 130211367.0, + "step": 108320 + }, + { + "entropy": 1.7838531404733657, + "epoch": 0.33581352166335104, + "grad_norm": 3.379918098449707, + "learning_rate": 4.3656166229051865e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.868264564871788, + "num_tokens": 130223512.0, + "step": 108330 + }, + { + "entropy": 1.805635717511177, + "epoch": 0.33584452078840077, + "grad_norm": 8.333756446838379, + "learning_rate": 4.365415138796635e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8466704249382019, + "num_tokens": 130235508.0, + "step": 108340 + }, + { + "entropy": 1.8783694118261338, + "epoch": 0.33587551991345044, + "grad_norm": 8.413524627685547, + "learning_rate": 4.3652136825824914e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8432982712984085, + "num_tokens": 130247392.0, + "step": 108350 + }, + { + "entropy": 1.8464765295386314, + "epoch": 0.33590651903850016, + "grad_norm": 10.091898918151855, + "learning_rate": 4.365012254256323e-06, + "loss": 0.5256, + "mean_token_accuracy": 0.836361040174961, + "num_tokens": 130259254.0, + "step": 108360 + }, + { + "entropy": 1.8891173303127289, + "epoch": 0.33593751816354983, + "grad_norm": 7.264171123504639, + "learning_rate": 4.3648108538116935e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.854046767950058, + "num_tokens": 130270895.0, + "step": 108370 + }, + { + "entropy": 1.7990408152341844, + "epoch": 0.33596851728859956, + "grad_norm": 7.887672424316406, + "learning_rate": 4.364609481242173e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8649446800351143, + "num_tokens": 130283074.0, + "step": 108380 + }, + { + "entropy": 1.8251417383551598, + "epoch": 0.3359995164136492, + "grad_norm": 8.665481567382812, + "learning_rate": 4.36440813654133e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8497830405831337, + "num_tokens": 130295411.0, + "step": 108390 + }, + { + "entropy": 1.8189852401614188, + "epoch": 0.3360305155386989, + "grad_norm": 7.711446762084961, + "learning_rate": 4.36420681970274e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8486074671149254, + "num_tokens": 130307723.0, + "step": 108400 + }, + { + "entropy": 1.8464395835995675, + "epoch": 0.3360615146637486, + "grad_norm": 8.451082229614258, + "learning_rate": 4.364005530719975e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.845195434987545, + "num_tokens": 130320326.0, + "step": 108410 + }, + { + "entropy": 1.9397107988595963, + "epoch": 0.3360925137887983, + "grad_norm": 8.73201847076416, + "learning_rate": 4.363804269586612e-06, + "loss": 0.5236, + "mean_token_accuracy": 0.8377207443118095, + "num_tokens": 130330931.0, + "step": 108420 + }, + { + "entropy": 1.8559130504727364, + "epoch": 0.336123512913848, + "grad_norm": 7.468655109405518, + "learning_rate": 4.363603036296231e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8583543822169304, + "num_tokens": 130342645.0, + "step": 108430 + }, + { + "entropy": 1.8359305024147035, + "epoch": 0.3361545120388977, + "grad_norm": 6.1533522605896, + "learning_rate": 4.363401830842411e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8503975555300712, + "num_tokens": 130354736.0, + "step": 108440 + }, + { + "entropy": 1.8281599283218384, + "epoch": 0.3361855111639474, + "grad_norm": 10.031996726989746, + "learning_rate": 4.3632006532187375e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8479070514440536, + "num_tokens": 130366811.0, + "step": 108450 + }, + { + "entropy": 1.8181740254163743, + "epoch": 0.3362165102889971, + "grad_norm": 9.026259422302246, + "learning_rate": 4.3629995034187926e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8515908911824226, + "num_tokens": 130378443.0, + "step": 108460 + }, + { + "entropy": 1.653176798671484, + "epoch": 0.3362475094140468, + "grad_norm": 3.760293960571289, + "learning_rate": 4.362798381436164e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8677978113293647, + "num_tokens": 130393766.0, + "step": 108470 + }, + { + "entropy": 1.8992000639438629, + "epoch": 0.33627850853909647, + "grad_norm": 8.01069164276123, + "learning_rate": 4.362597287264443e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8475998684763908, + "num_tokens": 130405181.0, + "step": 108480 + }, + { + "entropy": 1.8159596145153045, + "epoch": 0.3363095076641462, + "grad_norm": 4.691624641418457, + "learning_rate": 4.362396220897218e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8557477623224259, + "num_tokens": 130417472.0, + "step": 108490 + }, + { + "entropy": 1.8598027050495147, + "epoch": 0.33634050678919586, + "grad_norm": 4.708345413208008, + "learning_rate": 4.362195182328085e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8497734665870667, + "num_tokens": 130429769.0, + "step": 108500 + }, + { + "entropy": 1.8282673090696335, + "epoch": 0.3363715059142456, + "grad_norm": 7.435179233551025, + "learning_rate": 4.361994171550637e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8541338846087456, + "num_tokens": 130441794.0, + "step": 108510 + }, + { + "entropy": 1.875466302037239, + "epoch": 0.33640250503929525, + "grad_norm": 9.038471221923828, + "learning_rate": 4.361793188558471e-06, + "loss": 0.5276, + "mean_token_accuracy": 0.8360525846481324, + "num_tokens": 130453505.0, + "step": 108520 + }, + { + "entropy": 1.9005172535777093, + "epoch": 0.336433504164345, + "grad_norm": 8.704236030578613, + "learning_rate": 4.361592233345188e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.8375848770141602, + "num_tokens": 130465066.0, + "step": 108530 + }, + { + "entropy": 1.811057348549366, + "epoch": 0.33646450328939465, + "grad_norm": 10.785872459411621, + "learning_rate": 4.361391305904388e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8490529522299767, + "num_tokens": 130477719.0, + "step": 108540 + }, + { + "entropy": 1.8520541563630104, + "epoch": 0.33649550241444437, + "grad_norm": 7.816697597503662, + "learning_rate": 4.361190406229676e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.849137333035469, + "num_tokens": 130489472.0, + "step": 108550 + }, + { + "entropy": 1.867296999692917, + "epoch": 0.33652650153949404, + "grad_norm": 4.023216724395752, + "learning_rate": 4.360989534314658e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8456032857298851, + "num_tokens": 130500834.0, + "step": 108560 + }, + { + "entropy": 1.9245821714401246, + "epoch": 0.33655750066454376, + "grad_norm": 7.590324401855469, + "learning_rate": 4.36078869015294e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.8328892081975937, + "num_tokens": 130512705.0, + "step": 108570 + }, + { + "entropy": 1.8899716123938561, + "epoch": 0.33658849978959343, + "grad_norm": 8.06901741027832, + "learning_rate": 4.360587873738132e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.847601892054081, + "num_tokens": 130524196.0, + "step": 108580 + }, + { + "entropy": 1.8788098588585853, + "epoch": 0.33661949891464316, + "grad_norm": 8.891491889953613, + "learning_rate": 4.360387085063847e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.844712820649147, + "num_tokens": 130536389.0, + "step": 108590 + }, + { + "entropy": 1.8847306236624717, + "epoch": 0.3366504980396928, + "grad_norm": 4.331157684326172, + "learning_rate": 4.3601863241237e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8504201784729958, + "num_tokens": 130548320.0, + "step": 108600 + }, + { + "entropy": 1.8718352302908898, + "epoch": 0.33668149716474255, + "grad_norm": 7.971691131591797, + "learning_rate": 4.359985590911303e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8353804409503937, + "num_tokens": 130560292.0, + "step": 108610 + }, + { + "entropy": 1.8931324303150177, + "epoch": 0.3367124962897922, + "grad_norm": 7.773447513580322, + "learning_rate": 4.359784885420276e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8583063185214996, + "num_tokens": 130570906.0, + "step": 108620 + }, + { + "entropy": 1.844712384045124, + "epoch": 0.33674349541484194, + "grad_norm": 4.043328285217285, + "learning_rate": 4.3595842076442394e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8548864290118218, + "num_tokens": 130583434.0, + "step": 108630 + }, + { + "entropy": 1.851549918949604, + "epoch": 0.3367744945398916, + "grad_norm": 8.612878799438477, + "learning_rate": 4.359383557576814e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8524287804961205, + "num_tokens": 130595397.0, + "step": 108640 + }, + { + "entropy": 1.90091462880373, + "epoch": 0.3368054936649413, + "grad_norm": 9.851509094238281, + "learning_rate": 4.359182935211626e-06, + "loss": 0.534, + "mean_token_accuracy": 0.8377526298165321, + "num_tokens": 130606655.0, + "step": 108650 + }, + { + "entropy": 1.8972817674279212, + "epoch": 0.336836492789991, + "grad_norm": 8.680237770080566, + "learning_rate": 4.3589823405423e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8444389298558235, + "num_tokens": 130617788.0, + "step": 108660 + }, + { + "entropy": 1.8298688307404518, + "epoch": 0.3368674919150407, + "grad_norm": 9.806267738342285, + "learning_rate": 4.358781773562466e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8443355247378349, + "num_tokens": 130629777.0, + "step": 108670 + }, + { + "entropy": 1.8319950178265572, + "epoch": 0.3368984910400904, + "grad_norm": 7.356091499328613, + "learning_rate": 4.358581234265752e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8494518354535103, + "num_tokens": 130642431.0, + "step": 108680 + }, + { + "entropy": 1.9162609457969666, + "epoch": 0.33692949016514007, + "grad_norm": 4.026700973510742, + "learning_rate": 4.3583807226457904e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8415546149015427, + "num_tokens": 130653469.0, + "step": 108690 + }, + { + "entropy": 1.846106144785881, + "epoch": 0.3369604892901898, + "grad_norm": 8.164050102233887, + "learning_rate": 4.358180238696217e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.8469562456011772, + "num_tokens": 130665500.0, + "step": 108700 + }, + { + "entropy": 1.8313665881752967, + "epoch": 0.33699148841523946, + "grad_norm": 9.194726943969727, + "learning_rate": 4.357979782410669e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8402792170643807, + "num_tokens": 130677813.0, + "step": 108710 + }, + { + "entropy": 1.9255888879299163, + "epoch": 0.3370224875402892, + "grad_norm": 10.219868659973145, + "learning_rate": 4.357779353782783e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8486444607377053, + "num_tokens": 130689073.0, + "step": 108720 + }, + { + "entropy": 1.87244506329298, + "epoch": 0.33705348666533885, + "grad_norm": 8.656083106994629, + "learning_rate": 4.3575789528062e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8389718398451805, + "num_tokens": 130701943.0, + "step": 108730 + }, + { + "entropy": 1.8355872690677644, + "epoch": 0.3370844857903886, + "grad_norm": 7.93245792388916, + "learning_rate": 4.357378579474562e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8615069106221199, + "num_tokens": 130714006.0, + "step": 108740 + }, + { + "entropy": 1.8478736653923988, + "epoch": 0.33711548491543825, + "grad_norm": 9.331077575683594, + "learning_rate": 4.357178233781516e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8405976980924607, + "num_tokens": 130726086.0, + "step": 108750 + }, + { + "entropy": 1.8251071408391, + "epoch": 0.33714648404048797, + "grad_norm": 9.389089584350586, + "learning_rate": 4.356977915720707e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8374730452895165, + "num_tokens": 130738640.0, + "step": 108760 + }, + { + "entropy": 1.8872618451714516, + "epoch": 0.33717748316553764, + "grad_norm": 6.961200714111328, + "learning_rate": 4.356777625285783e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8394622817635536, + "num_tokens": 130750629.0, + "step": 108770 + }, + { + "entropy": 1.8409572780132293, + "epoch": 0.33720848229058736, + "grad_norm": 4.427051067352295, + "learning_rate": 4.356577362470397e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8474222555756569, + "num_tokens": 130763704.0, + "step": 108780 + }, + { + "entropy": 1.8041258588433267, + "epoch": 0.33723948141563703, + "grad_norm": 2.4223392009735107, + "learning_rate": 4.3563771272681995e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8473551660776139, + "num_tokens": 130776752.0, + "step": 108790 + }, + { + "entropy": 1.8579876363277434, + "epoch": 0.33727048054068676, + "grad_norm": 4.547750949859619, + "learning_rate": 4.356176919672846e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8426438301801682, + "num_tokens": 130789486.0, + "step": 108800 + }, + { + "entropy": 1.756131686270237, + "epoch": 0.3373014796657364, + "grad_norm": 4.558795928955078, + "learning_rate": 4.355976739677995e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8526749208569526, + "num_tokens": 130803159.0, + "step": 108810 + }, + { + "entropy": 1.736269749701023, + "epoch": 0.33733247879078615, + "grad_norm": 3.4384872913360596, + "learning_rate": 4.355776587277302e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8524326965212822, + "num_tokens": 130817219.0, + "step": 108820 + }, + { + "entropy": 1.7983601108193397, + "epoch": 0.3373634779158358, + "grad_norm": 7.676399230957031, + "learning_rate": 4.355576462464431e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8553432121872901, + "num_tokens": 130829997.0, + "step": 108830 + }, + { + "entropy": 1.849858644604683, + "epoch": 0.33739447704088554, + "grad_norm": 6.735903739929199, + "learning_rate": 4.3553763652330446e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8419752418994904, + "num_tokens": 130842661.0, + "step": 108840 + }, + { + "entropy": 1.9195626258850098, + "epoch": 0.3374254761659352, + "grad_norm": 8.325475692749023, + "learning_rate": 4.355176295576807e-06, + "loss": 0.5276, + "mean_token_accuracy": 0.8380102962255478, + "num_tokens": 130854094.0, + "step": 108850 + }, + { + "entropy": 1.6827729061245917, + "epoch": 0.33745647529098494, + "grad_norm": 3.7391862869262695, + "learning_rate": 4.354976253489386e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8674192950129509, + "num_tokens": 130868296.0, + "step": 108860 + }, + { + "entropy": 1.8783330723643303, + "epoch": 0.3374874744160346, + "grad_norm": 7.508564472198486, + "learning_rate": 4.35477623896445e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8525745928287506, + "num_tokens": 130879523.0, + "step": 108870 + }, + { + "entropy": 1.831697914004326, + "epoch": 0.33751847354108433, + "grad_norm": 7.616162300109863, + "learning_rate": 4.35457625199567e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8564835578203202, + "num_tokens": 130891707.0, + "step": 108880 + }, + { + "entropy": 1.780026839673519, + "epoch": 0.337549472666134, + "grad_norm": 6.331145763397217, + "learning_rate": 4.354376292576721e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8688737958669662, + "num_tokens": 130904551.0, + "step": 108890 + }, + { + "entropy": 1.8686219871044158, + "epoch": 0.33758047179118367, + "grad_norm": 8.733838081359863, + "learning_rate": 4.354176360701276e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8480039656162262, + "num_tokens": 130916170.0, + "step": 108900 + }, + { + "entropy": 1.8567358165979386, + "epoch": 0.3376114709162334, + "grad_norm": 9.616422653198242, + "learning_rate": 4.353976456363014e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.83982073366642, + "num_tokens": 130927254.0, + "step": 108910 + }, + { + "entropy": 1.7685384809970857, + "epoch": 0.33764247004128306, + "grad_norm": 9.59286117553711, + "learning_rate": 4.353776579555613e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8706537902355194, + "num_tokens": 130939683.0, + "step": 108920 + }, + { + "entropy": 1.864783415198326, + "epoch": 0.3376734691663328, + "grad_norm": 8.993294715881348, + "learning_rate": 4.353576730272754e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.847262118756771, + "num_tokens": 130951156.0, + "step": 108930 + }, + { + "entropy": 1.850709395110607, + "epoch": 0.33770446829138245, + "grad_norm": 8.352401733398438, + "learning_rate": 4.3533769085081226e-06, + "loss": 0.5349, + "mean_token_accuracy": 0.8358264163136482, + "num_tokens": 130962743.0, + "step": 108940 + }, + { + "entropy": 1.8151063159108163, + "epoch": 0.3377354674164322, + "grad_norm": 7.70052433013916, + "learning_rate": 4.353177114255402e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8555310294032097, + "num_tokens": 130974981.0, + "step": 108950 + }, + { + "entropy": 1.813981081545353, + "epoch": 0.33776646654148185, + "grad_norm": 7.85250997543335, + "learning_rate": 4.352977347508281e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8673159837722778, + "num_tokens": 130985978.0, + "step": 108960 + }, + { + "entropy": 1.8001227349042892, + "epoch": 0.33779746566653157, + "grad_norm": 7.5097270011901855, + "learning_rate": 4.352777608260448e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8529728427529335, + "num_tokens": 130998505.0, + "step": 108970 + }, + { + "entropy": 1.812097106873989, + "epoch": 0.33782846479158124, + "grad_norm": 8.388304710388184, + "learning_rate": 4.352577896505595e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8446347638964653, + "num_tokens": 131010534.0, + "step": 108980 + }, + { + "entropy": 1.8360211491584777, + "epoch": 0.33785946391663096, + "grad_norm": 9.120504379272461, + "learning_rate": 4.352378212237415e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8418739259243011, + "num_tokens": 131022899.0, + "step": 108990 + }, + { + "entropy": 1.9238962471485137, + "epoch": 0.33789046304168063, + "grad_norm": 8.026063919067383, + "learning_rate": 4.352178555449604e-06, + "loss": 0.5547, + "mean_token_accuracy": 0.8331388279795646, + "num_tokens": 131034537.0, + "step": 109000 + }, + { + "entropy": 1.8263150677084923, + "epoch": 0.33792146216673036, + "grad_norm": 8.960397720336914, + "learning_rate": 4.351978926135859e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8622379198670387, + "num_tokens": 131046023.0, + "step": 109010 + }, + { + "entropy": 1.8406116291880608, + "epoch": 0.33795246129178, + "grad_norm": 8.759995460510254, + "learning_rate": 4.351779324289881e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8474781602621079, + "num_tokens": 131057713.0, + "step": 109020 + }, + { + "entropy": 1.8527844935655593, + "epoch": 0.33798346041682975, + "grad_norm": 8.07425594329834, + "learning_rate": 4.35157974990537e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8427453622221946, + "num_tokens": 131069288.0, + "step": 109030 + }, + { + "entropy": 1.8225320369005202, + "epoch": 0.3380144595418794, + "grad_norm": 8.338834762573242, + "learning_rate": 4.35138020297603e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8498498395085334, + "num_tokens": 131082196.0, + "step": 109040 + }, + { + "entropy": 1.7835667803883553, + "epoch": 0.33804545866692914, + "grad_norm": 4.169846534729004, + "learning_rate": 4.351180683495567e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8465461641550064, + "num_tokens": 131094762.0, + "step": 109050 + }, + { + "entropy": 1.8385676950216294, + "epoch": 0.3380764577919788, + "grad_norm": 7.802093505859375, + "learning_rate": 4.350981191457688e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.8392044931650162, + "num_tokens": 131107519.0, + "step": 109060 + }, + { + "entropy": 1.9263346076011658, + "epoch": 0.33810745691702854, + "grad_norm": 8.671483039855957, + "learning_rate": 4.350781726856105e-06, + "loss": 0.5573, + "mean_token_accuracy": 0.8391161412000656, + "num_tokens": 131118004.0, + "step": 109070 + }, + { + "entropy": 1.7481406077742576, + "epoch": 0.3381384560420782, + "grad_norm": 4.902276039123535, + "learning_rate": 4.3505822896845245e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8569299519062042, + "num_tokens": 131132148.0, + "step": 109080 + }, + { + "entropy": 1.8396126255393028, + "epoch": 0.33816945516712793, + "grad_norm": 9.361617088317871, + "learning_rate": 4.350382879936665e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8506528735160828, + "num_tokens": 131143760.0, + "step": 109090 + }, + { + "entropy": 1.7284615464508533, + "epoch": 0.3382004542921776, + "grad_norm": 3.8647422790527344, + "learning_rate": 4.350183497606242e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8593399003148079, + "num_tokens": 131158374.0, + "step": 109100 + }, + { + "entropy": 1.84774319678545, + "epoch": 0.3382314534172273, + "grad_norm": 7.73523473739624, + "learning_rate": 4.34998414268697e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8502051830291748, + "num_tokens": 131170450.0, + "step": 109110 + }, + { + "entropy": 1.8346653819084167, + "epoch": 0.338262452542277, + "grad_norm": 9.322132110595703, + "learning_rate": 4.349784815172573e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8492701262235641, + "num_tokens": 131182861.0, + "step": 109120 + }, + { + "entropy": 1.899721224606037, + "epoch": 0.3382934516673267, + "grad_norm": 10.050145149230957, + "learning_rate": 4.349585515056768e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.8410122975707054, + "num_tokens": 131194694.0, + "step": 109130 + }, + { + "entropy": 1.8425235763192176, + "epoch": 0.3383244507923764, + "grad_norm": 12.264300346374512, + "learning_rate": 4.349386242333283e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8546083837747573, + "num_tokens": 131205712.0, + "step": 109140 + }, + { + "entropy": 1.896941477060318, + "epoch": 0.33835544991742605, + "grad_norm": 8.371997833251953, + "learning_rate": 4.349186996995841e-06, + "loss": 0.5291, + "mean_token_accuracy": 0.8351884350180626, + "num_tokens": 131217095.0, + "step": 109150 + }, + { + "entropy": 1.863653865456581, + "epoch": 0.3383864490424758, + "grad_norm": 8.408138275146484, + "learning_rate": 4.3489877790381716e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8465486764907837, + "num_tokens": 131228887.0, + "step": 109160 + }, + { + "entropy": 1.7314985610544682, + "epoch": 0.33841744816752545, + "grad_norm": 3.9695277214050293, + "learning_rate": 4.348788588454003e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8568756997585296, + "num_tokens": 131242417.0, + "step": 109170 + }, + { + "entropy": 1.7864164792001247, + "epoch": 0.33844844729257517, + "grad_norm": 3.7821221351623535, + "learning_rate": 4.348589425237069e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8489362448453903, + "num_tokens": 131255401.0, + "step": 109180 + }, + { + "entropy": 1.8366502463817596, + "epoch": 0.33847944641762484, + "grad_norm": 7.946313381195068, + "learning_rate": 4.348390289381101e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8512516096234322, + "num_tokens": 131267388.0, + "step": 109190 + }, + { + "entropy": 1.800801232457161, + "epoch": 0.33851044554267457, + "grad_norm": 8.96656322479248, + "learning_rate": 4.348191180879837e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8497388571500778, + "num_tokens": 131280203.0, + "step": 109200 + }, + { + "entropy": 1.8128297343850135, + "epoch": 0.33854144466772423, + "grad_norm": 7.974830150604248, + "learning_rate": 4.347992099727013e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8562871024012566, + "num_tokens": 131292537.0, + "step": 109210 + }, + { + "entropy": 1.8276149809360505, + "epoch": 0.33857244379277396, + "grad_norm": 7.294045448303223, + "learning_rate": 4.347793045916371e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8447601184248924, + "num_tokens": 131304251.0, + "step": 109220 + }, + { + "entropy": 1.9087721765041352, + "epoch": 0.3386034429178236, + "grad_norm": 9.217912673950195, + "learning_rate": 4.3475940194416516e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8431843280792236, + "num_tokens": 131315640.0, + "step": 109230 + }, + { + "entropy": 1.8142705112695694, + "epoch": 0.33863444204287335, + "grad_norm": 8.342325210571289, + "learning_rate": 4.347395020296598e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.84480060338974, + "num_tokens": 131328126.0, + "step": 109240 + }, + { + "entropy": 1.8766521513462067, + "epoch": 0.338665441167923, + "grad_norm": 7.5658745765686035, + "learning_rate": 4.347196048474958e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8464653968811036, + "num_tokens": 131339951.0, + "step": 109250 + }, + { + "entropy": 1.7888486787676812, + "epoch": 0.33869644029297274, + "grad_norm": 8.956585884094238, + "learning_rate": 4.346997103970477e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8637697413563729, + "num_tokens": 131352826.0, + "step": 109260 + }, + { + "entropy": 1.9030503317713738, + "epoch": 0.3387274394180224, + "grad_norm": 9.701215744018555, + "learning_rate": 4.3467981867769075e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8389164701104164, + "num_tokens": 131364743.0, + "step": 109270 + }, + { + "entropy": 1.904677079617977, + "epoch": 0.33875843854307214, + "grad_norm": 7.389717102050781, + "learning_rate": 4.346599296888e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8455563247203827, + "num_tokens": 131376346.0, + "step": 109280 + }, + { + "entropy": 1.853348232060671, + "epoch": 0.3387894376681218, + "grad_norm": 8.647706031799316, + "learning_rate": 4.346400434297507e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8570475921034812, + "num_tokens": 131388648.0, + "step": 109290 + }, + { + "entropy": 1.854999852180481, + "epoch": 0.33882043679317153, + "grad_norm": 5.456407070159912, + "learning_rate": 4.346201598999186e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.845246997475624, + "num_tokens": 131401406.0, + "step": 109300 + }, + { + "entropy": 1.9005474954843522, + "epoch": 0.3388514359182212, + "grad_norm": 7.554257869720459, + "learning_rate": 4.346002790986796e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.8443148121237755, + "num_tokens": 131412576.0, + "step": 109310 + }, + { + "entropy": 1.957038275897503, + "epoch": 0.3388824350432709, + "grad_norm": 8.818881034851074, + "learning_rate": 4.3458040102540945e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8357266262173653, + "num_tokens": 131424034.0, + "step": 109320 + }, + { + "entropy": 1.7917650774121285, + "epoch": 0.3389134341683206, + "grad_norm": 7.657764911651611, + "learning_rate": 4.345605256794846e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8518206834793091, + "num_tokens": 131437119.0, + "step": 109330 + }, + { + "entropy": 1.9100136697292327, + "epoch": 0.3389444332933703, + "grad_norm": 7.320883274078369, + "learning_rate": 4.345406530602812e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8504310637712479, + "num_tokens": 131448308.0, + "step": 109340 + }, + { + "entropy": 1.8857989877462387, + "epoch": 0.33897543241842, + "grad_norm": 8.363656044006348, + "learning_rate": 4.3452078316717594e-06, + "loss": 0.5471, + "mean_token_accuracy": 0.8354710638523102, + "num_tokens": 131460352.0, + "step": 109350 + }, + { + "entropy": 1.829300546646118, + "epoch": 0.3390064315434697, + "grad_norm": 3.853241443634033, + "learning_rate": 4.345009159995455e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8451276645064354, + "num_tokens": 131472410.0, + "step": 109360 + }, + { + "entropy": 1.9137260258197784, + "epoch": 0.3390374306685194, + "grad_norm": 8.751523971557617, + "learning_rate": 4.344810515567671e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8443731889128685, + "num_tokens": 131484493.0, + "step": 109370 + }, + { + "entropy": 1.839902514219284, + "epoch": 0.3390684297935691, + "grad_norm": 8.783478736877441, + "learning_rate": 4.344611898382176e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8553899496793747, + "num_tokens": 131496822.0, + "step": 109380 + }, + { + "entropy": 1.896230974793434, + "epoch": 0.3390994289186188, + "grad_norm": 8.84762954711914, + "learning_rate": 4.3444133084327464e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8462594136595726, + "num_tokens": 131508185.0, + "step": 109390 + }, + { + "entropy": 1.889091356098652, + "epoch": 0.33913042804366844, + "grad_norm": 9.512887954711914, + "learning_rate": 4.344214745713158e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8467561930418015, + "num_tokens": 131519594.0, + "step": 109400 + }, + { + "entropy": 1.9061472669243813, + "epoch": 0.33916142716871817, + "grad_norm": 9.376303672790527, + "learning_rate": 4.344016210217188e-06, + "loss": 0.5523, + "mean_token_accuracy": 0.8321489855647087, + "num_tokens": 131531669.0, + "step": 109410 + }, + { + "entropy": 1.8296129301190376, + "epoch": 0.33919242629376783, + "grad_norm": 3.8118650913238525, + "learning_rate": 4.343817701938615e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8557925269007682, + "num_tokens": 131544918.0, + "step": 109420 + }, + { + "entropy": 1.8758042737841607, + "epoch": 0.33922342541881756, + "grad_norm": 8.741567611694336, + "learning_rate": 4.3436192208712234e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.849548727273941, + "num_tokens": 131557550.0, + "step": 109430 + }, + { + "entropy": 1.8672025337815286, + "epoch": 0.33925442454386723, + "grad_norm": 7.4333624839782715, + "learning_rate": 4.343420767008793e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8443247124552726, + "num_tokens": 131569386.0, + "step": 109440 + }, + { + "entropy": 1.9236103370785713, + "epoch": 0.33928542366891695, + "grad_norm": 9.117382049560547, + "learning_rate": 4.343222340345114e-06, + "loss": 0.5212, + "mean_token_accuracy": 0.8418242424726486, + "num_tokens": 131580901.0, + "step": 109450 + }, + { + "entropy": 1.7963170796632766, + "epoch": 0.3393164227939666, + "grad_norm": 7.715478897094727, + "learning_rate": 4.343023940873973e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8599506393074989, + "num_tokens": 131594068.0, + "step": 109460 + }, + { + "entropy": 1.7216453760862351, + "epoch": 0.33934742191901635, + "grad_norm": 7.901419162750244, + "learning_rate": 4.342825568589158e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8583090871572494, + "num_tokens": 131608444.0, + "step": 109470 + }, + { + "entropy": 1.9911436915397644, + "epoch": 0.339378421044066, + "grad_norm": 8.45919132232666, + "learning_rate": 4.342627223484461e-06, + "loss": 0.5489, + "mean_token_accuracy": 0.8311255067586899, + "num_tokens": 131619482.0, + "step": 109480 + }, + { + "entropy": 1.8852492436766624, + "epoch": 0.33940942016911574, + "grad_norm": 7.630876064300537, + "learning_rate": 4.342428905553678e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8610611036419868, + "num_tokens": 131632114.0, + "step": 109490 + }, + { + "entropy": 1.7850932255387306, + "epoch": 0.3394404192941654, + "grad_norm": 7.689153671264648, + "learning_rate": 4.342230614790603e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8630185857415199, + "num_tokens": 131645342.0, + "step": 109500 + }, + { + "entropy": 1.8250619187951087, + "epoch": 0.33947141841921513, + "grad_norm": 9.863249778747559, + "learning_rate": 4.342032351189033e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8580538392066955, + "num_tokens": 131657561.0, + "step": 109510 + }, + { + "entropy": 1.8692012056708336, + "epoch": 0.3395024175442648, + "grad_norm": 7.83109712600708, + "learning_rate": 4.341834114742769e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8402553409337997, + "num_tokens": 131669689.0, + "step": 109520 + }, + { + "entropy": 1.8381701841950417, + "epoch": 0.3395334166693145, + "grad_norm": 9.981649398803711, + "learning_rate": 4.341635905445612e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8586729139089584, + "num_tokens": 131681608.0, + "step": 109530 + }, + { + "entropy": 1.879381312429905, + "epoch": 0.3395644157943642, + "grad_norm": 3.283308506011963, + "learning_rate": 4.341437723291367e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.846294941008091, + "num_tokens": 131693388.0, + "step": 109540 + }, + { + "entropy": 1.812347574532032, + "epoch": 0.3395954149194139, + "grad_norm": 8.128570556640625, + "learning_rate": 4.341239568273838e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8501547113060951, + "num_tokens": 131706396.0, + "step": 109550 + }, + { + "entropy": 1.8860585197806359, + "epoch": 0.3396264140444636, + "grad_norm": 8.69900894165039, + "learning_rate": 4.341041440386833e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.844724814593792, + "num_tokens": 131718236.0, + "step": 109560 + }, + { + "entropy": 1.7079974353313445, + "epoch": 0.3396574131695133, + "grad_norm": 2.750688314437866, + "learning_rate": 4.34084333962416e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8660884618759155, + "num_tokens": 131732740.0, + "step": 109570 + }, + { + "entropy": 1.9225364074110984, + "epoch": 0.339688412294563, + "grad_norm": 8.820713996887207, + "learning_rate": 4.340645265979634e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8516004055738449, + "num_tokens": 131743968.0, + "step": 109580 + }, + { + "entropy": 1.9468763768672943, + "epoch": 0.3397194114196127, + "grad_norm": 9.133736610412598, + "learning_rate": 4.340447219447068e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.8412905469536781, + "num_tokens": 131754741.0, + "step": 109590 + }, + { + "entropy": 1.99237479865551, + "epoch": 0.3397504105446624, + "grad_norm": 8.745469093322754, + "learning_rate": 4.340249200020274e-06, + "loss": 0.5682, + "mean_token_accuracy": 0.8262647077441215, + "num_tokens": 131766026.0, + "step": 109600 + }, + { + "entropy": 1.9354695051908493, + "epoch": 0.3397814096697121, + "grad_norm": 7.548391819000244, + "learning_rate": 4.340051207693073e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8456323340535163, + "num_tokens": 131776690.0, + "step": 109610 + }, + { + "entropy": 1.8958056300878525, + "epoch": 0.33981240879476177, + "grad_norm": 4.0965256690979, + "learning_rate": 4.339853242459283e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.8384672284126282, + "num_tokens": 131788190.0, + "step": 109620 + }, + { + "entropy": 1.8747544452548026, + "epoch": 0.3398434079198115, + "grad_norm": 9.14088249206543, + "learning_rate": 4.339655304312725e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8471368581056595, + "num_tokens": 131799932.0, + "step": 109630 + }, + { + "entropy": 1.9145475149154663, + "epoch": 0.33987440704486116, + "grad_norm": 8.718486785888672, + "learning_rate": 4.339457393247224e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8452249899506569, + "num_tokens": 131810626.0, + "step": 109640 + }, + { + "entropy": 1.9000482648611068, + "epoch": 0.33990540616991083, + "grad_norm": 5.583610534667969, + "learning_rate": 4.339259509256604e-06, + "loss": 0.5213, + "mean_token_accuracy": 0.8375597059726715, + "num_tokens": 131821937.0, + "step": 109650 + }, + { + "entropy": 1.7997367039322854, + "epoch": 0.33993640529496055, + "grad_norm": 7.190251350402832, + "learning_rate": 4.339061652334693e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8610153466463089, + "num_tokens": 131834731.0, + "step": 109660 + }, + { + "entropy": 1.8661419078707695, + "epoch": 0.3399674044200102, + "grad_norm": 7.567733287811279, + "learning_rate": 4.33886382247532e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8479538485407829, + "num_tokens": 131846172.0, + "step": 109670 + }, + { + "entropy": 1.8096156984567642, + "epoch": 0.33999840354505995, + "grad_norm": 8.170653343200684, + "learning_rate": 4.338666019672315e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8594262331724167, + "num_tokens": 131859132.0, + "step": 109680 + }, + { + "entropy": 1.8735254988074304, + "epoch": 0.3400294026701096, + "grad_norm": 4.316888809204102, + "learning_rate": 4.3384682439195146e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.8408343940973282, + "num_tokens": 131870647.0, + "step": 109690 + }, + { + "entropy": 1.8006722897291183, + "epoch": 0.34006040179515934, + "grad_norm": 3.814378499984741, + "learning_rate": 4.338270495210751e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8559133544564247, + "num_tokens": 131882827.0, + "step": 109700 + }, + { + "entropy": 1.8134817466139794, + "epoch": 0.340091400920209, + "grad_norm": 4.081186771392822, + "learning_rate": 4.338072773539862e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8579586237668991, + "num_tokens": 131894503.0, + "step": 109710 + }, + { + "entropy": 1.8244572654366493, + "epoch": 0.34012240004525873, + "grad_norm": 3.6130762100219727, + "learning_rate": 4.337875078900688e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8462868794798851, + "num_tokens": 131907275.0, + "step": 109720 + }, + { + "entropy": 1.911041909456253, + "epoch": 0.3401533991703084, + "grad_norm": 10.027078628540039, + "learning_rate": 4.337677411287069e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.855143415927887, + "num_tokens": 131918604.0, + "step": 109730 + }, + { + "entropy": 1.7550349622964858, + "epoch": 0.3401843982953581, + "grad_norm": 4.361454010009766, + "learning_rate": 4.337479770692849e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8564995616674423, + "num_tokens": 131931986.0, + "step": 109740 + }, + { + "entropy": 1.8325242474675179, + "epoch": 0.3402153974204078, + "grad_norm": 7.0168890953063965, + "learning_rate": 4.337282157111871e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.856087788939476, + "num_tokens": 131944417.0, + "step": 109750 + }, + { + "entropy": 1.8343531802296638, + "epoch": 0.3402463965454575, + "grad_norm": 8.980559349060059, + "learning_rate": 4.337084570537985e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8543761387467385, + "num_tokens": 131956829.0, + "step": 109760 + }, + { + "entropy": 1.9214718401432038, + "epoch": 0.3402773956705072, + "grad_norm": 9.840754508972168, + "learning_rate": 4.336887010965037e-06, + "loss": 0.5471, + "mean_token_accuracy": 0.8250068485736847, + "num_tokens": 131967623.0, + "step": 109770 + }, + { + "entropy": 1.7958107739686966, + "epoch": 0.3403083947955569, + "grad_norm": 6.940483570098877, + "learning_rate": 4.336689478386879e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8612048402428627, + "num_tokens": 131980237.0, + "step": 109780 + }, + { + "entropy": 1.8743787840008737, + "epoch": 0.3403393939206066, + "grad_norm": 8.042901039123535, + "learning_rate": 4.3364919727973655e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8504285082221031, + "num_tokens": 131991550.0, + "step": 109790 + }, + { + "entropy": 1.86217290610075, + "epoch": 0.3403703930456563, + "grad_norm": 8.980916023254395, + "learning_rate": 4.33629449419035e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8460476860404015, + "num_tokens": 132003560.0, + "step": 109800 + }, + { + "entropy": 1.7462153866887093, + "epoch": 0.340401392170706, + "grad_norm": 7.179854393005371, + "learning_rate": 4.336097042559689e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8604865953326225, + "num_tokens": 132016859.0, + "step": 109810 + }, + { + "entropy": 1.8352238804101944, + "epoch": 0.3404323912957557, + "grad_norm": 8.293797492980957, + "learning_rate": 4.3358996178992416e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8555570766329765, + "num_tokens": 132028207.0, + "step": 109820 + }, + { + "entropy": 1.7612206950783729, + "epoch": 0.34046339042080537, + "grad_norm": 8.661210060119629, + "learning_rate": 4.3357022202028685e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8550508677959442, + "num_tokens": 132041782.0, + "step": 109830 + }, + { + "entropy": 1.8841624334454536, + "epoch": 0.3404943895458551, + "grad_norm": 8.303897857666016, + "learning_rate": 4.335504849464432e-06, + "loss": 0.543, + "mean_token_accuracy": 0.8422963932156563, + "num_tokens": 132053535.0, + "step": 109840 + }, + { + "entropy": 1.7789085522294044, + "epoch": 0.34052538867090476, + "grad_norm": 3.7404472827911377, + "learning_rate": 4.335307505677798e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8541888251900673, + "num_tokens": 132066748.0, + "step": 109850 + }, + { + "entropy": 1.8748383790254592, + "epoch": 0.3405563877959545, + "grad_norm": 7.891180515289307, + "learning_rate": 4.335110188836832e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8546398028731346, + "num_tokens": 132078371.0, + "step": 109860 + }, + { + "entropy": 1.8530076041817665, + "epoch": 0.34058738692100415, + "grad_norm": 8.61839771270752, + "learning_rate": 4.334912898935402e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.845193374156952, + "num_tokens": 132090696.0, + "step": 109870 + }, + { + "entropy": 1.9376697182655334, + "epoch": 0.3406183860460538, + "grad_norm": 10.494202613830566, + "learning_rate": 4.334715635967379e-06, + "loss": 0.5363, + "mean_token_accuracy": 0.8341496884822845, + "num_tokens": 132101490.0, + "step": 109880 + }, + { + "entropy": 1.8309767156839372, + "epoch": 0.34064938517110355, + "grad_norm": 8.481719970703125, + "learning_rate": 4.334518399926636e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.85340007096529, + "num_tokens": 132114554.0, + "step": 109890 + }, + { + "entropy": 1.8747343346476555, + "epoch": 0.3406803842961532, + "grad_norm": 9.323307037353516, + "learning_rate": 4.334321190807049e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8413134127855301, + "num_tokens": 132126561.0, + "step": 109900 + }, + { + "entropy": 1.8665272369980812, + "epoch": 0.34071138342120294, + "grad_norm": 9.879739761352539, + "learning_rate": 4.334124008602491e-06, + "loss": 0.5596, + "mean_token_accuracy": 0.8321680203080177, + "num_tokens": 132137909.0, + "step": 109910 + }, + { + "entropy": 1.872728843986988, + "epoch": 0.3407423825462526, + "grad_norm": 9.696663856506348, + "learning_rate": 4.333926853306841e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8571437358856201, + "num_tokens": 132149808.0, + "step": 109920 + }, + { + "entropy": 1.9334780007600785, + "epoch": 0.34077338167130233, + "grad_norm": 8.289495468139648, + "learning_rate": 4.333729724913981e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.837024663388729, + "num_tokens": 132160716.0, + "step": 109930 + }, + { + "entropy": 1.879612100124359, + "epoch": 0.340804380796352, + "grad_norm": 8.466344833374023, + "learning_rate": 4.333532623417792e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8417725265026093, + "num_tokens": 132172002.0, + "step": 109940 + }, + { + "entropy": 1.8401907190680504, + "epoch": 0.3408353799214017, + "grad_norm": 8.750163078308105, + "learning_rate": 4.333335548812158e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8564073830842972, + "num_tokens": 132183522.0, + "step": 109950 + }, + { + "entropy": 1.7649420738220214, + "epoch": 0.3408663790464514, + "grad_norm": 7.617003917694092, + "learning_rate": 4.333138501090967e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8601633608341217, + "num_tokens": 132197109.0, + "step": 109960 + }, + { + "entropy": 1.823840895295143, + "epoch": 0.3408973781715011, + "grad_norm": 4.131731986999512, + "learning_rate": 4.332941480248105e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8559944689273834, + "num_tokens": 132209939.0, + "step": 109970 + }, + { + "entropy": 1.8967970684170723, + "epoch": 0.3409283772965508, + "grad_norm": 8.258302688598633, + "learning_rate": 4.332744486277461e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8519051864743232, + "num_tokens": 132220802.0, + "step": 109980 + }, + { + "entropy": 1.8975848436355591, + "epoch": 0.3409593764216005, + "grad_norm": 9.49807071685791, + "learning_rate": 4.332547519172929e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8441584467887878, + "num_tokens": 132232580.0, + "step": 109990 + }, + { + "entropy": 1.7971587955951691, + "epoch": 0.3409903755466502, + "grad_norm": 8.716933250427246, + "learning_rate": 4.332350578928402e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8555277913808823, + "num_tokens": 132244660.0, + "step": 110000 + }, + { + "entropy": 1.7943158000707626, + "epoch": 0.3410213746716999, + "grad_norm": 9.320921897888184, + "learning_rate": 4.332153665537777e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8571292623877526, + "num_tokens": 132257556.0, + "step": 110010 + }, + { + "entropy": 1.883206208050251, + "epoch": 0.3410523737967496, + "grad_norm": 7.559988021850586, + "learning_rate": 4.331956778994951e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8475514382123948, + "num_tokens": 132269837.0, + "step": 110020 + }, + { + "entropy": 1.8622760981321336, + "epoch": 0.3410833729217993, + "grad_norm": 8.432063102722168, + "learning_rate": 4.331759919293823e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8502196714282035, + "num_tokens": 132281620.0, + "step": 110030 + }, + { + "entropy": 1.8266623839735985, + "epoch": 0.34111437204684897, + "grad_norm": 7.229222774505615, + "learning_rate": 4.331563086428295e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8582778573036194, + "num_tokens": 132293944.0, + "step": 110040 + }, + { + "entropy": 1.8780689999461173, + "epoch": 0.3411453711718987, + "grad_norm": 9.767049789428711, + "learning_rate": 4.331366280392271e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8368968412280082, + "num_tokens": 132306054.0, + "step": 110050 + }, + { + "entropy": 1.9592034816741943, + "epoch": 0.34117637029694836, + "grad_norm": 8.0049467086792, + "learning_rate": 4.331169501179655e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.8449469640851021, + "num_tokens": 132316729.0, + "step": 110060 + }, + { + "entropy": 1.8753671124577522, + "epoch": 0.3412073694219981, + "grad_norm": 8.132515907287598, + "learning_rate": 4.330972748784358e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8468053087592124, + "num_tokens": 132328733.0, + "step": 110070 + }, + { + "entropy": 1.9104474276304244, + "epoch": 0.34123836854704775, + "grad_norm": 9.560647010803223, + "learning_rate": 4.330776023200286e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.847663089632988, + "num_tokens": 132339483.0, + "step": 110080 + }, + { + "entropy": 1.7548475116491318, + "epoch": 0.3412693676720975, + "grad_norm": 7.8430280685424805, + "learning_rate": 4.330579324421352e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8698502600193023, + "num_tokens": 132352754.0, + "step": 110090 + }, + { + "entropy": 1.778562317788601, + "epoch": 0.34130036679714715, + "grad_norm": 3.4906435012817383, + "learning_rate": 4.330382652441468e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8567349463701248, + "num_tokens": 132366131.0, + "step": 110100 + }, + { + "entropy": 1.8327470771968364, + "epoch": 0.34133136592219687, + "grad_norm": 9.07868766784668, + "learning_rate": 4.33018600725455e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8581578373908997, + "num_tokens": 132377583.0, + "step": 110110 + }, + { + "entropy": 1.8057816326618195, + "epoch": 0.34136236504724654, + "grad_norm": 9.56470775604248, + "learning_rate": 4.329989388854515e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8603610098361969, + "num_tokens": 132389492.0, + "step": 110120 + }, + { + "entropy": 1.8053456172347069, + "epoch": 0.3413933641722962, + "grad_norm": 8.362168312072754, + "learning_rate": 4.3297927972352825e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8586013182997704, + "num_tokens": 132401915.0, + "step": 110130 + }, + { + "entropy": 1.8421172469854354, + "epoch": 0.34142436329734593, + "grad_norm": 5.665745735168457, + "learning_rate": 4.3295962323907735e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8431659877300263, + "num_tokens": 132414324.0, + "step": 110140 + }, + { + "entropy": 1.8274363920092582, + "epoch": 0.3414553624223956, + "grad_norm": 4.698655128479004, + "learning_rate": 4.3293996943149094e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8378838330507279, + "num_tokens": 132426353.0, + "step": 110150 + }, + { + "entropy": 1.8264761924743653, + "epoch": 0.3414863615474453, + "grad_norm": 8.485648155212402, + "learning_rate": 4.329203183001617e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8489476129412651, + "num_tokens": 132438108.0, + "step": 110160 + }, + { + "entropy": 1.7348950892686843, + "epoch": 0.341517360672495, + "grad_norm": 3.869995355606079, + "learning_rate": 4.329006698444822e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8635307684540748, + "num_tokens": 132452376.0, + "step": 110170 + }, + { + "entropy": 1.8900276586413383, + "epoch": 0.3415483597975447, + "grad_norm": 7.771602630615234, + "learning_rate": 4.3288102406384535e-06, + "loss": 0.5439, + "mean_token_accuracy": 0.8306577250361442, + "num_tokens": 132463778.0, + "step": 110180 + }, + { + "entropy": 1.816038003563881, + "epoch": 0.3415793589225944, + "grad_norm": 3.8563501834869385, + "learning_rate": 4.3286138095764414e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8512075647711754, + "num_tokens": 132476109.0, + "step": 110190 + }, + { + "entropy": 1.7983648508787156, + "epoch": 0.3416103580476441, + "grad_norm": 6.0994873046875, + "learning_rate": 4.328417405252719e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.857032585144043, + "num_tokens": 132489219.0, + "step": 110200 + }, + { + "entropy": 1.7746062129735947, + "epoch": 0.3416413571726938, + "grad_norm": 7.233607769012451, + "learning_rate": 4.3282210276612215e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8589346572756767, + "num_tokens": 132501835.0, + "step": 110210 + }, + { + "entropy": 1.854676540195942, + "epoch": 0.3416723562977435, + "grad_norm": 8.12541389465332, + "learning_rate": 4.328024676795884e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8524368569254875, + "num_tokens": 132513519.0, + "step": 110220 + }, + { + "entropy": 1.7707249775528908, + "epoch": 0.3417033554227932, + "grad_norm": 11.075340270996094, + "learning_rate": 4.3278283526506465e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8538433343172074, + "num_tokens": 132526640.0, + "step": 110230 + }, + { + "entropy": 1.896279625594616, + "epoch": 0.3417343545478429, + "grad_norm": 10.801493644714355, + "learning_rate": 4.3276320552194465e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8482535511255265, + "num_tokens": 132538190.0, + "step": 110240 + }, + { + "entropy": 1.8617658391594887, + "epoch": 0.34176535367289257, + "grad_norm": 7.516333103179932, + "learning_rate": 4.32743578449623e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8491196766495704, + "num_tokens": 132550185.0, + "step": 110250 + }, + { + "entropy": 1.8173480436205864, + "epoch": 0.3417963527979423, + "grad_norm": 8.738038063049316, + "learning_rate": 4.327239540474937e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.836649663746357, + "num_tokens": 132563493.0, + "step": 110260 + }, + { + "entropy": 1.9029010236263275, + "epoch": 0.34182735192299196, + "grad_norm": 8.454696655273438, + "learning_rate": 4.3270433231495165e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8503329768776894, + "num_tokens": 132575617.0, + "step": 110270 + }, + { + "entropy": 1.8139191955327987, + "epoch": 0.3418583510480417, + "grad_norm": 3.524780750274658, + "learning_rate": 4.326847132513916e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8617829859256745, + "num_tokens": 132588755.0, + "step": 110280 + }, + { + "entropy": 1.7936020240187645, + "epoch": 0.34188935017309136, + "grad_norm": 8.44808578491211, + "learning_rate": 4.326650968562085e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8550214633345604, + "num_tokens": 132601884.0, + "step": 110290 + }, + { + "entropy": 1.8529448464512825, + "epoch": 0.3419203492981411, + "grad_norm": 4.374883651733398, + "learning_rate": 4.3264548312879736e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8531146451830864, + "num_tokens": 132614584.0, + "step": 110300 + }, + { + "entropy": 1.800635115802288, + "epoch": 0.34195134842319075, + "grad_norm": 4.555408954620361, + "learning_rate": 4.326258720685538e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8581522032618523, + "num_tokens": 132627313.0, + "step": 110310 + }, + { + "entropy": 1.7813701510429383, + "epoch": 0.3419823475482405, + "grad_norm": 4.661769390106201, + "learning_rate": 4.326062636748733e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8568063646554946, + "num_tokens": 132640535.0, + "step": 110320 + }, + { + "entropy": 1.8333582818508147, + "epoch": 0.34201334667329014, + "grad_norm": 8.417710304260254, + "learning_rate": 4.325866579471516e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.839620991051197, + "num_tokens": 132652723.0, + "step": 110330 + }, + { + "entropy": 1.7189094245433807, + "epoch": 0.34204434579833987, + "grad_norm": 6.682080268859863, + "learning_rate": 4.325670548847847e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8616839155554772, + "num_tokens": 132667020.0, + "step": 110340 + }, + { + "entropy": 1.9014250546693803, + "epoch": 0.34207534492338953, + "grad_norm": 9.291830062866211, + "learning_rate": 4.325474544871687e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8418972924351692, + "num_tokens": 132678104.0, + "step": 110350 + }, + { + "entropy": 1.8418912634253501, + "epoch": 0.34210634404843926, + "grad_norm": 9.592106819152832, + "learning_rate": 4.325278567536999e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8414492189884186, + "num_tokens": 132691635.0, + "step": 110360 + }, + { + "entropy": 1.8121645241975783, + "epoch": 0.34213734317348893, + "grad_norm": 10.184158325195312, + "learning_rate": 4.325082616837749e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8527585849165916, + "num_tokens": 132703725.0, + "step": 110370 + }, + { + "entropy": 1.8687279969453812, + "epoch": 0.3421683422985386, + "grad_norm": 7.617676734924316, + "learning_rate": 4.324886692767904e-06, + "loss": 0.5293, + "mean_token_accuracy": 0.8405532166361809, + "num_tokens": 132715491.0, + "step": 110380 + }, + { + "entropy": 1.8479404971003532, + "epoch": 0.3421993414235883, + "grad_norm": 8.61865520477295, + "learning_rate": 4.324690795321433e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8454162836074829, + "num_tokens": 132727389.0, + "step": 110390 + }, + { + "entropy": 1.8582603454589843, + "epoch": 0.342230340548638, + "grad_norm": 8.203682899475098, + "learning_rate": 4.324494924492305e-06, + "loss": 0.5359, + "mean_token_accuracy": 0.8352208867669105, + "num_tokens": 132739602.0, + "step": 110400 + }, + { + "entropy": 1.8665910184383392, + "epoch": 0.3422613396736877, + "grad_norm": 8.948098182678223, + "learning_rate": 4.324299080274496e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8546964704990387, + "num_tokens": 132751570.0, + "step": 110410 + }, + { + "entropy": 1.870258367061615, + "epoch": 0.3422923387987374, + "grad_norm": 7.890269756317139, + "learning_rate": 4.32410326266198e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8475801780819893, + "num_tokens": 132763398.0, + "step": 110420 + }, + { + "entropy": 1.841132828593254, + "epoch": 0.3423233379237871, + "grad_norm": 6.834152698516846, + "learning_rate": 4.3239074716487314e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8546931356191635, + "num_tokens": 132775491.0, + "step": 110430 + }, + { + "entropy": 1.9839861959218978, + "epoch": 0.3423543370488368, + "grad_norm": 9.827447891235352, + "learning_rate": 4.323711707228732e-06, + "loss": 0.5771, + "mean_token_accuracy": 0.8222515046596527, + "num_tokens": 132786260.0, + "step": 110440 + }, + { + "entropy": 1.869850617647171, + "epoch": 0.3423853361738865, + "grad_norm": 7.602901935577393, + "learning_rate": 4.323515969395961e-06, + "loss": 0.487, + "mean_token_accuracy": 0.849778589606285, + "num_tokens": 132798632.0, + "step": 110450 + }, + { + "entropy": 1.8364197805523872, + "epoch": 0.34241633529893617, + "grad_norm": 6.836818695068359, + "learning_rate": 4.3233202581444e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8704248234629631, + "num_tokens": 132811082.0, + "step": 110460 + }, + { + "entropy": 1.76078250259161, + "epoch": 0.3424473344239859, + "grad_norm": 3.4893312454223633, + "learning_rate": 4.323124573468033e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8747459024190902, + "num_tokens": 132824398.0, + "step": 110470 + }, + { + "entropy": 1.9431472271680832, + "epoch": 0.34247833354903556, + "grad_norm": 8.165310859680176, + "learning_rate": 4.3229289153608484e-06, + "loss": 0.5315, + "mean_token_accuracy": 0.8350071504712104, + "num_tokens": 132835017.0, + "step": 110480 + }, + { + "entropy": 1.828757031261921, + "epoch": 0.3425093326740853, + "grad_norm": 5.19187593460083, + "learning_rate": 4.3227332838168335e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8579125210642815, + "num_tokens": 132847063.0, + "step": 110490 + }, + { + "entropy": 1.907851468026638, + "epoch": 0.34254033179913496, + "grad_norm": 8.472184181213379, + "learning_rate": 4.3225376788299765e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8507169172167778, + "num_tokens": 132858770.0, + "step": 110500 + }, + { + "entropy": 1.9092920407652856, + "epoch": 0.3425713309241847, + "grad_norm": 8.181653022766113, + "learning_rate": 4.322342100394272e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8374480813741684, + "num_tokens": 132869792.0, + "step": 110510 + }, + { + "entropy": 1.801915517449379, + "epoch": 0.34260233004923435, + "grad_norm": 9.726790428161621, + "learning_rate": 4.322146548503712e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8696004092693329, + "num_tokens": 132882414.0, + "step": 110520 + }, + { + "entropy": 1.871196947991848, + "epoch": 0.3426333291742841, + "grad_norm": 8.217884063720703, + "learning_rate": 4.321951023152293e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8464583426713943, + "num_tokens": 132894145.0, + "step": 110530 + }, + { + "entropy": 1.8100114524364472, + "epoch": 0.34266432829933374, + "grad_norm": 9.168874740600586, + "learning_rate": 4.321755524334014e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8399093985557556, + "num_tokens": 132907133.0, + "step": 110540 + }, + { + "entropy": 1.8548692613840103, + "epoch": 0.34269532742438347, + "grad_norm": 8.339598655700684, + "learning_rate": 4.3215600520428715e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.8403585880994797, + "num_tokens": 132918583.0, + "step": 110550 + }, + { + "entropy": 1.8597151398658753, + "epoch": 0.34272632654943314, + "grad_norm": 8.382675170898438, + "learning_rate": 4.3213646062728695e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8416018903255462, + "num_tokens": 132930506.0, + "step": 110560 + }, + { + "entropy": 1.8886392131447791, + "epoch": 0.34275732567448286, + "grad_norm": 4.080488204956055, + "learning_rate": 4.321169187018011e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8549024015665054, + "num_tokens": 132942640.0, + "step": 110570 + }, + { + "entropy": 1.7458569899201393, + "epoch": 0.34278832479953253, + "grad_norm": 5.599550724029541, + "learning_rate": 4.3209737942722985e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8586847379803657, + "num_tokens": 132956699.0, + "step": 110580 + }, + { + "entropy": 1.8726666495203972, + "epoch": 0.34281932392458225, + "grad_norm": 7.666844367980957, + "learning_rate": 4.320778428029743e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8506060406565666, + "num_tokens": 132967858.0, + "step": 110590 + }, + { + "entropy": 1.8701406121253967, + "epoch": 0.3428503230496319, + "grad_norm": 9.031076431274414, + "learning_rate": 4.320583088284352e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8525993511080742, + "num_tokens": 132979844.0, + "step": 110600 + }, + { + "entropy": 1.7444271817803383, + "epoch": 0.34288132217468165, + "grad_norm": 3.681898593902588, + "learning_rate": 4.320387775030135e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8686041355133056, + "num_tokens": 132993483.0, + "step": 110610 + }, + { + "entropy": 1.8315519198775292, + "epoch": 0.3429123212997313, + "grad_norm": 13.511602401733398, + "learning_rate": 4.320192488261108e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8444625332951545, + "num_tokens": 133006258.0, + "step": 110620 + }, + { + "entropy": 1.8297417625784873, + "epoch": 0.342943320424781, + "grad_norm": 4.069921016693115, + "learning_rate": 4.319997227971282e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8498388364911079, + "num_tokens": 133018876.0, + "step": 110630 + }, + { + "entropy": 1.9210706606507302, + "epoch": 0.3429743195498307, + "grad_norm": 9.114517211914062, + "learning_rate": 4.319801994154677e-06, + "loss": 0.536, + "mean_token_accuracy": 0.8308271437883377, + "num_tokens": 133030224.0, + "step": 110640 + }, + { + "entropy": 1.8908802688121795, + "epoch": 0.3430053186748804, + "grad_norm": 9.932719230651855, + "learning_rate": 4.319606786805309e-06, + "loss": 0.529, + "mean_token_accuracy": 0.8441593378782273, + "num_tokens": 133041073.0, + "step": 110650 + }, + { + "entropy": 1.8904657304286956, + "epoch": 0.3430363177999301, + "grad_norm": 7.985291957855225, + "learning_rate": 4.3194116059172e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.8325783342123032, + "num_tokens": 133052277.0, + "step": 110660 + }, + { + "entropy": 1.9574284851551056, + "epoch": 0.34306731692497977, + "grad_norm": 7.235466957092285, + "learning_rate": 4.319216451484371e-06, + "loss": 0.5211, + "mean_token_accuracy": 0.8355097994208336, + "num_tokens": 133064090.0, + "step": 110670 + }, + { + "entropy": 1.816710241138935, + "epoch": 0.3430983160500295, + "grad_norm": 8.71338939666748, + "learning_rate": 4.319021323500848e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8443104445934295, + "num_tokens": 133076199.0, + "step": 110680 + }, + { + "entropy": 1.843747340142727, + "epoch": 0.34312931517507916, + "grad_norm": 6.804162502288818, + "learning_rate": 4.318826221960655e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8444552600383759, + "num_tokens": 133088227.0, + "step": 110690 + }, + { + "entropy": 1.8602527409791947, + "epoch": 0.3431603143001289, + "grad_norm": 7.936315059661865, + "learning_rate": 4.318631146857822e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8574521571397782, + "num_tokens": 133100175.0, + "step": 110700 + }, + { + "entropy": 1.8244513690471649, + "epoch": 0.34319131342517856, + "grad_norm": 11.78140926361084, + "learning_rate": 4.318436098186376e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8596523448824882, + "num_tokens": 133111645.0, + "step": 110710 + }, + { + "entropy": 1.8955079719424248, + "epoch": 0.3432223125502283, + "grad_norm": 5.455853462219238, + "learning_rate": 4.318241075940353e-06, + "loss": 0.5204, + "mean_token_accuracy": 0.8417312771081924, + "num_tokens": 133122905.0, + "step": 110720 + }, + { + "entropy": 1.898301364481449, + "epoch": 0.34325331167527795, + "grad_norm": 8.922945976257324, + "learning_rate": 4.318046080113783e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8424309939146042, + "num_tokens": 133134095.0, + "step": 110730 + }, + { + "entropy": 1.8243930265307426, + "epoch": 0.3432843108003277, + "grad_norm": 4.409710884094238, + "learning_rate": 4.317851110700703e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8556462466716767, + "num_tokens": 133146666.0, + "step": 110740 + }, + { + "entropy": 1.9261058151721955, + "epoch": 0.34331530992537734, + "grad_norm": 12.174426078796387, + "learning_rate": 4.31765616769515e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8526764348149299, + "num_tokens": 133157255.0, + "step": 110750 + }, + { + "entropy": 1.911964663863182, + "epoch": 0.34334630905042707, + "grad_norm": 8.915404319763184, + "learning_rate": 4.317461251091163e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8492590352892876, + "num_tokens": 133168516.0, + "step": 110760 + }, + { + "entropy": 1.8462224438786508, + "epoch": 0.34337730817547674, + "grad_norm": 9.527721405029297, + "learning_rate": 4.317266360882783e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8471042737364769, + "num_tokens": 133180707.0, + "step": 110770 + }, + { + "entropy": 1.821668528020382, + "epoch": 0.34340830730052646, + "grad_norm": 3.8620917797088623, + "learning_rate": 4.3170714970640535e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.859420596063137, + "num_tokens": 133192497.0, + "step": 110780 + }, + { + "entropy": 1.875546859204769, + "epoch": 0.34343930642557613, + "grad_norm": 7.777998924255371, + "learning_rate": 4.3168766596290205e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8493057772517204, + "num_tokens": 133204389.0, + "step": 110790 + }, + { + "entropy": 1.807924547791481, + "epoch": 0.34347030555062585, + "grad_norm": 12.520805358886719, + "learning_rate": 4.31668184857173e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8617511481046677, + "num_tokens": 133217528.0, + "step": 110800 + }, + { + "entropy": 1.8352250516414643, + "epoch": 0.3435013046756755, + "grad_norm": 5.211875915527344, + "learning_rate": 4.31648706388623e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8534446090459824, + "num_tokens": 133229712.0, + "step": 110810 + }, + { + "entropy": 1.8670003831386566, + "epoch": 0.34353230380072525, + "grad_norm": 7.83875846862793, + "learning_rate": 4.316292305566571e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.850574080646038, + "num_tokens": 133241353.0, + "step": 110820 + }, + { + "entropy": 1.887353539466858, + "epoch": 0.3435633029257749, + "grad_norm": 10.31457233428955, + "learning_rate": 4.316097573606808e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8490018859505654, + "num_tokens": 133252486.0, + "step": 110830 + }, + { + "entropy": 1.891994397342205, + "epoch": 0.34359430205082464, + "grad_norm": 8.610884666442871, + "learning_rate": 4.315902868000992e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8397418484091759, + "num_tokens": 133264441.0, + "step": 110840 + }, + { + "entropy": 1.8664520055055618, + "epoch": 0.3436253011758743, + "grad_norm": 7.726016044616699, + "learning_rate": 4.3157081887431804e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8486996784806251, + "num_tokens": 133276898.0, + "step": 110850 + }, + { + "entropy": 1.884059876203537, + "epoch": 0.34365630030092403, + "grad_norm": 4.186914443969727, + "learning_rate": 4.315513535827431e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8496600687503815, + "num_tokens": 133288536.0, + "step": 110860 + }, + { + "entropy": 1.9221114248037339, + "epoch": 0.3436872994259737, + "grad_norm": 9.337282180786133, + "learning_rate": 4.315318909247805e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.842443785071373, + "num_tokens": 133299978.0, + "step": 110870 + }, + { + "entropy": 1.8485377907752991, + "epoch": 0.34371829855102337, + "grad_norm": 9.948966979980469, + "learning_rate": 4.315124308998364e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8506104618310928, + "num_tokens": 133311745.0, + "step": 110880 + }, + { + "entropy": 1.8648578524589539, + "epoch": 0.3437492976760731, + "grad_norm": 7.327975749969482, + "learning_rate": 4.31492973507317e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8568118900060654, + "num_tokens": 133323824.0, + "step": 110890 + }, + { + "entropy": 1.9047629460692406, + "epoch": 0.34378029680112276, + "grad_norm": 8.159683227539062, + "learning_rate": 4.314735187466291e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8451401859521865, + "num_tokens": 133335255.0, + "step": 110900 + }, + { + "entropy": 1.925184278190136, + "epoch": 0.3438112959261725, + "grad_norm": 4.655861854553223, + "learning_rate": 4.3145406661717925e-06, + "loss": 0.5484, + "mean_token_accuracy": 0.826390016078949, + "num_tokens": 133347385.0, + "step": 110910 + }, + { + "entropy": 1.9013187378644942, + "epoch": 0.34384229505122216, + "grad_norm": 7.1715922355651855, + "learning_rate": 4.3143461711837445e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.8376141622662544, + "num_tokens": 133358459.0, + "step": 110920 + }, + { + "entropy": 1.9203463315963745, + "epoch": 0.3438732941762719, + "grad_norm": 10.447772026062012, + "learning_rate": 4.314151702496219e-06, + "loss": 0.5415, + "mean_token_accuracy": 0.8378437146544456, + "num_tokens": 133369315.0, + "step": 110930 + }, + { + "entropy": 1.926878383755684, + "epoch": 0.34390429330132155, + "grad_norm": 7.588080406188965, + "learning_rate": 4.313957260103287e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8456119611859322, + "num_tokens": 133379860.0, + "step": 110940 + }, + { + "entropy": 1.714067880809307, + "epoch": 0.3439352924263713, + "grad_norm": 4.141031265258789, + "learning_rate": 4.313762843999025e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8552543878555298, + "num_tokens": 133394752.0, + "step": 110950 + }, + { + "entropy": 1.7716796442866325, + "epoch": 0.34396629155142094, + "grad_norm": 8.312151908874512, + "learning_rate": 4.31356845417751e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8612663641571998, + "num_tokens": 133407887.0, + "step": 110960 + }, + { + "entropy": 1.909244528412819, + "epoch": 0.34399729067647067, + "grad_norm": 9.186685562133789, + "learning_rate": 4.31337409063282e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.8425434127449989, + "num_tokens": 133419016.0, + "step": 110970 + }, + { + "entropy": 1.831558196246624, + "epoch": 0.34402828980152034, + "grad_norm": 7.182348728179932, + "learning_rate": 4.3131797533590354e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8547964975237846, + "num_tokens": 133431120.0, + "step": 110980 + }, + { + "entropy": 1.923382543027401, + "epoch": 0.34405928892657006, + "grad_norm": 7.744353771209717, + "learning_rate": 4.31298544235024e-06, + "loss": 0.5213, + "mean_token_accuracy": 0.8368471801280976, + "num_tokens": 133442547.0, + "step": 110990 + }, + { + "entropy": 1.8427509516477585, + "epoch": 0.34409028805161973, + "grad_norm": 8.841486930847168, + "learning_rate": 4.312791157600516e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8453793004155159, + "num_tokens": 133455042.0, + "step": 111000 + }, + { + "entropy": 1.854433636367321, + "epoch": 0.34412128717666945, + "grad_norm": 8.208456039428711, + "learning_rate": 4.312596899103951e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8534053891897202, + "num_tokens": 133466808.0, + "step": 111010 + }, + { + "entropy": 1.936078244447708, + "epoch": 0.3441522863017191, + "grad_norm": 10.055807113647461, + "learning_rate": 4.312402666854633e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8546758487820625, + "num_tokens": 133478433.0, + "step": 111020 + }, + { + "entropy": 1.8237747445702552, + "epoch": 0.34418328542676885, + "grad_norm": 10.912522315979004, + "learning_rate": 4.312208460846651e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.848297068476677, + "num_tokens": 133490903.0, + "step": 111030 + }, + { + "entropy": 1.8850932896137238, + "epoch": 0.3442142845518185, + "grad_norm": 8.489951133728027, + "learning_rate": 4.312014281074098e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8428701400756836, + "num_tokens": 133503256.0, + "step": 111040 + }, + { + "entropy": 1.80315043926239, + "epoch": 0.34424528367686824, + "grad_norm": 4.91754674911499, + "learning_rate": 4.311820127531066e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8500543773174286, + "num_tokens": 133515626.0, + "step": 111050 + }, + { + "entropy": 1.8291157126426696, + "epoch": 0.3442762828019179, + "grad_norm": 8.575767517089844, + "learning_rate": 4.3116260002116505e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8480419516563416, + "num_tokens": 133528210.0, + "step": 111060 + }, + { + "entropy": 1.8445888727903366, + "epoch": 0.34430728192696763, + "grad_norm": 7.410253524780273, + "learning_rate": 4.3114318991099505e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8509875819087028, + "num_tokens": 133539348.0, + "step": 111070 + }, + { + "entropy": 1.7617506772279738, + "epoch": 0.3443382810520173, + "grad_norm": 4.4981770515441895, + "learning_rate": 4.311237824220064e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8568379059433937, + "num_tokens": 133552472.0, + "step": 111080 + }, + { + "entropy": 1.845915214717388, + "epoch": 0.344369280177067, + "grad_norm": 9.03842544555664, + "learning_rate": 4.311043775536092e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.8473999381065369, + "num_tokens": 133564707.0, + "step": 111090 + }, + { + "entropy": 1.685810850560665, + "epoch": 0.3444002793021167, + "grad_norm": 4.30578088760376, + "learning_rate": 4.310849753052138e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.864673687517643, + "num_tokens": 133579259.0, + "step": 111100 + }, + { + "entropy": 1.8005015209317208, + "epoch": 0.3444312784271664, + "grad_norm": 9.097541809082031, + "learning_rate": 4.310655756762307e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8453745514154434, + "num_tokens": 133591953.0, + "step": 111110 + }, + { + "entropy": 1.8135478526353837, + "epoch": 0.3444622775522161, + "grad_norm": 3.2069530487060547, + "learning_rate": 4.310461786660705e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8551574990153312, + "num_tokens": 133604208.0, + "step": 111120 + }, + { + "entropy": 1.873196867108345, + "epoch": 0.34449327667726576, + "grad_norm": 8.344642639160156, + "learning_rate": 4.310267842741439e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8539148271083832, + "num_tokens": 133616056.0, + "step": 111130 + }, + { + "entropy": 1.9149486266076565, + "epoch": 0.3445242758023155, + "grad_norm": 8.610182762145996, + "learning_rate": 4.3100739249986225e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8442770466208458, + "num_tokens": 133627877.0, + "step": 111140 + }, + { + "entropy": 1.910437923669815, + "epoch": 0.34455527492736515, + "grad_norm": 8.95980167388916, + "learning_rate": 4.309880033426365e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.8403397217392922, + "num_tokens": 133639187.0, + "step": 111150 + }, + { + "entropy": 1.9167566150426865, + "epoch": 0.3445862740524149, + "grad_norm": 9.184028625488281, + "learning_rate": 4.3096861680187815e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8474911168217659, + "num_tokens": 133650200.0, + "step": 111160 + }, + { + "entropy": 1.9442434057593345, + "epoch": 0.34461727317746454, + "grad_norm": 9.074106216430664, + "learning_rate": 4.309492328769988e-06, + "loss": 0.5261, + "mean_token_accuracy": 0.8301569879055023, + "num_tokens": 133661615.0, + "step": 111170 + }, + { + "entropy": 1.8889770179986953, + "epoch": 0.34464827230251427, + "grad_norm": 11.795031547546387, + "learning_rate": 4.309298515674102e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8382586434483528, + "num_tokens": 133672713.0, + "step": 111180 + }, + { + "entropy": 1.8732004672288896, + "epoch": 0.34467927142756394, + "grad_norm": 7.575119495391846, + "learning_rate": 4.309104728725243e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8471902221441269, + "num_tokens": 133683668.0, + "step": 111190 + }, + { + "entropy": 1.9257921800017357, + "epoch": 0.34471027055261366, + "grad_norm": 7.821493148803711, + "learning_rate": 4.308910967917533e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8495068654417992, + "num_tokens": 133694848.0, + "step": 111200 + }, + { + "entropy": 1.876696328818798, + "epoch": 0.34474126967766333, + "grad_norm": 9.46831226348877, + "learning_rate": 4.308717233245096e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.847463445365429, + "num_tokens": 133706948.0, + "step": 111210 + }, + { + "entropy": 1.9034352615475654, + "epoch": 0.34477226880271306, + "grad_norm": 4.813274383544922, + "learning_rate": 4.3085235247020545e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8571040198206902, + "num_tokens": 133719003.0, + "step": 111220 + }, + { + "entropy": 1.8417223021388054, + "epoch": 0.3448032679277627, + "grad_norm": 8.651920318603516, + "learning_rate": 4.3083298422825375e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8489273890852929, + "num_tokens": 133731771.0, + "step": 111230 + }, + { + "entropy": 1.919362673163414, + "epoch": 0.34483426705281245, + "grad_norm": 7.571285724639893, + "learning_rate": 4.308136185980673e-06, + "loss": 0.5243, + "mean_token_accuracy": 0.839247289299965, + "num_tokens": 133742822.0, + "step": 111240 + }, + { + "entropy": 1.8016601279377937, + "epoch": 0.3448652661778621, + "grad_norm": 9.703665733337402, + "learning_rate": 4.307942555790593e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8540796846151352, + "num_tokens": 133755750.0, + "step": 111250 + }, + { + "entropy": 1.7813518926501275, + "epoch": 0.34489626530291184, + "grad_norm": 4.28598690032959, + "learning_rate": 4.3077489517064285e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8613515332341194, + "num_tokens": 133768622.0, + "step": 111260 + }, + { + "entropy": 1.8209292754530906, + "epoch": 0.3449272644279615, + "grad_norm": 8.168103218078613, + "learning_rate": 4.307555373722316e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8652269795536995, + "num_tokens": 133780564.0, + "step": 111270 + }, + { + "entropy": 1.8425551727414131, + "epoch": 0.34495826355301124, + "grad_norm": 8.19618034362793, + "learning_rate": 4.307361821832388e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8445751652121544, + "num_tokens": 133792677.0, + "step": 111280 + }, + { + "entropy": 1.824992810189724, + "epoch": 0.3449892626780609, + "grad_norm": 8.843597412109375, + "learning_rate": 4.307168296030786e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8552077680826187, + "num_tokens": 133805279.0, + "step": 111290 + }, + { + "entropy": 1.9060368582606315, + "epoch": 0.34502026180311063, + "grad_norm": 3.7033939361572266, + "learning_rate": 4.306974796311647e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.841067835688591, + "num_tokens": 133817633.0, + "step": 111300 + }, + { + "entropy": 1.864403063058853, + "epoch": 0.3450512609281603, + "grad_norm": 7.459976673126221, + "learning_rate": 4.306781322669116e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8469650357961654, + "num_tokens": 133829133.0, + "step": 111310 + }, + { + "entropy": 1.8203039526939393, + "epoch": 0.34508226005321, + "grad_norm": 9.59398365020752, + "learning_rate": 4.306587875097335e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8534867212176322, + "num_tokens": 133841674.0, + "step": 111320 + }, + { + "entropy": 1.843678180873394, + "epoch": 0.3451132591782597, + "grad_norm": 11.09599781036377, + "learning_rate": 4.306394453590449e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8509141758084298, + "num_tokens": 133853073.0, + "step": 111330 + }, + { + "entropy": 1.837014827132225, + "epoch": 0.3451442583033094, + "grad_norm": 7.771301746368408, + "learning_rate": 4.306201058142605e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8529363244771957, + "num_tokens": 133865482.0, + "step": 111340 + }, + { + "entropy": 1.8316177785396577, + "epoch": 0.3451752574283591, + "grad_norm": 3.7739436626434326, + "learning_rate": 4.3060076887479545e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8522952899336815, + "num_tokens": 133877076.0, + "step": 111350 + }, + { + "entropy": 1.8375495672225952, + "epoch": 0.34520625655340875, + "grad_norm": 2.9696900844573975, + "learning_rate": 4.305814345400645e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8476609244942666, + "num_tokens": 133889360.0, + "step": 111360 + }, + { + "entropy": 1.783223459124565, + "epoch": 0.3452372556784585, + "grad_norm": 7.960699081420898, + "learning_rate": 4.305621028094832e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8435281231999397, + "num_tokens": 133903252.0, + "step": 111370 + }, + { + "entropy": 1.84706342369318, + "epoch": 0.34526825480350815, + "grad_norm": 7.111573696136475, + "learning_rate": 4.305427736824668e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8439237996935844, + "num_tokens": 133915237.0, + "step": 111380 + }, + { + "entropy": 1.7956417188048364, + "epoch": 0.34529925392855787, + "grad_norm": 7.185604572296143, + "learning_rate": 4.305234471584312e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8640512511134147, + "num_tokens": 133927538.0, + "step": 111390 + }, + { + "entropy": 1.8561489313840867, + "epoch": 0.34533025305360754, + "grad_norm": 8.284113883972168, + "learning_rate": 4.3050412323679206e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8489404112100601, + "num_tokens": 133938940.0, + "step": 111400 + }, + { + "entropy": 1.8327260576188564, + "epoch": 0.34536125217865726, + "grad_norm": 3.422919750213623, + "learning_rate": 4.3048480191696544e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8489983096718788, + "num_tokens": 133951108.0, + "step": 111410 + }, + { + "entropy": 1.8617496728897094, + "epoch": 0.34539225130370693, + "grad_norm": 7.909846305847168, + "learning_rate": 4.304654831983675e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8476134285330772, + "num_tokens": 133963396.0, + "step": 111420 + }, + { + "entropy": 1.9265403121709823, + "epoch": 0.34542325042875666, + "grad_norm": 7.380890369415283, + "learning_rate": 4.304461670804146e-06, + "loss": 0.5297, + "mean_token_accuracy": 0.841274605691433, + "num_tokens": 133974112.0, + "step": 111430 + }, + { + "entropy": 1.8644670337438582, + "epoch": 0.3454542495538063, + "grad_norm": 8.807491302490234, + "learning_rate": 4.3042685356252335e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8540144443511963, + "num_tokens": 133985279.0, + "step": 111440 + }, + { + "entropy": 1.9059779167175293, + "epoch": 0.34548524867885605, + "grad_norm": 7.666346073150635, + "learning_rate": 4.304075426441105e-06, + "loss": 0.5396, + "mean_token_accuracy": 0.8371213942766189, + "num_tokens": 133996376.0, + "step": 111450 + }, + { + "entropy": 1.8472643613815307, + "epoch": 0.3455162478039057, + "grad_norm": 7.403988361358643, + "learning_rate": 4.3038823432459305e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8564211338758468, + "num_tokens": 134008017.0, + "step": 111460 + }, + { + "entropy": 1.8477882623672486, + "epoch": 0.34554724692895544, + "grad_norm": 7.693085670471191, + "learning_rate": 4.30368928603388e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8446563795208931, + "num_tokens": 134020115.0, + "step": 111470 + }, + { + "entropy": 1.8344331562519074, + "epoch": 0.3455782460540051, + "grad_norm": 4.255249977111816, + "learning_rate": 4.303496254799126e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8438744112849236, + "num_tokens": 134032355.0, + "step": 111480 + }, + { + "entropy": 1.8852899730205537, + "epoch": 0.34560924517905484, + "grad_norm": 9.30275821685791, + "learning_rate": 4.303303249535845e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8437858000397682, + "num_tokens": 134043633.0, + "step": 111490 + }, + { + "entropy": 1.7422870948910714, + "epoch": 0.3456402443041045, + "grad_norm": 7.4847941398620605, + "learning_rate": 4.3031102702382125e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8664668053388596, + "num_tokens": 134057736.0, + "step": 111500 + }, + { + "entropy": 1.7793061807751656, + "epoch": 0.34567124342915423, + "grad_norm": 3.7864532470703125, + "learning_rate": 4.302917316900407e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8605924814939498, + "num_tokens": 134070395.0, + "step": 111510 + }, + { + "entropy": 1.8568082675337791, + "epoch": 0.3457022425542039, + "grad_norm": 11.483708381652832, + "learning_rate": 4.302724389516609e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8510003834962845, + "num_tokens": 134082721.0, + "step": 111520 + }, + { + "entropy": 1.8132430225610734, + "epoch": 0.3457332416792536, + "grad_norm": 2.2380177974700928, + "learning_rate": 4.302531488081001e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8539024695754052, + "num_tokens": 134094940.0, + "step": 111530 + }, + { + "entropy": 1.7989331185817719, + "epoch": 0.3457642408043033, + "grad_norm": 3.9611406326293945, + "learning_rate": 4.302338612587765e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.856696180999279, + "num_tokens": 134108140.0, + "step": 111540 + }, + { + "entropy": 1.85663383603096, + "epoch": 0.345795239929353, + "grad_norm": 3.9444892406463623, + "learning_rate": 4.3021457630310894e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.860635980963707, + "num_tokens": 134119906.0, + "step": 111550 + }, + { + "entropy": 1.8466177895665168, + "epoch": 0.3458262390544027, + "grad_norm": 9.456862449645996, + "learning_rate": 4.30195293940516e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8576301142573357, + "num_tokens": 134132194.0, + "step": 111560 + }, + { + "entropy": 1.8773884311318398, + "epoch": 0.3458572381794524, + "grad_norm": 6.836810111999512, + "learning_rate": 4.301760141704167e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8493657246232033, + "num_tokens": 134143520.0, + "step": 111570 + }, + { + "entropy": 1.9150910288095475, + "epoch": 0.3458882373045021, + "grad_norm": 8.402644157409668, + "learning_rate": 4.301567369922301e-06, + "loss": 0.527, + "mean_token_accuracy": 0.8446568265557289, + "num_tokens": 134154428.0, + "step": 111580 + }, + { + "entropy": 1.911669085919857, + "epoch": 0.3459192364295518, + "grad_norm": 8.802173614501953, + "learning_rate": 4.301374624053757e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.849866247177124, + "num_tokens": 134165966.0, + "step": 111590 + }, + { + "entropy": 1.8143538311123848, + "epoch": 0.34595023555460147, + "grad_norm": 8.931032180786133, + "learning_rate": 4.301181904092727e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8532052963972092, + "num_tokens": 134178567.0, + "step": 111600 + }, + { + "entropy": 1.8561271876096725, + "epoch": 0.34598123467965114, + "grad_norm": 9.355246543884277, + "learning_rate": 4.300989210033409e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8413384512066842, + "num_tokens": 134190432.0, + "step": 111610 + }, + { + "entropy": 1.8068694084882737, + "epoch": 0.34601223380470086, + "grad_norm": 8.892166137695312, + "learning_rate": 4.3007965418700015e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8617716804146767, + "num_tokens": 134202936.0, + "step": 111620 + }, + { + "entropy": 1.819658127427101, + "epoch": 0.34604323292975053, + "grad_norm": 4.105354309082031, + "learning_rate": 4.300603899596706e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8501406639814377, + "num_tokens": 134215290.0, + "step": 111630 + }, + { + "entropy": 1.7999876148998737, + "epoch": 0.34607423205480026, + "grad_norm": 3.328152894973755, + "learning_rate": 4.300411283207722e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8598711341619492, + "num_tokens": 134228492.0, + "step": 111640 + }, + { + "entropy": 1.8138612404465675, + "epoch": 0.3461052311798499, + "grad_norm": 8.897016525268555, + "learning_rate": 4.300218692697255e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8562261596322059, + "num_tokens": 134240367.0, + "step": 111650 + }, + { + "entropy": 1.8443049594759942, + "epoch": 0.34613623030489965, + "grad_norm": 7.959166049957275, + "learning_rate": 4.300026128059511e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8506998106837272, + "num_tokens": 134252476.0, + "step": 111660 + }, + { + "entropy": 1.8540190950036048, + "epoch": 0.3461672294299493, + "grad_norm": 4.293795585632324, + "learning_rate": 4.2998335892886964e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8463447079062462, + "num_tokens": 134264385.0, + "step": 111670 + }, + { + "entropy": 1.8193232595920563, + "epoch": 0.34619822855499904, + "grad_norm": 8.931365966796875, + "learning_rate": 4.2996410763790225e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8522950485348701, + "num_tokens": 134276602.0, + "step": 111680 + }, + { + "entropy": 1.8343895703554154, + "epoch": 0.3462292276800487, + "grad_norm": 9.969979286193848, + "learning_rate": 4.2994485893247e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8539774760603904, + "num_tokens": 134288782.0, + "step": 111690 + }, + { + "entropy": 1.8512503013014794, + "epoch": 0.34626022680509844, + "grad_norm": 2.8974452018737793, + "learning_rate": 4.2992561281199405e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8387772515416145, + "num_tokens": 134301209.0, + "step": 111700 + }, + { + "entropy": 1.8505238316953183, + "epoch": 0.3462912259301481, + "grad_norm": 3.372145175933838, + "learning_rate": 4.29906369275896e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8435917362570763, + "num_tokens": 134313368.0, + "step": 111710 + }, + { + "entropy": 1.8507800638675689, + "epoch": 0.34632222505519783, + "grad_norm": 7.913898944854736, + "learning_rate": 4.2988712832359755e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8630237206816673, + "num_tokens": 134325077.0, + "step": 111720 + }, + { + "entropy": 1.7628626979887485, + "epoch": 0.3463532241802475, + "grad_norm": 3.487839698791504, + "learning_rate": 4.2986788995452044e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8562428668141365, + "num_tokens": 134339036.0, + "step": 111730 + }, + { + "entropy": 1.843147909641266, + "epoch": 0.3463842233052972, + "grad_norm": 6.786396026611328, + "learning_rate": 4.298486541680868e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8427472576498986, + "num_tokens": 134351445.0, + "step": 111740 + }, + { + "entropy": 1.807756444811821, + "epoch": 0.3464152224303469, + "grad_norm": 5.164430141448975, + "learning_rate": 4.298294209637186e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.853118185698986, + "num_tokens": 134365069.0, + "step": 111750 + }, + { + "entropy": 1.859677466750145, + "epoch": 0.3464462215553966, + "grad_norm": 9.513275146484375, + "learning_rate": 4.298101903408386e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8469240546226502, + "num_tokens": 134377457.0, + "step": 111760 + }, + { + "entropy": 1.8968846127390862, + "epoch": 0.3464772206804463, + "grad_norm": 4.018582344055176, + "learning_rate": 4.297909622988691e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8519986256957054, + "num_tokens": 134388907.0, + "step": 111770 + }, + { + "entropy": 1.9146791011095048, + "epoch": 0.346508219805496, + "grad_norm": 10.912361145019531, + "learning_rate": 4.297717368372331e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8396982088685035, + "num_tokens": 134400847.0, + "step": 111780 + }, + { + "entropy": 1.9298876509070397, + "epoch": 0.3465392189305457, + "grad_norm": 3.7877981662750244, + "learning_rate": 4.2975251395535315e-06, + "loss": 0.6157, + "mean_token_accuracy": 0.8234444096684456, + "num_tokens": 134412392.0, + "step": 111790 + }, + { + "entropy": 1.8450885564088821, + "epoch": 0.3465702180555954, + "grad_norm": 9.34116268157959, + "learning_rate": 4.297332936526527e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8459417134523392, + "num_tokens": 134424120.0, + "step": 111800 + }, + { + "entropy": 1.8955134615302085, + "epoch": 0.34660121718064507, + "grad_norm": 8.82641315460205, + "learning_rate": 4.297140759285549e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.852502977848053, + "num_tokens": 134435233.0, + "step": 111810 + }, + { + "entropy": 1.91280208081007, + "epoch": 0.3466322163056948, + "grad_norm": 7.556041240692139, + "learning_rate": 4.296948607824833e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8468595892190933, + "num_tokens": 134446764.0, + "step": 111820 + }, + { + "entropy": 1.7918421171605587, + "epoch": 0.34666321543074446, + "grad_norm": 8.250411033630371, + "learning_rate": 4.296756482138616e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8426727816462517, + "num_tokens": 134460455.0, + "step": 111830 + }, + { + "entropy": 1.8062309354543686, + "epoch": 0.3466942145557942, + "grad_norm": 8.455044746398926, + "learning_rate": 4.2965643822211335e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8552202418446541, + "num_tokens": 134473028.0, + "step": 111840 + }, + { + "entropy": 1.8862827733159064, + "epoch": 0.34672521368084386, + "grad_norm": 9.205554962158203, + "learning_rate": 4.2963723080666284e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8518849700689316, + "num_tokens": 134484150.0, + "step": 111850 + }, + { + "entropy": 1.874978469312191, + "epoch": 0.3467562128058935, + "grad_norm": 7.733843803405762, + "learning_rate": 4.2961802596693425e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8496404230594635, + "num_tokens": 134495656.0, + "step": 111860 + }, + { + "entropy": 1.7182648986577989, + "epoch": 0.34678721193094325, + "grad_norm": 3.7245407104492188, + "learning_rate": 4.295988237023518e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8628142789006233, + "num_tokens": 134509703.0, + "step": 111870 + }, + { + "entropy": 1.8488290548324584, + "epoch": 0.3468182110559929, + "grad_norm": 9.35959529876709, + "learning_rate": 4.295796240123402e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8483680263161659, + "num_tokens": 134522262.0, + "step": 111880 + }, + { + "entropy": 1.7668012008070946, + "epoch": 0.34684921018104264, + "grad_norm": 6.6548662185668945, + "learning_rate": 4.2956042689632414e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8533818453550339, + "num_tokens": 134534783.0, + "step": 111890 + }, + { + "entropy": 1.8939444333314897, + "epoch": 0.3468802093060923, + "grad_norm": 3.5856080055236816, + "learning_rate": 4.295412323537284e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8458682015538216, + "num_tokens": 134545993.0, + "step": 111900 + }, + { + "entropy": 1.8058057472109794, + "epoch": 0.34691120843114204, + "grad_norm": 8.451927185058594, + "learning_rate": 4.295220403839784e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8482358440756798, + "num_tokens": 134558581.0, + "step": 111910 + }, + { + "entropy": 1.92232054322958, + "epoch": 0.3469422075561917, + "grad_norm": 9.112445831298828, + "learning_rate": 4.29502850986499e-06, + "loss": 0.5232, + "mean_token_accuracy": 0.8385997459292411, + "num_tokens": 134570176.0, + "step": 111920 + }, + { + "entropy": 1.8965173974633216, + "epoch": 0.34697320668124143, + "grad_norm": 8.362327575683594, + "learning_rate": 4.294836641607161e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8484830498695374, + "num_tokens": 134581749.0, + "step": 111930 + }, + { + "entropy": 1.8018444642424583, + "epoch": 0.3470042058062911, + "grad_norm": 7.452921390533447, + "learning_rate": 4.294644799060549e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.861353749036789, + "num_tokens": 134595028.0, + "step": 111940 + }, + { + "entropy": 1.8184639766812325, + "epoch": 0.3470352049313408, + "grad_norm": 6.740150451660156, + "learning_rate": 4.2944529822194155e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8679483845829964, + "num_tokens": 134607592.0, + "step": 111950 + }, + { + "entropy": 1.8601836189627647, + "epoch": 0.3470662040563905, + "grad_norm": 7.418848991394043, + "learning_rate": 4.294261191078018e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.847012946009636, + "num_tokens": 134619234.0, + "step": 111960 + }, + { + "entropy": 1.8946197971701622, + "epoch": 0.3470972031814402, + "grad_norm": 10.559837341308594, + "learning_rate": 4.29406942563062e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8531604185700417, + "num_tokens": 134630843.0, + "step": 111970 + }, + { + "entropy": 1.8593143001198769, + "epoch": 0.3471282023064899, + "grad_norm": 9.078370094299316, + "learning_rate": 4.293877685871484e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8558160826563835, + "num_tokens": 134642923.0, + "step": 111980 + }, + { + "entropy": 1.8398296535015106, + "epoch": 0.3471592014315396, + "grad_norm": 7.763385772705078, + "learning_rate": 4.293685971794876e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8360516101121902, + "num_tokens": 134655091.0, + "step": 111990 + }, + { + "entropy": 1.893106034398079, + "epoch": 0.3471902005565893, + "grad_norm": 9.48266315460205, + "learning_rate": 4.293494283395064e-06, + "loss": 0.5324, + "mean_token_accuracy": 0.8328988835215568, + "num_tokens": 134666427.0, + "step": 112000 + }, + { + "entropy": 1.9177181079983712, + "epoch": 0.347221199681639, + "grad_norm": 7.473196029663086, + "learning_rate": 4.293302620666314e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8428963899612427, + "num_tokens": 134678226.0, + "step": 112010 + }, + { + "entropy": 1.903357920050621, + "epoch": 0.34725219880668867, + "grad_norm": 8.634038925170898, + "learning_rate": 4.293110983602899e-06, + "loss": 0.5214, + "mean_token_accuracy": 0.8499746888875961, + "num_tokens": 134688516.0, + "step": 112020 + }, + { + "entropy": 1.9164575353264808, + "epoch": 0.3472831979317384, + "grad_norm": 7.6614789962768555, + "learning_rate": 4.29291937219909e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.8393423616886139, + "num_tokens": 134699742.0, + "step": 112030 + }, + { + "entropy": 1.8141165480017662, + "epoch": 0.34731419705678807, + "grad_norm": 3.817478656768799, + "learning_rate": 4.292727786449164e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8489150524139404, + "num_tokens": 134713148.0, + "step": 112040 + }, + { + "entropy": 1.9213459312915802, + "epoch": 0.3473451961818378, + "grad_norm": 10.819091796875, + "learning_rate": 4.292536226347394e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.8452482402324677, + "num_tokens": 134723649.0, + "step": 112050 + }, + { + "entropy": 1.8041829138994216, + "epoch": 0.34737619530688746, + "grad_norm": 2.2280683517456055, + "learning_rate": 4.29234469188806e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8564143598079681, + "num_tokens": 134736057.0, + "step": 112060 + }, + { + "entropy": 1.8665026307106019, + "epoch": 0.3474071944319372, + "grad_norm": 8.091485023498535, + "learning_rate": 4.2921531830654395e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8456338688731193, + "num_tokens": 134746830.0, + "step": 112070 + }, + { + "entropy": 1.818691223859787, + "epoch": 0.34743819355698685, + "grad_norm": 9.684453964233398, + "learning_rate": 4.291961699873817e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8366310462355614, + "num_tokens": 134759170.0, + "step": 112080 + }, + { + "entropy": 1.8601401686668395, + "epoch": 0.3474691926820366, + "grad_norm": 8.760641098022461, + "learning_rate": 4.291770242307472e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8496808990836143, + "num_tokens": 134770727.0, + "step": 112090 + }, + { + "entropy": 1.782160858809948, + "epoch": 0.34750019180708624, + "grad_norm": 3.916487455368042, + "learning_rate": 4.291578810360692e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.859810471534729, + "num_tokens": 134783191.0, + "step": 112100 + }, + { + "entropy": 1.8653858050704002, + "epoch": 0.3475311909321359, + "grad_norm": 3.788681983947754, + "learning_rate": 4.291387404027763e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8446184307336807, + "num_tokens": 134794470.0, + "step": 112110 + }, + { + "entropy": 1.816742117702961, + "epoch": 0.34756219005718564, + "grad_norm": 3.5885984897613525, + "learning_rate": 4.2911960233029745e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8555623233318329, + "num_tokens": 134806055.0, + "step": 112120 + }, + { + "entropy": 1.817148557305336, + "epoch": 0.3475931891822353, + "grad_norm": 9.98969841003418, + "learning_rate": 4.291004668180616e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8531207337975502, + "num_tokens": 134817396.0, + "step": 112130 + }, + { + "entropy": 1.916732382774353, + "epoch": 0.34762418830728503, + "grad_norm": 9.270211219787598, + "learning_rate": 4.290813338654979e-06, + "loss": 0.5333, + "mean_token_accuracy": 0.8368429586291313, + "num_tokens": 134828575.0, + "step": 112140 + }, + { + "entropy": 1.8983269765973092, + "epoch": 0.3476551874323347, + "grad_norm": 8.712531089782715, + "learning_rate": 4.2906220347203585e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8402269974350929, + "num_tokens": 134840282.0, + "step": 112150 + }, + { + "entropy": 1.8531476333737373, + "epoch": 0.3476861865573844, + "grad_norm": 8.996459007263184, + "learning_rate": 4.290430756371049e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8387810304760933, + "num_tokens": 134852480.0, + "step": 112160 + }, + { + "entropy": 1.9212437227368355, + "epoch": 0.3477171856824341, + "grad_norm": 7.855039119720459, + "learning_rate": 4.290239503601349e-06, + "loss": 0.5461, + "mean_token_accuracy": 0.8299373790621758, + "num_tokens": 134864007.0, + "step": 112170 + }, + { + "entropy": 1.8462037175893784, + "epoch": 0.3477481848074838, + "grad_norm": 8.122658729553223, + "learning_rate": 4.290048276405558e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8515694797039032, + "num_tokens": 134874756.0, + "step": 112180 + }, + { + "entropy": 1.8744586423039435, + "epoch": 0.3477791839325335, + "grad_norm": 8.11306381225586, + "learning_rate": 4.289857074777977e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8434885591268539, + "num_tokens": 134886296.0, + "step": 112190 + }, + { + "entropy": 1.8067267432808876, + "epoch": 0.3478101830575832, + "grad_norm": 4.103103160858154, + "learning_rate": 4.289665898712908e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8561098381876946, + "num_tokens": 134898467.0, + "step": 112200 + }, + { + "entropy": 1.9343467682600022, + "epoch": 0.3478411821826329, + "grad_norm": 6.8563923835754395, + "learning_rate": 4.289474748204655e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.8526286870241165, + "num_tokens": 134909189.0, + "step": 112210 + }, + { + "entropy": 1.8394694164395333, + "epoch": 0.3478721813076826, + "grad_norm": 12.269657135009766, + "learning_rate": 4.289283623247527e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8569044172763824, + "num_tokens": 134920985.0, + "step": 112220 + }, + { + "entropy": 1.7924383148550986, + "epoch": 0.3479031804327323, + "grad_norm": 3.9235920906066895, + "learning_rate": 4.289092523835829e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8516478061676025, + "num_tokens": 134934467.0, + "step": 112230 + }, + { + "entropy": 1.9101425260305405, + "epoch": 0.347934179557782, + "grad_norm": 8.595767974853516, + "learning_rate": 4.288901449963873e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8431054592132569, + "num_tokens": 134945129.0, + "step": 112240 + }, + { + "entropy": 1.8743219137191773, + "epoch": 0.34796517868283167, + "grad_norm": 8.251272201538086, + "learning_rate": 4.288710401625969e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8410423472523689, + "num_tokens": 134957099.0, + "step": 112250 + }, + { + "entropy": 1.8412858352065087, + "epoch": 0.3479961778078814, + "grad_norm": 7.280477046966553, + "learning_rate": 4.2885193788164325e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8605325534939766, + "num_tokens": 134969309.0, + "step": 112260 + }, + { + "entropy": 1.8739237666130066, + "epoch": 0.34802717693293106, + "grad_norm": 7.483038902282715, + "learning_rate": 4.288328381529578e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8409060567617417, + "num_tokens": 134980937.0, + "step": 112270 + }, + { + "entropy": 1.7966715544462204, + "epoch": 0.3480581760579808, + "grad_norm": 3.68406081199646, + "learning_rate": 4.288137409759721e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8603166654706002, + "num_tokens": 134993822.0, + "step": 112280 + }, + { + "entropy": 1.9044744417071342, + "epoch": 0.34808917518303045, + "grad_norm": 9.015130043029785, + "learning_rate": 4.287946463501182e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8462210029363633, + "num_tokens": 135005423.0, + "step": 112290 + }, + { + "entropy": 1.8514644920825958, + "epoch": 0.3481201743080802, + "grad_norm": 4.599631309509277, + "learning_rate": 4.287755542748281e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8454376772046089, + "num_tokens": 135017386.0, + "step": 112300 + }, + { + "entropy": 1.8641838699579238, + "epoch": 0.34815117343312985, + "grad_norm": 9.161344528198242, + "learning_rate": 4.287564647495341e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.841671122610569, + "num_tokens": 135029362.0, + "step": 112310 + }, + { + "entropy": 1.8425541408360004, + "epoch": 0.34818217255817957, + "grad_norm": 8.259307861328125, + "learning_rate": 4.287373777736684e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8496177807450295, + "num_tokens": 135042307.0, + "step": 112320 + }, + { + "entropy": 1.8292279705405234, + "epoch": 0.34821317168322924, + "grad_norm": 7.943572044372559, + "learning_rate": 4.287182933466639e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8518629640340805, + "num_tokens": 135054362.0, + "step": 112330 + }, + { + "entropy": 1.7942394033074378, + "epoch": 0.34824417080827896, + "grad_norm": 3.5839719772338867, + "learning_rate": 4.286992114679531e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.858057115972042, + "num_tokens": 135067150.0, + "step": 112340 + }, + { + "entropy": 1.9045852065086364, + "epoch": 0.34827516993332863, + "grad_norm": 7.259175777435303, + "learning_rate": 4.286801321369691e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8487705394625664, + "num_tokens": 135078189.0, + "step": 112350 + }, + { + "entropy": 1.8571503296494485, + "epoch": 0.3483061690583783, + "grad_norm": 9.252874374389648, + "learning_rate": 4.286610553531448e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8501718282699585, + "num_tokens": 135089770.0, + "step": 112360 + }, + { + "entropy": 1.8037226751446724, + "epoch": 0.348337168183428, + "grad_norm": 7.917973041534424, + "learning_rate": 4.286419811159137e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8473261296749115, + "num_tokens": 135102849.0, + "step": 112370 + }, + { + "entropy": 1.8692201957106591, + "epoch": 0.3483681673084777, + "grad_norm": 4.2562575340271, + "learning_rate": 4.286229094247093e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8484464436769485, + "num_tokens": 135114862.0, + "step": 112380 + }, + { + "entropy": 1.7813385412096978, + "epoch": 0.3483991664335274, + "grad_norm": 8.083758354187012, + "learning_rate": 4.28603840278965e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8533005833625793, + "num_tokens": 135127643.0, + "step": 112390 + }, + { + "entropy": 1.7049074426293374, + "epoch": 0.3484301655585771, + "grad_norm": 4.437235355377197, + "learning_rate": 4.285847736781148e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.871871954202652, + "num_tokens": 135141468.0, + "step": 112400 + }, + { + "entropy": 1.8399584576487542, + "epoch": 0.3484611646836268, + "grad_norm": 7.485554218292236, + "learning_rate": 4.285657096215928e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.850048904120922, + "num_tokens": 135153861.0, + "step": 112410 + }, + { + "entropy": 1.8359861955046655, + "epoch": 0.3484921638086765, + "grad_norm": 4.102065563201904, + "learning_rate": 4.285466481088329e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8472228407859802, + "num_tokens": 135166609.0, + "step": 112420 + }, + { + "entropy": 1.8454686462879182, + "epoch": 0.3485231629337262, + "grad_norm": 7.877338409423828, + "learning_rate": 4.2852758913926965e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.849373996257782, + "num_tokens": 135179113.0, + "step": 112430 + }, + { + "entropy": 1.8713324323296547, + "epoch": 0.3485541620587759, + "grad_norm": 9.064533233642578, + "learning_rate": 4.285085327123374e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8380694881081581, + "num_tokens": 135191714.0, + "step": 112440 + }, + { + "entropy": 1.8374795719981194, + "epoch": 0.3485851611838256, + "grad_norm": 8.24960994720459, + "learning_rate": 4.284894788274712e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8538482531905174, + "num_tokens": 135203618.0, + "step": 112450 + }, + { + "entropy": 1.9125969141721726, + "epoch": 0.34861616030887527, + "grad_norm": 7.1157002449035645, + "learning_rate": 4.284704274841055e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8587391778826714, + "num_tokens": 135214093.0, + "step": 112460 + }, + { + "entropy": 1.7782380178570747, + "epoch": 0.348647159433925, + "grad_norm": 6.913045406341553, + "learning_rate": 4.284513786816757e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.865745086967945, + "num_tokens": 135227964.0, + "step": 112470 + }, + { + "entropy": 1.8960969477891922, + "epoch": 0.34867815855897466, + "grad_norm": 8.845172882080078, + "learning_rate": 4.284323324196168e-06, + "loss": 0.5353, + "mean_token_accuracy": 0.8316770166158676, + "num_tokens": 135238940.0, + "step": 112480 + }, + { + "entropy": 1.8000614181160928, + "epoch": 0.3487091576840244, + "grad_norm": 9.656888008117676, + "learning_rate": 4.284132886973643e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8554758876562119, + "num_tokens": 135251553.0, + "step": 112490 + }, + { + "entropy": 1.8248675152659417, + "epoch": 0.34874015680907405, + "grad_norm": 8.482026100158691, + "learning_rate": 4.283942475143537e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8560914367437362, + "num_tokens": 135263125.0, + "step": 112500 + }, + { + "entropy": 1.830253078043461, + "epoch": 0.3487711559341238, + "grad_norm": 7.03892183303833, + "learning_rate": 4.283752088700209e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8495120465755462, + "num_tokens": 135275581.0, + "step": 112510 + }, + { + "entropy": 1.840060842782259, + "epoch": 0.34880215505917345, + "grad_norm": 8.70134449005127, + "learning_rate": 4.283561727638018e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8449655011296272, + "num_tokens": 135287871.0, + "step": 112520 + }, + { + "entropy": 1.7875398755073548, + "epoch": 0.34883315418422317, + "grad_norm": 8.763019561767578, + "learning_rate": 4.283371391951324e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.846840138733387, + "num_tokens": 135300518.0, + "step": 112530 + }, + { + "entropy": 1.906669056415558, + "epoch": 0.34886415330927284, + "grad_norm": 8.401066780090332, + "learning_rate": 4.2831810816344906e-06, + "loss": 0.5297, + "mean_token_accuracy": 0.8414530888199806, + "num_tokens": 135311327.0, + "step": 112540 + }, + { + "entropy": 1.847528837621212, + "epoch": 0.34889515243432256, + "grad_norm": 9.117242813110352, + "learning_rate": 4.282990796681881e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8574508696794509, + "num_tokens": 135322556.0, + "step": 112550 + }, + { + "entropy": 1.8085899502038956, + "epoch": 0.34892615155937223, + "grad_norm": 8.368361473083496, + "learning_rate": 4.282800537087866e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8587646409869194, + "num_tokens": 135335243.0, + "step": 112560 + }, + { + "entropy": 1.8577345311641693, + "epoch": 0.34895715068442196, + "grad_norm": 8.698378562927246, + "learning_rate": 4.282610302846807e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8500236317515373, + "num_tokens": 135346757.0, + "step": 112570 + }, + { + "entropy": 1.8168217539787292, + "epoch": 0.3489881498094716, + "grad_norm": 9.063081741333008, + "learning_rate": 4.2824200939530796e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8485912069678306, + "num_tokens": 135359508.0, + "step": 112580 + }, + { + "entropy": 1.8818160638213157, + "epoch": 0.34901914893452135, + "grad_norm": 9.063243865966797, + "learning_rate": 4.282229910401052e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.8384617939591408, + "num_tokens": 135371143.0, + "step": 112590 + }, + { + "entropy": 1.908516588807106, + "epoch": 0.349050148059571, + "grad_norm": 7.1072282791137695, + "learning_rate": 4.282039752185099e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8510788649320602, + "num_tokens": 135382304.0, + "step": 112600 + }, + { + "entropy": 1.9243893340229987, + "epoch": 0.3490811471846207, + "grad_norm": 8.661938667297363, + "learning_rate": 4.2818496192995955e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8423077017068863, + "num_tokens": 135393382.0, + "step": 112610 + }, + { + "entropy": 1.9478312745690345, + "epoch": 0.3491121463096704, + "grad_norm": 8.19333267211914, + "learning_rate": 4.281659511738918e-06, + "loss": 0.545, + "mean_token_accuracy": 0.8402326017618179, + "num_tokens": 135404664.0, + "step": 112620 + }, + { + "entropy": 1.9101561456918716, + "epoch": 0.3491431454347201, + "grad_norm": 11.05017375946045, + "learning_rate": 4.281469429497445e-06, + "loss": 0.5287, + "mean_token_accuracy": 0.8389782354235649, + "num_tokens": 135416146.0, + "step": 112630 + }, + { + "entropy": 1.8043391317129136, + "epoch": 0.3491741445597698, + "grad_norm": 9.59979248046875, + "learning_rate": 4.281279372569557e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8521313741803169, + "num_tokens": 135428371.0, + "step": 112640 + }, + { + "entropy": 1.822686144709587, + "epoch": 0.3492051436848195, + "grad_norm": 9.630875587463379, + "learning_rate": 4.281089340949636e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8514738500118255, + "num_tokens": 135440601.0, + "step": 112650 + }, + { + "entropy": 1.844109094142914, + "epoch": 0.3492361428098692, + "grad_norm": 3.6397085189819336, + "learning_rate": 4.280899334632067e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8515435203909874, + "num_tokens": 135452180.0, + "step": 112660 + }, + { + "entropy": 1.7884308710694312, + "epoch": 0.34926714193491887, + "grad_norm": 8.593812942504883, + "learning_rate": 4.280709353611234e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8537843570113182, + "num_tokens": 135465795.0, + "step": 112670 + }, + { + "entropy": 1.7283026084303856, + "epoch": 0.3492981410599686, + "grad_norm": 8.442497253417969, + "learning_rate": 4.280519397881524e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8640974462032318, + "num_tokens": 135478919.0, + "step": 112680 + }, + { + "entropy": 1.8432815566658973, + "epoch": 0.34932914018501826, + "grad_norm": 8.06016731262207, + "learning_rate": 4.280329467437327e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8606173783540726, + "num_tokens": 135490424.0, + "step": 112690 + }, + { + "entropy": 1.821248809993267, + "epoch": 0.349360139310068, + "grad_norm": 7.616511821746826, + "learning_rate": 4.280139562273034e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.847284109890461, + "num_tokens": 135503303.0, + "step": 112700 + }, + { + "entropy": 1.8346032842993736, + "epoch": 0.34939113843511765, + "grad_norm": 8.359513282775879, + "learning_rate": 4.279949682383039e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8445788651704789, + "num_tokens": 135514959.0, + "step": 112710 + }, + { + "entropy": 1.8606849431991577, + "epoch": 0.3494221375601674, + "grad_norm": 9.360651016235352, + "learning_rate": 4.279759827761733e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8447454944252968, + "num_tokens": 135525909.0, + "step": 112720 + }, + { + "entropy": 1.8562923952937127, + "epoch": 0.34945313668521705, + "grad_norm": 7.43868350982666, + "learning_rate": 4.279569998403512e-06, + "loss": 0.5419, + "mean_token_accuracy": 0.8330693423748017, + "num_tokens": 135537774.0, + "step": 112730 + }, + { + "entropy": 1.7586598336696624, + "epoch": 0.34948413581026677, + "grad_norm": 4.424074649810791, + "learning_rate": 4.279380194302777e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8576010316610336, + "num_tokens": 135550831.0, + "step": 112740 + }, + { + "entropy": 1.7588186517357827, + "epoch": 0.34951513493531644, + "grad_norm": 8.668641090393066, + "learning_rate": 4.279190415453926e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8586042076349258, + "num_tokens": 135564325.0, + "step": 112750 + }, + { + "entropy": 1.814384798705578, + "epoch": 0.34954613406036616, + "grad_norm": 8.124059677124023, + "learning_rate": 4.279000661851359e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8556953683495522, + "num_tokens": 135576775.0, + "step": 112760 + }, + { + "entropy": 1.8137044474482535, + "epoch": 0.34957713318541583, + "grad_norm": 9.291952133178711, + "learning_rate": 4.278810933489481e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8497354611754417, + "num_tokens": 135589390.0, + "step": 112770 + }, + { + "entropy": 1.8097498089075088, + "epoch": 0.34960813231046556, + "grad_norm": 7.762622356414795, + "learning_rate": 4.278621230362695e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8576091334223748, + "num_tokens": 135602123.0, + "step": 112780 + }, + { + "entropy": 1.9097516283392906, + "epoch": 0.3496391314355152, + "grad_norm": 8.101143836975098, + "learning_rate": 4.278431552465409e-06, + "loss": 0.522, + "mean_token_accuracy": 0.8374461770057678, + "num_tokens": 135613911.0, + "step": 112790 + }, + { + "entropy": 1.8759268373250961, + "epoch": 0.34967013056056495, + "grad_norm": 8.917213439941406, + "learning_rate": 4.27824189979203e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8517908856272698, + "num_tokens": 135625727.0, + "step": 112800 + }, + { + "entropy": 1.8602227076888085, + "epoch": 0.3497011296856146, + "grad_norm": 5.823835372924805, + "learning_rate": 4.278052272336967e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8478166356682777, + "num_tokens": 135637869.0, + "step": 112810 + }, + { + "entropy": 1.921775433421135, + "epoch": 0.34973212881066434, + "grad_norm": 10.301258087158203, + "learning_rate": 4.2778626700946335e-06, + "loss": 0.5161, + "mean_token_accuracy": 0.8499249458312989, + "num_tokens": 135648405.0, + "step": 112820 + }, + { + "entropy": 1.877712282538414, + "epoch": 0.349763127935714, + "grad_norm": 7.107439994812012, + "learning_rate": 4.2776730930594425e-06, + "loss": 0.5527, + "mean_token_accuracy": 0.8331371307373047, + "num_tokens": 135660457.0, + "step": 112830 + }, + { + "entropy": 1.83882287889719, + "epoch": 0.34979412706076374, + "grad_norm": 12.444573402404785, + "learning_rate": 4.277483541225809e-06, + "loss": 0.5465, + "mean_token_accuracy": 0.8394935175776481, + "num_tokens": 135672611.0, + "step": 112840 + }, + { + "entropy": 1.8228483900427819, + "epoch": 0.3498251261858134, + "grad_norm": 7.8412933349609375, + "learning_rate": 4.27729401458815e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8546199440956116, + "num_tokens": 135684136.0, + "step": 112850 + }, + { + "entropy": 1.8391767397522927, + "epoch": 0.3498561253108631, + "grad_norm": 8.248656272888184, + "learning_rate": 4.2771045131408834e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8546992540359497, + "num_tokens": 135696190.0, + "step": 112860 + }, + { + "entropy": 1.8091487675905227, + "epoch": 0.3498871244359128, + "grad_norm": 3.6982574462890625, + "learning_rate": 4.2769150368784295e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8531347304582596, + "num_tokens": 135708890.0, + "step": 112870 + }, + { + "entropy": 1.7924383908510209, + "epoch": 0.34991812356096247, + "grad_norm": 8.61406135559082, + "learning_rate": 4.2767255857952115e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8567244291305542, + "num_tokens": 135721688.0, + "step": 112880 + }, + { + "entropy": 1.850319354236126, + "epoch": 0.3499491226860122, + "grad_norm": 7.656332015991211, + "learning_rate": 4.2765361598856534e-06, + "loss": 0.52, + "mean_token_accuracy": 0.8434787318110466, + "num_tokens": 135733821.0, + "step": 112890 + }, + { + "entropy": 1.8255574628710747, + "epoch": 0.34998012181106186, + "grad_norm": 3.7060792446136475, + "learning_rate": 4.276346759144178e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8535738855600357, + "num_tokens": 135745431.0, + "step": 112900 + }, + { + "entropy": 1.8570487424731255, + "epoch": 0.3500111209361116, + "grad_norm": 8.941734313964844, + "learning_rate": 4.276157383565215e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8438050165772438, + "num_tokens": 135757174.0, + "step": 112910 + }, + { + "entropy": 1.683679609745741, + "epoch": 0.35004212006116125, + "grad_norm": 2.426884174346924, + "learning_rate": 4.2759680331431915e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8656910866498947, + "num_tokens": 135772071.0, + "step": 112920 + }, + { + "entropy": 1.8811020210385323, + "epoch": 0.350073119186211, + "grad_norm": 9.89200210571289, + "learning_rate": 4.275778707872541e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8509863570332528, + "num_tokens": 135783363.0, + "step": 112930 + }, + { + "entropy": 1.8929930627346039, + "epoch": 0.35010411831126065, + "grad_norm": 8.002644538879395, + "learning_rate": 4.2755894077476925e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.8419990673661232, + "num_tokens": 135794454.0, + "step": 112940 + }, + { + "entropy": 1.8019443243741988, + "epoch": 0.3501351174363104, + "grad_norm": 7.989790916442871, + "learning_rate": 4.275400132763083e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8515826135873794, + "num_tokens": 135806472.0, + "step": 112950 + }, + { + "entropy": 1.8456357419490814, + "epoch": 0.35016611656136004, + "grad_norm": 4.0877556800842285, + "learning_rate": 4.275210882913148e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8436346635222435, + "num_tokens": 135818253.0, + "step": 112960 + }, + { + "entropy": 1.8550100848078728, + "epoch": 0.35019711568640977, + "grad_norm": 9.118851661682129, + "learning_rate": 4.275021658192323e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8482651144266129, + "num_tokens": 135829973.0, + "step": 112970 + }, + { + "entropy": 1.8620183497667313, + "epoch": 0.35022811481145943, + "grad_norm": 12.924306869506836, + "learning_rate": 4.274832458595049e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8375033006072045, + "num_tokens": 135840885.0, + "step": 112980 + }, + { + "entropy": 1.7792857438325882, + "epoch": 0.35025911393650916, + "grad_norm": 7.036866664886475, + "learning_rate": 4.274643284115767e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.848195219039917, + "num_tokens": 135853598.0, + "step": 112990 + }, + { + "entropy": 1.8005570635199546, + "epoch": 0.3502901130615588, + "grad_norm": 8.605463027954102, + "learning_rate": 4.274454134748919e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8483079835772515, + "num_tokens": 135866796.0, + "step": 113000 + }, + { + "entropy": 1.867513844370842, + "epoch": 0.35032111218660855, + "grad_norm": 8.543618202209473, + "learning_rate": 4.2742650104889496e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8421450614929199, + "num_tokens": 135878432.0, + "step": 113010 + }, + { + "entropy": 1.7870407178997993, + "epoch": 0.3503521113116582, + "grad_norm": 8.509590148925781, + "learning_rate": 4.274075911330306e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8518823325634003, + "num_tokens": 135891001.0, + "step": 113020 + }, + { + "entropy": 1.8838384568691253, + "epoch": 0.35038311043670795, + "grad_norm": 8.782331466674805, + "learning_rate": 4.273886837267435e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8412837624549866, + "num_tokens": 135902263.0, + "step": 113030 + }, + { + "entropy": 1.7536726333200932, + "epoch": 0.3504141095617576, + "grad_norm": 4.015054702758789, + "learning_rate": 4.2736977882947855e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8617945462465286, + "num_tokens": 135915709.0, + "step": 113040 + }, + { + "entropy": 1.8604958072304725, + "epoch": 0.35044510868680734, + "grad_norm": 7.668582916259766, + "learning_rate": 4.273508764406811e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8354014694690705, + "num_tokens": 135928153.0, + "step": 113050 + }, + { + "entropy": 1.7598976030945779, + "epoch": 0.350476107811857, + "grad_norm": 10.459297180175781, + "learning_rate": 4.273319765597964e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8666064769029618, + "num_tokens": 135940924.0, + "step": 113060 + }, + { + "entropy": 1.925863367319107, + "epoch": 0.35050710693690673, + "grad_norm": 4.1394853591918945, + "learning_rate": 4.273130791862697e-06, + "loss": 0.5217, + "mean_token_accuracy": 0.839854308962822, + "num_tokens": 135952237.0, + "step": 113070 + }, + { + "entropy": 1.8460562735795976, + "epoch": 0.3505381060619564, + "grad_norm": 8.900185585021973, + "learning_rate": 4.27294184319547e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8506451055407525, + "num_tokens": 135964374.0, + "step": 113080 + }, + { + "entropy": 1.8739062339067458, + "epoch": 0.35056910518700607, + "grad_norm": Infinity, + "learning_rate": 4.272752919590739e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8531906709074975, + "num_tokens": 135974841.0, + "step": 113090 + }, + { + "entropy": 1.9152967289090157, + "epoch": 0.3506001043120558, + "grad_norm": 9.287792205810547, + "learning_rate": 4.272564021042964e-06, + "loss": 0.5305, + "mean_token_accuracy": 0.8334060996770859, + "num_tokens": 135986242.0, + "step": 113100 + }, + { + "entropy": 1.779229535162449, + "epoch": 0.35063110343710546, + "grad_norm": 8.452899932861328, + "learning_rate": 4.272375147546608e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8608766749501229, + "num_tokens": 135998810.0, + "step": 113110 + }, + { + "entropy": 1.9012344628572464, + "epoch": 0.3506621025621552, + "grad_norm": 8.408564567565918, + "learning_rate": 4.272186299096133e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8417745620012284, + "num_tokens": 136009754.0, + "step": 113120 + }, + { + "entropy": 1.8403314396739006, + "epoch": 0.35069310168720486, + "grad_norm": 10.097549438476562, + "learning_rate": 4.271997475686004e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8540255069732666, + "num_tokens": 136021001.0, + "step": 113130 + }, + { + "entropy": 1.8598705619573592, + "epoch": 0.3507241008122546, + "grad_norm": 10.163576126098633, + "learning_rate": 4.2718086773106895e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8600896030664444, + "num_tokens": 136032506.0, + "step": 113140 + }, + { + "entropy": 1.8539843708276749, + "epoch": 0.35075509993730425, + "grad_norm": 9.255050659179688, + "learning_rate": 4.271619903964656e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8301028832793236, + "num_tokens": 136044950.0, + "step": 113150 + }, + { + "entropy": 1.7757720202207565, + "epoch": 0.350786099062354, + "grad_norm": 9.801958084106445, + "learning_rate": 4.271431155642374e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8526146367192269, + "num_tokens": 136057364.0, + "step": 113160 + }, + { + "entropy": 1.818256576359272, + "epoch": 0.35081709818740364, + "grad_norm": 8.34134578704834, + "learning_rate": 4.271242432338316e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8509904339909553, + "num_tokens": 136069897.0, + "step": 113170 + }, + { + "entropy": 1.7901548087596892, + "epoch": 0.35084809731245337, + "grad_norm": 10.111919403076172, + "learning_rate": 4.271053734046957e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8516396522521973, + "num_tokens": 136081662.0, + "step": 113180 + }, + { + "entropy": 1.8030595824122428, + "epoch": 0.35087909643750304, + "grad_norm": 8.223822593688965, + "learning_rate": 4.270865060762769e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8596414580941201, + "num_tokens": 136094054.0, + "step": 113190 + }, + { + "entropy": 1.8742952436208724, + "epoch": 0.35091009556255276, + "grad_norm": 7.283956050872803, + "learning_rate": 4.270676412480232e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8499576464295387, + "num_tokens": 136104584.0, + "step": 113200 + }, + { + "entropy": 1.8488594472408295, + "epoch": 0.35094109468760243, + "grad_norm": 4.605078220367432, + "learning_rate": 4.270487789193823e-06, + "loss": 0.5479, + "mean_token_accuracy": 0.829779888689518, + "num_tokens": 136116396.0, + "step": 113210 + }, + { + "entropy": 1.7764787435531617, + "epoch": 0.35097209381265215, + "grad_norm": 4.046032905578613, + "learning_rate": 4.270299190898024e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8495251774787903, + "num_tokens": 136129027.0, + "step": 113220 + }, + { + "entropy": 1.8619268134236335, + "epoch": 0.3510030929377018, + "grad_norm": 7.183846473693848, + "learning_rate": 4.2701106175873156e-06, + "loss": 0.5151, + "mean_token_accuracy": 0.8402719959616661, + "num_tokens": 136141038.0, + "step": 113230 + }, + { + "entropy": 1.8102800309658051, + "epoch": 0.35103409206275155, + "grad_norm": 3.5368010997772217, + "learning_rate": 4.2699220692561825e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8623706936836243, + "num_tokens": 136153341.0, + "step": 113240 + }, + { + "entropy": 1.9274267673492431, + "epoch": 0.3510650911878012, + "grad_norm": 6.329407215118408, + "learning_rate": 4.26973354589911e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8464403167366982, + "num_tokens": 136164572.0, + "step": 113250 + }, + { + "entropy": 1.859284907579422, + "epoch": 0.35109609031285094, + "grad_norm": 9.37317943572998, + "learning_rate": 4.2695450475105855e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8453105211257934, + "num_tokens": 136176202.0, + "step": 113260 + }, + { + "entropy": 1.8950539276003837, + "epoch": 0.3511270894379006, + "grad_norm": 8.385150909423828, + "learning_rate": 4.269356574085098e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8488032355904579, + "num_tokens": 136187307.0, + "step": 113270 + }, + { + "entropy": 1.844504640996456, + "epoch": 0.35115808856295033, + "grad_norm": 3.8115599155426025, + "learning_rate": 4.269168125617139e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8605356469750405, + "num_tokens": 136198798.0, + "step": 113280 + }, + { + "entropy": 1.8304932430386542, + "epoch": 0.351189087688, + "grad_norm": 6.604245185852051, + "learning_rate": 4.2689797021012e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8543907523155212, + "num_tokens": 136211146.0, + "step": 113290 + }, + { + "entropy": 1.7669322073459626, + "epoch": 0.3512200868130497, + "grad_norm": 9.710320472717285, + "learning_rate": 4.268791303531774e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8585976973176003, + "num_tokens": 136223754.0, + "step": 113300 + }, + { + "entropy": 1.8966532975435257, + "epoch": 0.3512510859380994, + "grad_norm": 7.919690132141113, + "learning_rate": 4.268602929903359e-06, + "loss": 0.5226, + "mean_token_accuracy": 0.8372878462076188, + "num_tokens": 136234793.0, + "step": 113310 + }, + { + "entropy": 1.8168953225016593, + "epoch": 0.3512820850631491, + "grad_norm": 9.95252513885498, + "learning_rate": 4.26841458121045e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8458817571401596, + "num_tokens": 136247165.0, + "step": 113320 + }, + { + "entropy": 1.8963631451129914, + "epoch": 0.3513130841881988, + "grad_norm": 8.91224193572998, + "learning_rate": 4.26822625744755e-06, + "loss": 0.5239, + "mean_token_accuracy": 0.83456601947546, + "num_tokens": 136259113.0, + "step": 113330 + }, + { + "entropy": 1.7711772859096526, + "epoch": 0.35134408331324846, + "grad_norm": 8.007668495178223, + "learning_rate": 4.268037958609155e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8632699027657509, + "num_tokens": 136272310.0, + "step": 113340 + }, + { + "entropy": 1.9127058327198028, + "epoch": 0.3513750824382982, + "grad_norm": 8.543416023254395, + "learning_rate": 4.267849684689771e-06, + "loss": 0.5151, + "mean_token_accuracy": 0.8420788779854774, + "num_tokens": 136283081.0, + "step": 113350 + }, + { + "entropy": 1.8306966736912726, + "epoch": 0.35140608156334785, + "grad_norm": 7.711248397827148, + "learning_rate": 4.267661435683903e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8500320836901665, + "num_tokens": 136295002.0, + "step": 113360 + }, + { + "entropy": 1.9049995988607407, + "epoch": 0.3514370806883976, + "grad_norm": 9.507410049438477, + "learning_rate": 4.267473211586053e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.8459584578871727, + "num_tokens": 136305515.0, + "step": 113370 + }, + { + "entropy": 1.91897811293602, + "epoch": 0.35146807981344724, + "grad_norm": 8.550996780395508, + "learning_rate": 4.267285012390732e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.8397913321852684, + "num_tokens": 136316379.0, + "step": 113380 + }, + { + "entropy": 1.837944434583187, + "epoch": 0.35149907893849697, + "grad_norm": 9.132309913635254, + "learning_rate": 4.267096838092449e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8522770449519157, + "num_tokens": 136328473.0, + "step": 113390 + }, + { + "entropy": 1.8238879337906837, + "epoch": 0.35153007806354664, + "grad_norm": 8.676041603088379, + "learning_rate": 4.266908688685714e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8456574112176896, + "num_tokens": 136340510.0, + "step": 113400 + }, + { + "entropy": 1.8029537141323089, + "epoch": 0.35156107718859636, + "grad_norm": 8.039926528930664, + "learning_rate": 4.26672056416504e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8499104723334312, + "num_tokens": 136352921.0, + "step": 113410 + }, + { + "entropy": 1.8220291912555695, + "epoch": 0.35159207631364603, + "grad_norm": 3.9269118309020996, + "learning_rate": 4.2665324645249425e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8560845017433166, + "num_tokens": 136365546.0, + "step": 113420 + }, + { + "entropy": 1.8378960967063904, + "epoch": 0.35162307543869575, + "grad_norm": 7.433810234069824, + "learning_rate": 4.266344389759935e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8511560037732124, + "num_tokens": 136377851.0, + "step": 113430 + }, + { + "entropy": 1.838972160220146, + "epoch": 0.3516540745637454, + "grad_norm": 8.42202091217041, + "learning_rate": 4.2661563398645395e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8631669625639915, + "num_tokens": 136389500.0, + "step": 113440 + }, + { + "entropy": 1.9216330140829085, + "epoch": 0.35168507368879515, + "grad_norm": 8.987174987792969, + "learning_rate": 4.2659683148332716e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.8396774441003799, + "num_tokens": 136400087.0, + "step": 113450 + }, + { + "entropy": 1.814877037703991, + "epoch": 0.3517160728138448, + "grad_norm": 4.2126688957214355, + "learning_rate": 4.265780314660655e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8484560027718544, + "num_tokens": 136412750.0, + "step": 113460 + }, + { + "entropy": 1.675320317596197, + "epoch": 0.35174707193889454, + "grad_norm": 7.873843669891357, + "learning_rate": 4.265592339341211e-06, + "loss": 0.374, + "mean_token_accuracy": 0.861806321144104, + "num_tokens": 136428096.0, + "step": 113470 + }, + { + "entropy": 1.7690242916345595, + "epoch": 0.3517780710639442, + "grad_norm": 8.084175109863281, + "learning_rate": 4.265404388869465e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8658703714609146, + "num_tokens": 136440575.0, + "step": 113480 + }, + { + "entropy": 1.853781743347645, + "epoch": 0.35180907018899393, + "grad_norm": 3.831251621246338, + "learning_rate": 4.265216463239944e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8527114719152451, + "num_tokens": 136452228.0, + "step": 113490 + }, + { + "entropy": 1.7241961359977722, + "epoch": 0.3518400693140436, + "grad_norm": 3.6539437770843506, + "learning_rate": 4.265028562447174e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8586121633648872, + "num_tokens": 136465487.0, + "step": 113500 + }, + { + "entropy": 1.763987348973751, + "epoch": 0.3518710684390933, + "grad_norm": 9.397258758544922, + "learning_rate": 4.264840686485687e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8535782873630524, + "num_tokens": 136477731.0, + "step": 113510 + }, + { + "entropy": 1.821959725022316, + "epoch": 0.351902067564143, + "grad_norm": 3.157595634460449, + "learning_rate": 4.264652835350013e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8425436571240426, + "num_tokens": 136489410.0, + "step": 113520 + }, + { + "entropy": 1.8831856340169906, + "epoch": 0.3519330666891927, + "grad_norm": 6.910709857940674, + "learning_rate": 4.264465009034684e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8598052114248276, + "num_tokens": 136500743.0, + "step": 113530 + }, + { + "entropy": 1.9352434664964675, + "epoch": 0.3519640658142424, + "grad_norm": 11.542353630065918, + "learning_rate": 4.264277207534237e-06, + "loss": 0.5419, + "mean_token_accuracy": 0.8231759518384933, + "num_tokens": 136511423.0, + "step": 113540 + }, + { + "entropy": 1.719076856970787, + "epoch": 0.3519950649392921, + "grad_norm": 8.80350112915039, + "learning_rate": 4.264089430843207e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8623046159744263, + "num_tokens": 136525514.0, + "step": 113550 + }, + { + "entropy": 1.8064083874225616, + "epoch": 0.3520260640643418, + "grad_norm": 9.935619354248047, + "learning_rate": 4.263901678956134e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.85721445530653, + "num_tokens": 136537364.0, + "step": 113560 + }, + { + "entropy": 1.696343556046486, + "epoch": 0.3520570631893915, + "grad_norm": 3.945061445236206, + "learning_rate": 4.263713951867554e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8635579511523247, + "num_tokens": 136551460.0, + "step": 113570 + }, + { + "entropy": 1.8621020391583443, + "epoch": 0.3520880623144412, + "grad_norm": 10.784354209899902, + "learning_rate": 4.263526249572011e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8448983728885651, + "num_tokens": 136562725.0, + "step": 113580 + }, + { + "entropy": 1.8034757658839227, + "epoch": 0.35211906143949084, + "grad_norm": 7.975634574890137, + "learning_rate": 4.263338572064049e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8480996668338776, + "num_tokens": 136574947.0, + "step": 113590 + }, + { + "entropy": 1.7639338225126266, + "epoch": 0.35215006056454057, + "grad_norm": 2.267158031463623, + "learning_rate": 4.263150919338211e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8544442996382713, + "num_tokens": 136587812.0, + "step": 113600 + }, + { + "entropy": 1.7771714597940445, + "epoch": 0.35218105968959024, + "grad_norm": 4.19359016418457, + "learning_rate": 4.262963291389045e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.851497220993042, + "num_tokens": 136601096.0, + "step": 113610 + }, + { + "entropy": 1.8166782572865485, + "epoch": 0.35221205881463996, + "grad_norm": 4.370350360870361, + "learning_rate": 4.262775688211097e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8466724216938019, + "num_tokens": 136613550.0, + "step": 113620 + }, + { + "entropy": 1.8014537960290908, + "epoch": 0.35224305793968963, + "grad_norm": 7.764684677124023, + "learning_rate": 4.262588109798919e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8574507102370262, + "num_tokens": 136626300.0, + "step": 113630 + }, + { + "entropy": 1.7783242926001548, + "epoch": 0.35227405706473935, + "grad_norm": 7.457787036895752, + "learning_rate": 4.262400556147062e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8619847133755684, + "num_tokens": 136638878.0, + "step": 113640 + }, + { + "entropy": 1.7466833159327506, + "epoch": 0.352305056189789, + "grad_norm": 4.234191417694092, + "learning_rate": 4.2622130272500775e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8527040392160415, + "num_tokens": 136652119.0, + "step": 113650 + }, + { + "entropy": 1.8685536488890648, + "epoch": 0.35233605531483875, + "grad_norm": 8.825045585632324, + "learning_rate": 4.262025523102523e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.8355319261550903, + "num_tokens": 136663383.0, + "step": 113660 + }, + { + "entropy": 1.88068146109581, + "epoch": 0.3523670544398884, + "grad_norm": 10.0654296875, + "learning_rate": 4.261838043698953e-06, + "loss": 0.5218, + "mean_token_accuracy": 0.8337692707777024, + "num_tokens": 136674512.0, + "step": 113670 + }, + { + "entropy": 1.8553010821342468, + "epoch": 0.35239805356493814, + "grad_norm": 8.771764755249023, + "learning_rate": 4.2616505890339275e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8464409559965134, + "num_tokens": 136686081.0, + "step": 113680 + }, + { + "entropy": 1.8780170038342476, + "epoch": 0.3524290526899878, + "grad_norm": 7.942883014678955, + "learning_rate": 4.261463159102005e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8412024825811386, + "num_tokens": 136697359.0, + "step": 113690 + }, + { + "entropy": 1.82506482899189, + "epoch": 0.35246005181503753, + "grad_norm": 8.61766242980957, + "learning_rate": 4.261275753897748e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8486872628331185, + "num_tokens": 136709869.0, + "step": 113700 + }, + { + "entropy": 1.8322357684373856, + "epoch": 0.3524910509400872, + "grad_norm": 3.983318328857422, + "learning_rate": 4.261088373415719e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8400922358036041, + "num_tokens": 136721994.0, + "step": 113710 + }, + { + "entropy": 1.8604241654276847, + "epoch": 0.3525220500651369, + "grad_norm": 7.393711090087891, + "learning_rate": 4.260901017650485e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8476014077663422, + "num_tokens": 136732793.0, + "step": 113720 + }, + { + "entropy": 1.8291943043470382, + "epoch": 0.3525530491901866, + "grad_norm": 8.577530860900879, + "learning_rate": 4.26071368659661e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8571088537573814, + "num_tokens": 136744850.0, + "step": 113730 + }, + { + "entropy": 1.810156986117363, + "epoch": 0.3525840483152363, + "grad_norm": 7.944915294647217, + "learning_rate": 4.260526380248662e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8527490749955178, + "num_tokens": 136756757.0, + "step": 113740 + }, + { + "entropy": 1.8879317745566369, + "epoch": 0.352615047440286, + "grad_norm": 9.205636978149414, + "learning_rate": 4.260339098601214e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8410181164741516, + "num_tokens": 136768280.0, + "step": 113750 + }, + { + "entropy": 1.8330311551690102, + "epoch": 0.3526460465653357, + "grad_norm": 4.422030448913574, + "learning_rate": 4.2601518416488344e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8590439051389694, + "num_tokens": 136779826.0, + "step": 113760 + }, + { + "entropy": 1.8848232328891754, + "epoch": 0.3526770456903854, + "grad_norm": 8.074309349060059, + "learning_rate": 4.259964609386099e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8494037300348282, + "num_tokens": 136791347.0, + "step": 113770 + }, + { + "entropy": 1.862524962425232, + "epoch": 0.3527080448154351, + "grad_norm": 9.903372764587402, + "learning_rate": 4.2597774018075815e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8448254838585854, + "num_tokens": 136803506.0, + "step": 113780 + }, + { + "entropy": 1.787963542342186, + "epoch": 0.3527390439404848, + "grad_norm": 9.371294975280762, + "learning_rate": 4.259590218907858e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8571085095405578, + "num_tokens": 136816790.0, + "step": 113790 + }, + { + "entropy": 1.8950905337929727, + "epoch": 0.3527700430655345, + "grad_norm": 4.0718159675598145, + "learning_rate": 4.259403060681509e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8508754774928093, + "num_tokens": 136827775.0, + "step": 113800 + }, + { + "entropy": 1.8704029634594916, + "epoch": 0.35280104219058417, + "grad_norm": 8.028072357177734, + "learning_rate": 4.259215927123112e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8462028250098228, + "num_tokens": 136839928.0, + "step": 113810 + }, + { + "entropy": 1.7776376932859421, + "epoch": 0.3528320413156339, + "grad_norm": 7.8810529708862305, + "learning_rate": 4.25902881822725e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8655279219150543, + "num_tokens": 136852948.0, + "step": 113820 + }, + { + "entropy": 1.886681966483593, + "epoch": 0.35286304044068356, + "grad_norm": 9.677278518676758, + "learning_rate": 4.2588417339885055e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8440056949853897, + "num_tokens": 136864663.0, + "step": 113830 + }, + { + "entropy": 1.860078476369381, + "epoch": 0.35289403956573323, + "grad_norm": 7.314884185791016, + "learning_rate": 4.258654674401464e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8441386088728905, + "num_tokens": 136875913.0, + "step": 113840 + }, + { + "entropy": 1.856418649852276, + "epoch": 0.35292503869078296, + "grad_norm": 6.926784515380859, + "learning_rate": 4.258467639460713e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8466035217046738, + "num_tokens": 136888046.0, + "step": 113850 + }, + { + "entropy": 1.8510633751749992, + "epoch": 0.3529560378158326, + "grad_norm": 8.287582397460938, + "learning_rate": 4.258280629160838e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8363550141453743, + "num_tokens": 136900496.0, + "step": 113860 + }, + { + "entropy": 1.8378751188516618, + "epoch": 0.35298703694088235, + "grad_norm": 7.225609302520752, + "learning_rate": 4.258093643496433e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8513509348034859, + "num_tokens": 136912943.0, + "step": 113870 + }, + { + "entropy": 1.8136658892035484, + "epoch": 0.353018036065932, + "grad_norm": 8.231322288513184, + "learning_rate": 4.257906682462087e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8516188710927963, + "num_tokens": 136925555.0, + "step": 113880 + }, + { + "entropy": 1.8374749287962913, + "epoch": 0.35304903519098174, + "grad_norm": 7.933830738067627, + "learning_rate": 4.257719746052393e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8512130409479142, + "num_tokens": 136937326.0, + "step": 113890 + }, + { + "entropy": 1.8850289553403854, + "epoch": 0.3530800343160314, + "grad_norm": 9.631787300109863, + "learning_rate": 4.257532834261947e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8466032981872559, + "num_tokens": 136947938.0, + "step": 113900 + }, + { + "entropy": 1.774746771156788, + "epoch": 0.35311103344108113, + "grad_norm": 3.474881172180176, + "learning_rate": 4.257345947085346e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8617874652147293, + "num_tokens": 136960564.0, + "step": 113910 + }, + { + "entropy": 1.8160261258482933, + "epoch": 0.3531420325661308, + "grad_norm": 8.455029487609863, + "learning_rate": 4.257159084517186e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8601742401719094, + "num_tokens": 136972540.0, + "step": 113920 + }, + { + "entropy": 1.8941467106342316, + "epoch": 0.35317303169118053, + "grad_norm": 4.235901355743408, + "learning_rate": 4.25697224655207e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8378585115075111, + "num_tokens": 136984312.0, + "step": 113930 + }, + { + "entropy": 1.8767295882105828, + "epoch": 0.3532040308162302, + "grad_norm": 9.515031814575195, + "learning_rate": 4.256785433184598e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.8369233354926109, + "num_tokens": 136995625.0, + "step": 113940 + }, + { + "entropy": 1.81492610424757, + "epoch": 0.3532350299412799, + "grad_norm": 4.470407485961914, + "learning_rate": 4.256598644409373e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8580437764525414, + "num_tokens": 137008213.0, + "step": 113950 + }, + { + "entropy": 1.849051834642887, + "epoch": 0.3532660290663296, + "grad_norm": 7.72014045715332, + "learning_rate": 4.2564118802210006e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8522540688514709, + "num_tokens": 137020112.0, + "step": 113960 + }, + { + "entropy": 1.8805166974663734, + "epoch": 0.3532970281913793, + "grad_norm": 4.944484710693359, + "learning_rate": 4.256225140614086e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8436195507645607, + "num_tokens": 137031603.0, + "step": 113970 + }, + { + "entropy": 1.874251839518547, + "epoch": 0.353328027316429, + "grad_norm": 8.50593090057373, + "learning_rate": 4.25603842558324e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8450019791722297, + "num_tokens": 137043366.0, + "step": 113980 + }, + { + "entropy": 1.8801655188202857, + "epoch": 0.3533590264414787, + "grad_norm": 8.969855308532715, + "learning_rate": 4.25585173512307e-06, + "loss": 0.5296, + "mean_token_accuracy": 0.8407104894518852, + "num_tokens": 137054687.0, + "step": 113990 + }, + { + "entropy": 1.8593452662229537, + "epoch": 0.3533900255665284, + "grad_norm": 8.071375846862793, + "learning_rate": 4.255665069228188e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8621448844671249, + "num_tokens": 137066396.0, + "step": 114000 + }, + { + "entropy": 1.8180650487542152, + "epoch": 0.3534210246915781, + "grad_norm": 8.56116771697998, + "learning_rate": 4.255478427893209e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8478880062699318, + "num_tokens": 137077708.0, + "step": 114010 + }, + { + "entropy": 1.815793578326702, + "epoch": 0.35345202381662777, + "grad_norm": 9.843114852905273, + "learning_rate": 4.255291811112745e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8555766463279724, + "num_tokens": 137090459.0, + "step": 114020 + }, + { + "entropy": 1.8414412125945092, + "epoch": 0.3534830229416775, + "grad_norm": 7.240721225738525, + "learning_rate": 4.255105218881416e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8408783197402954, + "num_tokens": 137101852.0, + "step": 114030 + }, + { + "entropy": 1.8683375775814057, + "epoch": 0.35351402206672716, + "grad_norm": 7.452057361602783, + "learning_rate": 4.2549186511938356e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8472641706466675, + "num_tokens": 137113521.0, + "step": 114040 + }, + { + "entropy": 1.9232849180698395, + "epoch": 0.3535450211917769, + "grad_norm": 8.561758995056152, + "learning_rate": 4.254732108044627e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8424664407968521, + "num_tokens": 137124822.0, + "step": 114050 + }, + { + "entropy": 1.8383488565683366, + "epoch": 0.35357602031682656, + "grad_norm": 10.501875877380371, + "learning_rate": 4.254545589428411e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.836837200820446, + "num_tokens": 137136758.0, + "step": 114060 + }, + { + "entropy": 1.873065246641636, + "epoch": 0.3536070194418763, + "grad_norm": 9.702674865722656, + "learning_rate": 4.254359095339811e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8503431528806686, + "num_tokens": 137148091.0, + "step": 114070 + }, + { + "entropy": 1.8993804574012756, + "epoch": 0.35363801856692595, + "grad_norm": 7.872958183288574, + "learning_rate": 4.254172625773451e-06, + "loss": 0.469, + "mean_token_accuracy": 0.849186685681343, + "num_tokens": 137159064.0, + "step": 114080 + }, + { + "entropy": 1.9097682803869247, + "epoch": 0.3536690176919756, + "grad_norm": 8.155599594116211, + "learning_rate": 4.253986180723957e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8419617161154747, + "num_tokens": 137170143.0, + "step": 114090 + }, + { + "entropy": 1.8655310958623885, + "epoch": 0.35370001681702534, + "grad_norm": 10.33460807800293, + "learning_rate": 4.253799760185957e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8362808287143707, + "num_tokens": 137181569.0, + "step": 114100 + }, + { + "entropy": 1.7499642267823219, + "epoch": 0.353731015942075, + "grad_norm": 3.961226224899292, + "learning_rate": 4.253613364154082e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8528372913599014, + "num_tokens": 137194428.0, + "step": 114110 + }, + { + "entropy": 1.8577033221721648, + "epoch": 0.35376201506712474, + "grad_norm": 8.127702713012695, + "learning_rate": 4.2534269926229625e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8494561642408371, + "num_tokens": 137206496.0, + "step": 114120 + }, + { + "entropy": 1.950427946448326, + "epoch": 0.3537930141921744, + "grad_norm": 7.7237043380737305, + "learning_rate": 4.253240645587232e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.847864530980587, + "num_tokens": 137217616.0, + "step": 114130 + }, + { + "entropy": 1.8073005706071854, + "epoch": 0.35382401331722413, + "grad_norm": 8.634578704833984, + "learning_rate": 4.253054323041525e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8615914180874824, + "num_tokens": 137229842.0, + "step": 114140 + }, + { + "entropy": 1.9044538155198096, + "epoch": 0.3538550124422738, + "grad_norm": 10.229740142822266, + "learning_rate": 4.252868024980477e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.840241327881813, + "num_tokens": 137241062.0, + "step": 114150 + }, + { + "entropy": 1.7597592651844025, + "epoch": 0.3538860115673235, + "grad_norm": 9.922557830810547, + "learning_rate": 4.252681751398727e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8564444318413734, + "num_tokens": 137254220.0, + "step": 114160 + }, + { + "entropy": 1.9016540557146073, + "epoch": 0.3539170106923732, + "grad_norm": 10.562433242797852, + "learning_rate": 4.252495502290913e-06, + "loss": 0.5396, + "mean_token_accuracy": 0.832036292552948, + "num_tokens": 137266517.0, + "step": 114170 + }, + { + "entropy": 1.9116821497678758, + "epoch": 0.3539480098174229, + "grad_norm": 8.220035552978516, + "learning_rate": 4.252309277651677e-06, + "loss": 0.511, + "mean_token_accuracy": 0.8397183999419212, + "num_tokens": 137277540.0, + "step": 114180 + }, + { + "entropy": 1.8361477851867676, + "epoch": 0.3539790089424726, + "grad_norm": 8.83745288848877, + "learning_rate": 4.252123077475664e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8428474217653275, + "num_tokens": 137289242.0, + "step": 114190 + }, + { + "entropy": 1.8084876164793968, + "epoch": 0.3540100080675223, + "grad_norm": 7.766282081604004, + "learning_rate": 4.251936901757515e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8553501293063164, + "num_tokens": 137301999.0, + "step": 114200 + }, + { + "entropy": 1.8107165440917015, + "epoch": 0.354041007192572, + "grad_norm": 4.7136993408203125, + "learning_rate": 4.251750750491878e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8484808370471001, + "num_tokens": 137314968.0, + "step": 114210 + }, + { + "entropy": 1.94393610060215, + "epoch": 0.3540720063176217, + "grad_norm": 9.238444328308105, + "learning_rate": 4.2515646236734e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.8350920498371124, + "num_tokens": 137325889.0, + "step": 114220 + }, + { + "entropy": 1.782192386686802, + "epoch": 0.35410300544267137, + "grad_norm": 9.513643264770508, + "learning_rate": 4.251378521296731e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8538872867822647, + "num_tokens": 137338800.0, + "step": 114230 + }, + { + "entropy": 1.830018326640129, + "epoch": 0.3541340045677211, + "grad_norm": 7.8421759605407715, + "learning_rate": 4.251192443356523e-06, + "loss": 0.468, + "mean_token_accuracy": 0.843048295378685, + "num_tokens": 137350877.0, + "step": 114240 + }, + { + "entropy": 1.8395485177636146, + "epoch": 0.35416500369277076, + "grad_norm": 8.42922592163086, + "learning_rate": 4.251006389847427e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8559403479099273, + "num_tokens": 137362938.0, + "step": 114250 + }, + { + "entropy": 1.8116194292902947, + "epoch": 0.3541960028178205, + "grad_norm": 4.232943534851074, + "learning_rate": 4.250820360764097e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.847702045738697, + "num_tokens": 137375890.0, + "step": 114260 + }, + { + "entropy": 1.699189220368862, + "epoch": 0.35422700194287016, + "grad_norm": 6.99352502822876, + "learning_rate": 4.25063435610119e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8732634499669075, + "num_tokens": 137389460.0, + "step": 114270 + }, + { + "entropy": 1.7898903474211694, + "epoch": 0.3542580010679199, + "grad_norm": 9.566137313842773, + "learning_rate": 4.250448375853365e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8538245901465416, + "num_tokens": 137402759.0, + "step": 114280 + }, + { + "entropy": 1.8570916399359703, + "epoch": 0.35428900019296955, + "grad_norm": 3.8215625286102295, + "learning_rate": 4.250262420015279e-06, + "loss": 0.5149, + "mean_token_accuracy": 0.8420146137475968, + "num_tokens": 137414428.0, + "step": 114290 + }, + { + "entropy": 1.8959661558270455, + "epoch": 0.3543199993180193, + "grad_norm": 7.246954441070557, + "learning_rate": 4.250076488581593e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.8415970131754875, + "num_tokens": 137425829.0, + "step": 114300 + }, + { + "entropy": 1.8812722265720367, + "epoch": 0.35435099844306894, + "grad_norm": 7.860531330108643, + "learning_rate": 4.24989058154697e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8447592079639434, + "num_tokens": 137437268.0, + "step": 114310 + }, + { + "entropy": 1.8524739906191825, + "epoch": 0.35438199756811867, + "grad_norm": 8.29738712310791, + "learning_rate": 4.2497046989060754e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8458985432982444, + "num_tokens": 137449877.0, + "step": 114320 + }, + { + "entropy": 1.8565015509724616, + "epoch": 0.35441299669316834, + "grad_norm": 8.345024108886719, + "learning_rate": 4.2495188406535735e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8513003289699554, + "num_tokens": 137461527.0, + "step": 114330 + }, + { + "entropy": 1.8561938509345055, + "epoch": 0.354443995818218, + "grad_norm": 9.380087852478027, + "learning_rate": 4.249333006784131e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8521606922149658, + "num_tokens": 137473760.0, + "step": 114340 + }, + { + "entropy": 1.8369782000780106, + "epoch": 0.35447499494326773, + "grad_norm": 8.772756576538086, + "learning_rate": 4.249147197292419e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8478844106197357, + "num_tokens": 137485545.0, + "step": 114350 + }, + { + "entropy": 1.9263944923877716, + "epoch": 0.3545059940683174, + "grad_norm": 7.329461574554443, + "learning_rate": 4.248961412173107e-06, + "loss": 0.5178, + "mean_token_accuracy": 0.8424314111471176, + "num_tokens": 137496566.0, + "step": 114360 + }, + { + "entropy": 1.8391064286231995, + "epoch": 0.3545369931933671, + "grad_norm": 8.21514892578125, + "learning_rate": 4.2487756514208675e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8564809501171112, + "num_tokens": 137508491.0, + "step": 114370 + }, + { + "entropy": 1.8533219590783119, + "epoch": 0.3545679923184168, + "grad_norm": 8.260367393493652, + "learning_rate": 4.248589915030374e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8540527150034904, + "num_tokens": 137520622.0, + "step": 114380 + }, + { + "entropy": 1.9417014926671983, + "epoch": 0.3545989914434665, + "grad_norm": 8.334314346313477, + "learning_rate": 4.248404202996303e-06, + "loss": 0.5266, + "mean_token_accuracy": 0.8378186166286469, + "num_tokens": 137531376.0, + "step": 114390 + }, + { + "entropy": 1.8331923067569733, + "epoch": 0.3546299905685162, + "grad_norm": 9.672205924987793, + "learning_rate": 4.248218515313331e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8539971187710762, + "num_tokens": 137543888.0, + "step": 114400 + }, + { + "entropy": 1.8398851856589318, + "epoch": 0.3546609896935659, + "grad_norm": 8.222822189331055, + "learning_rate": 4.248032851976136e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8429914176464081, + "num_tokens": 137555582.0, + "step": 114410 + }, + { + "entropy": 1.817541065812111, + "epoch": 0.3546919888186156, + "grad_norm": 7.350632190704346, + "learning_rate": 4.2478472129794004e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8614602595567703, + "num_tokens": 137567761.0, + "step": 114420 + }, + { + "entropy": 1.8555574625730515, + "epoch": 0.3547229879436653, + "grad_norm": 4.468617916107178, + "learning_rate": 4.247661598317806e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8440041437745094, + "num_tokens": 137579603.0, + "step": 114430 + }, + { + "entropy": 1.8921550765633584, + "epoch": 0.35475398706871497, + "grad_norm": 9.684183120727539, + "learning_rate": 4.247476007986034e-06, + "loss": 0.5208, + "mean_token_accuracy": 0.8349121674895287, + "num_tokens": 137591460.0, + "step": 114440 + }, + { + "entropy": 1.9489626735448837, + "epoch": 0.3547849861937647, + "grad_norm": 7.472236633300781, + "learning_rate": 4.247290441978772e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.8391944885253906, + "num_tokens": 137602126.0, + "step": 114450 + }, + { + "entropy": 1.8100664362311363, + "epoch": 0.35481598531881436, + "grad_norm": 7.485744476318359, + "learning_rate": 4.247104900290708e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8578644886612892, + "num_tokens": 137614832.0, + "step": 114460 + }, + { + "entropy": 1.8753397777676581, + "epoch": 0.3548469844438641, + "grad_norm": 7.755117893218994, + "learning_rate": 4.246919382916528e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8451044067740441, + "num_tokens": 137626735.0, + "step": 114470 + }, + { + "entropy": 1.851782961189747, + "epoch": 0.35487798356891376, + "grad_norm": 8.35659408569336, + "learning_rate": 4.2467338898509225e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8530733227729798, + "num_tokens": 137638707.0, + "step": 114480 + }, + { + "entropy": 1.8803992569446564, + "epoch": 0.3549089826939635, + "grad_norm": 9.96238899230957, + "learning_rate": 4.2465484210885845e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8485779255628586, + "num_tokens": 137650395.0, + "step": 114490 + }, + { + "entropy": 1.8529548197984695, + "epoch": 0.35493998181901315, + "grad_norm": 7.451599597930908, + "learning_rate": 4.2463629766242074e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8496254831552505, + "num_tokens": 137661248.0, + "step": 114500 + }, + { + "entropy": 1.8691770792007447, + "epoch": 0.3549709809440629, + "grad_norm": 8.7528657913208, + "learning_rate": 4.246177556452486e-06, + "loss": 0.5205, + "mean_token_accuracy": 0.8437319055199624, + "num_tokens": 137672985.0, + "step": 114510 + }, + { + "entropy": 1.8243165865540505, + "epoch": 0.35500198006911254, + "grad_norm": 7.32008171081543, + "learning_rate": 4.245992160568117e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.841618250310421, + "num_tokens": 137684795.0, + "step": 114520 + }, + { + "entropy": 1.8710746631026267, + "epoch": 0.35503297919416227, + "grad_norm": 8.433467864990234, + "learning_rate": 4.245806788965798e-06, + "loss": 0.5185, + "mean_token_accuracy": 0.8461954742670059, + "num_tokens": 137696877.0, + "step": 114530 + }, + { + "entropy": 1.9102421700954437, + "epoch": 0.35506397831921194, + "grad_norm": 8.911306381225586, + "learning_rate": 4.245621441640229e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.8445306703448295, + "num_tokens": 137708323.0, + "step": 114540 + }, + { + "entropy": 1.704357886314392, + "epoch": 0.35509497744426166, + "grad_norm": 5.612151145935059, + "learning_rate": 4.245436118586114e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8622689947485924, + "num_tokens": 137722546.0, + "step": 114550 + }, + { + "entropy": 1.8894066840410233, + "epoch": 0.35512597656931133, + "grad_norm": 3.8784425258636475, + "learning_rate": 4.245250819798153e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8490568831562996, + "num_tokens": 137733876.0, + "step": 114560 + }, + { + "entropy": 1.7194712564349175, + "epoch": 0.355156975694361, + "grad_norm": 9.469301223754883, + "learning_rate": 4.2450655452710515e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8465792283415794, + "num_tokens": 137747364.0, + "step": 114570 + }, + { + "entropy": 1.7580340534448624, + "epoch": 0.3551879748194107, + "grad_norm": 8.43166732788086, + "learning_rate": 4.244880294999517e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8530274391174316, + "num_tokens": 137760169.0, + "step": 114580 + }, + { + "entropy": 1.798786661028862, + "epoch": 0.3552189739444604, + "grad_norm": 3.7261557579040527, + "learning_rate": 4.2446950689782575e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8502959847450257, + "num_tokens": 137772888.0, + "step": 114590 + }, + { + "entropy": 1.782324093580246, + "epoch": 0.3552499730695101, + "grad_norm": 3.953874349594116, + "learning_rate": 4.244509867201982e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8616974368691445, + "num_tokens": 137786038.0, + "step": 114600 + }, + { + "entropy": 1.8710459485650062, + "epoch": 0.3552809721945598, + "grad_norm": 7.7761101722717285, + "learning_rate": 4.244324689665401e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8453879669308663, + "num_tokens": 137797978.0, + "step": 114610 + }, + { + "entropy": 1.8599458813667298, + "epoch": 0.3553119713196095, + "grad_norm": 7.9371747970581055, + "learning_rate": 4.244139536363229e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8475207060575485, + "num_tokens": 137810153.0, + "step": 114620 + }, + { + "entropy": 1.8876738995313644, + "epoch": 0.3553429704446592, + "grad_norm": 9.171344757080078, + "learning_rate": 4.24395440729018e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8501772657036781, + "num_tokens": 137822116.0, + "step": 114630 + }, + { + "entropy": 1.91156704723835, + "epoch": 0.3553739695697089, + "grad_norm": 8.98309326171875, + "learning_rate": 4.2437693024409685e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8459622815251351, + "num_tokens": 137833476.0, + "step": 114640 + }, + { + "entropy": 1.855041791498661, + "epoch": 0.35540496869475857, + "grad_norm": 7.716914176940918, + "learning_rate": 4.243584221810315e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8591550663113594, + "num_tokens": 137845518.0, + "step": 114650 + }, + { + "entropy": 1.9073261603713036, + "epoch": 0.3554359678198083, + "grad_norm": 7.420741558074951, + "learning_rate": 4.243399165392936e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.842662687599659, + "num_tokens": 137856959.0, + "step": 114660 + }, + { + "entropy": 1.9126224562525749, + "epoch": 0.35546696694485796, + "grad_norm": 8.25358772277832, + "learning_rate": 4.243214133183554e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.8389646530151367, + "num_tokens": 137867845.0, + "step": 114670 + }, + { + "entropy": 1.908442348241806, + "epoch": 0.3554979660699077, + "grad_norm": 3.6674983501434326, + "learning_rate": 4.243029125176892e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8443290576338768, + "num_tokens": 137879689.0, + "step": 114680 + }, + { + "entropy": 1.7568098783493042, + "epoch": 0.35552896519495736, + "grad_norm": 3.361020088195801, + "learning_rate": 4.242844141367674e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8680830702185631, + "num_tokens": 137892854.0, + "step": 114690 + }, + { + "entropy": 1.8590233013033868, + "epoch": 0.3555599643200071, + "grad_norm": 8.2853422164917, + "learning_rate": 4.242659181750625e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8496688306331635, + "num_tokens": 137904751.0, + "step": 114700 + }, + { + "entropy": 1.8025938093662262, + "epoch": 0.35559096344505675, + "grad_norm": 6.7893595695495605, + "learning_rate": 4.242474246320471e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8490458041429519, + "num_tokens": 137917185.0, + "step": 114710 + }, + { + "entropy": 1.8439732417464256, + "epoch": 0.3556219625701065, + "grad_norm": 4.053720951080322, + "learning_rate": 4.2422893350719436e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8445042937994003, + "num_tokens": 137929018.0, + "step": 114720 + }, + { + "entropy": 1.8289795368909836, + "epoch": 0.35565296169515614, + "grad_norm": 7.735437393188477, + "learning_rate": 4.2421044479997735e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8526295185089111, + "num_tokens": 137941602.0, + "step": 114730 + }, + { + "entropy": 1.7583549529314042, + "epoch": 0.35568396082020587, + "grad_norm": 9.589922904968262, + "learning_rate": 4.24191958509869e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8651691839098931, + "num_tokens": 137954921.0, + "step": 114740 + }, + { + "entropy": 1.848554477095604, + "epoch": 0.35571495994525554, + "grad_norm": 8.365562438964844, + "learning_rate": 4.24173474636343e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8425380006432533, + "num_tokens": 137965949.0, + "step": 114750 + }, + { + "entropy": 1.9080881744623184, + "epoch": 0.35574595907030526, + "grad_norm": 8.790996551513672, + "learning_rate": 4.241549931788727e-06, + "loss": 0.507, + "mean_token_accuracy": 0.8438135430216789, + "num_tokens": 137976920.0, + "step": 114760 + }, + { + "entropy": 1.9018929034471512, + "epoch": 0.35577695819535493, + "grad_norm": 8.04598617553711, + "learning_rate": 4.2413651413693185e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8404053092002869, + "num_tokens": 137988280.0, + "step": 114770 + }, + { + "entropy": 1.8129638865590096, + "epoch": 0.35580795732040466, + "grad_norm": 3.7647767066955566, + "learning_rate": 4.241180375099945e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8564304232597351, + "num_tokens": 138001123.0, + "step": 114780 + }, + { + "entropy": 1.821030892431736, + "epoch": 0.3558389564454543, + "grad_norm": 3.7905266284942627, + "learning_rate": 4.240995632975342e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8544037476181984, + "num_tokens": 138012902.0, + "step": 114790 + }, + { + "entropy": 1.8608674079179763, + "epoch": 0.35586995557050405, + "grad_norm": 8.616358757019043, + "learning_rate": 4.240810914990257e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8417997151613236, + "num_tokens": 138025108.0, + "step": 114800 + }, + { + "entropy": 1.8812236726284026, + "epoch": 0.3559009546955537, + "grad_norm": 8.76230239868164, + "learning_rate": 4.240626221139429e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.8435042083263398, + "num_tokens": 138037279.0, + "step": 114810 + }, + { + "entropy": 1.7783361107110978, + "epoch": 0.3559319538206034, + "grad_norm": 3.4879698753356934, + "learning_rate": 4.240441551417605e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8596335366368294, + "num_tokens": 138050823.0, + "step": 114820 + }, + { + "entropy": 1.7827171936631203, + "epoch": 0.3559629529456531, + "grad_norm": 3.9974160194396973, + "learning_rate": 4.240256905819533e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8612391158938408, + "num_tokens": 138064142.0, + "step": 114830 + }, + { + "entropy": 1.8113099455833435, + "epoch": 0.3559939520707028, + "grad_norm": 7.9462666511535645, + "learning_rate": 4.2400722843399585e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8514809906482697, + "num_tokens": 138076703.0, + "step": 114840 + }, + { + "entropy": 1.818100643157959, + "epoch": 0.3560249511957525, + "grad_norm": 8.276450157165527, + "learning_rate": 4.2398876869736325e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8574807778000831, + "num_tokens": 138088592.0, + "step": 114850 + }, + { + "entropy": 1.899727213382721, + "epoch": 0.3560559503208022, + "grad_norm": 8.389613151550293, + "learning_rate": 4.239703113715307e-06, + "loss": 0.5211, + "mean_token_accuracy": 0.8366639405488968, + "num_tokens": 138099369.0, + "step": 114860 + }, + { + "entropy": 1.8642989337444305, + "epoch": 0.3560869494458519, + "grad_norm": 9.097145080566406, + "learning_rate": 4.239518564559734e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8484190404415131, + "num_tokens": 138111261.0, + "step": 114870 + }, + { + "entropy": 1.8235100641846658, + "epoch": 0.35611794857090157, + "grad_norm": 8.840439796447754, + "learning_rate": 4.239334039501668e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8622523456811905, + "num_tokens": 138124180.0, + "step": 114880 + }, + { + "entropy": 1.8383688524365425, + "epoch": 0.3561489476959513, + "grad_norm": 5.670360088348389, + "learning_rate": 4.2391495385358675e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8513874992728233, + "num_tokens": 138136327.0, + "step": 114890 + }, + { + "entropy": 1.8067172005772592, + "epoch": 0.35617994682100096, + "grad_norm": 9.207831382751465, + "learning_rate": 4.238965061657087e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8570899114012718, + "num_tokens": 138148730.0, + "step": 114900 + }, + { + "entropy": 1.917934286594391, + "epoch": 0.3562109459460507, + "grad_norm": 8.583123207092285, + "learning_rate": 4.238780608860088e-06, + "loss": 0.5284, + "mean_token_accuracy": 0.8423309728503228, + "num_tokens": 138159657.0, + "step": 114910 + }, + { + "entropy": 1.8137140288949012, + "epoch": 0.35624194507110035, + "grad_norm": 7.209733009338379, + "learning_rate": 4.238596180139632e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8488231867551803, + "num_tokens": 138172455.0, + "step": 114920 + }, + { + "entropy": 1.8409941777586938, + "epoch": 0.3562729441961501, + "grad_norm": 7.371655464172363, + "learning_rate": 4.238411775490481e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8461980625987053, + "num_tokens": 138184789.0, + "step": 114930 + }, + { + "entropy": 1.8432543560862542, + "epoch": 0.35630394332119975, + "grad_norm": 8.352115631103516, + "learning_rate": 4.238227394907398e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8550651222467422, + "num_tokens": 138196321.0, + "step": 114940 + }, + { + "entropy": 1.880720390379429, + "epoch": 0.35633494244624947, + "grad_norm": 7.84929084777832, + "learning_rate": 4.23804303838515e-06, + "loss": 0.515, + "mean_token_accuracy": 0.8451797798275947, + "num_tokens": 138207824.0, + "step": 114950 + }, + { + "entropy": 1.8087940603494643, + "epoch": 0.35636594157129914, + "grad_norm": 9.440030097961426, + "learning_rate": 4.237858705918504e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8487761154770851, + "num_tokens": 138220985.0, + "step": 114960 + }, + { + "entropy": 1.875385396182537, + "epoch": 0.35639694069634886, + "grad_norm": 7.9317474365234375, + "learning_rate": 4.23767439750223e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8596486851572991, + "num_tokens": 138232460.0, + "step": 114970 + }, + { + "entropy": 1.7997772373259067, + "epoch": 0.35642793982139853, + "grad_norm": 4.375933647155762, + "learning_rate": 4.237490113131097e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8406259372830391, + "num_tokens": 138246226.0, + "step": 114980 + }, + { + "entropy": 1.9087285965681076, + "epoch": 0.35645893894644826, + "grad_norm": 9.480292320251465, + "learning_rate": 4.237305852799878e-06, + "loss": 0.5354, + "mean_token_accuracy": 0.8353115454316139, + "num_tokens": 138258064.0, + "step": 114990 + }, + { + "entropy": 1.8310803532600404, + "epoch": 0.3564899380714979, + "grad_norm": 5.65321683883667, + "learning_rate": 4.237121616503348e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8590890899300575, + "num_tokens": 138271607.0, + "step": 115000 + }, + { + "entropy": 1.727584820985794, + "epoch": 0.35652093719654765, + "grad_norm": 4.688484191894531, + "learning_rate": 4.2369374042362805e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8516164928674698, + "num_tokens": 138286322.0, + "step": 115010 + }, + { + "entropy": 1.946674033999443, + "epoch": 0.3565519363215973, + "grad_norm": 7.794990062713623, + "learning_rate": 4.236753215993452e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8492588356137276, + "num_tokens": 138297078.0, + "step": 115020 + }, + { + "entropy": 1.7960212633013726, + "epoch": 0.35658293544664704, + "grad_norm": 4.166678428649902, + "learning_rate": 4.236569051769643e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8482924640178681, + "num_tokens": 138310205.0, + "step": 115030 + }, + { + "entropy": 1.8630914211273193, + "epoch": 0.3566139345716967, + "grad_norm": 7.438714504241943, + "learning_rate": 4.236384911559633e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8541238859295845, + "num_tokens": 138322264.0, + "step": 115040 + }, + { + "entropy": 1.8478296250104904, + "epoch": 0.35664493369674644, + "grad_norm": 3.994792938232422, + "learning_rate": 4.236200795358203e-06, + "loss": 0.438, + "mean_token_accuracy": 0.853163267672062, + "num_tokens": 138334432.0, + "step": 115050 + }, + { + "entropy": 1.840542307496071, + "epoch": 0.3566759328217961, + "grad_norm": 7.899695873260498, + "learning_rate": 4.2360167031601366e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8437621504068374, + "num_tokens": 138346236.0, + "step": 115060 + }, + { + "entropy": 1.9013837978243828, + "epoch": 0.3567069319468458, + "grad_norm": 8.970711708068848, + "learning_rate": 4.235832634960219e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8377508997917176, + "num_tokens": 138357432.0, + "step": 115070 + }, + { + "entropy": 1.8663888260722161, + "epoch": 0.3567379310718955, + "grad_norm": 8.907432556152344, + "learning_rate": 4.235648590753237e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8535327434539794, + "num_tokens": 138369028.0, + "step": 115080 + }, + { + "entropy": 1.8116501927375794, + "epoch": 0.35676893019694517, + "grad_norm": 7.725051403045654, + "learning_rate": 4.235464570533978e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8590543121099472, + "num_tokens": 138380629.0, + "step": 115090 + }, + { + "entropy": 1.8142176762223243, + "epoch": 0.3567999293219949, + "grad_norm": 10.798834800720215, + "learning_rate": 4.2352805742972315e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.848982447385788, + "num_tokens": 138393683.0, + "step": 115100 + }, + { + "entropy": 1.8765118628740312, + "epoch": 0.35683092844704456, + "grad_norm": 11.132994651794434, + "learning_rate": 4.23509660203779e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8448266923427582, + "num_tokens": 138405992.0, + "step": 115110 + }, + { + "entropy": 1.895536944270134, + "epoch": 0.3568619275720943, + "grad_norm": 7.782128810882568, + "learning_rate": 4.234912653750445e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8532333001494408, + "num_tokens": 138417451.0, + "step": 115120 + }, + { + "entropy": 1.851093652844429, + "epoch": 0.35689292669714395, + "grad_norm": 3.747323751449585, + "learning_rate": 4.23472872942999e-06, + "loss": 0.51, + "mean_token_accuracy": 0.8384227573871612, + "num_tokens": 138428567.0, + "step": 115130 + }, + { + "entropy": 1.9262410998344421, + "epoch": 0.3569239258221937, + "grad_norm": 8.404274940490723, + "learning_rate": 4.234544829071223e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.8368908122181893, + "num_tokens": 138439819.0, + "step": 115140 + }, + { + "entropy": 1.8113437429070474, + "epoch": 0.35695492494724335, + "grad_norm": 8.569499969482422, + "learning_rate": 4.234360952668942e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8553946748375892, + "num_tokens": 138452192.0, + "step": 115150 + }, + { + "entropy": 1.8376051411032677, + "epoch": 0.35698592407229307, + "grad_norm": 8.602355003356934, + "learning_rate": 4.2341771002179445e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8601582199335098, + "num_tokens": 138464939.0, + "step": 115160 + }, + { + "entropy": 1.7371099531650542, + "epoch": 0.35701692319734274, + "grad_norm": 8.904439926147461, + "learning_rate": 4.233993271713032e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8645968466997147, + "num_tokens": 138477993.0, + "step": 115170 + }, + { + "entropy": 1.8617297038435936, + "epoch": 0.35704792232239246, + "grad_norm": 7.874594211578369, + "learning_rate": 4.233809467149005e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.852948172390461, + "num_tokens": 138489670.0, + "step": 115180 + }, + { + "entropy": 1.8260050147771836, + "epoch": 0.35707892144744213, + "grad_norm": 3.6491997241973877, + "learning_rate": 4.23362568652067e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8457119166851044, + "num_tokens": 138501968.0, + "step": 115190 + }, + { + "entropy": 1.84189365953207, + "epoch": 0.35710992057249186, + "grad_norm": 8.175023078918457, + "learning_rate": 4.2334419298228315e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8577545329928398, + "num_tokens": 138514376.0, + "step": 115200 + }, + { + "entropy": 1.8731770426034928, + "epoch": 0.3571409196975415, + "grad_norm": 9.102734565734863, + "learning_rate": 4.2332581970502965e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8443191021680831, + "num_tokens": 138526194.0, + "step": 115210 + }, + { + "entropy": 1.8527789831161499, + "epoch": 0.35717191882259125, + "grad_norm": 7.554600238800049, + "learning_rate": 4.233074488197873e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8490279093384743, + "num_tokens": 138538253.0, + "step": 115220 + }, + { + "entropy": 1.9107401639223098, + "epoch": 0.3572029179476409, + "grad_norm": 9.52925968170166, + "learning_rate": 4.232890803260372e-06, + "loss": 0.527, + "mean_token_accuracy": 0.8357208788394928, + "num_tokens": 138549714.0, + "step": 115230 + }, + { + "entropy": 1.857765756547451, + "epoch": 0.35723391707269064, + "grad_norm": 8.061179161071777, + "learning_rate": 4.232707142232605e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8525500863790512, + "num_tokens": 138561400.0, + "step": 115240 + }, + { + "entropy": 1.852206926047802, + "epoch": 0.3572649161977403, + "grad_norm": 2.4236626625061035, + "learning_rate": 4.232523505109386e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.854184539616108, + "num_tokens": 138573545.0, + "step": 115250 + }, + { + "entropy": 1.9055633306503297, + "epoch": 0.35729591532279004, + "grad_norm": 8.553743362426758, + "learning_rate": 4.232339891885528e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8542478173971176, + "num_tokens": 138584725.0, + "step": 115260 + }, + { + "entropy": 1.917627716064453, + "epoch": 0.3573269144478397, + "grad_norm": 8.1434326171875, + "learning_rate": 4.23215630255585e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.8403222784399986, + "num_tokens": 138596131.0, + "step": 115270 + }, + { + "entropy": 1.7823843270540238, + "epoch": 0.35735791357288943, + "grad_norm": 8.837855339050293, + "learning_rate": 4.2319727371151685e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8501473888754845, + "num_tokens": 138609457.0, + "step": 115280 + }, + { + "entropy": 1.802987203001976, + "epoch": 0.3573889126979391, + "grad_norm": 8.615238189697266, + "learning_rate": 4.231789195558304e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8549714654684066, + "num_tokens": 138621968.0, + "step": 115290 + }, + { + "entropy": 1.7702410399913788, + "epoch": 0.3574199118229888, + "grad_norm": 9.068475723266602, + "learning_rate": 4.231605677880076e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8504804894328117, + "num_tokens": 138635487.0, + "step": 115300 + }, + { + "entropy": 1.7608641982078552, + "epoch": 0.3574509109480385, + "grad_norm": 3.600853204727173, + "learning_rate": 4.2314221840753095e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8591883108019829, + "num_tokens": 138649773.0, + "step": 115310 + }, + { + "entropy": 1.845149078965187, + "epoch": 0.35748191007308816, + "grad_norm": 8.9186372756958, + "learning_rate": 4.231238714138827e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8526797413825988, + "num_tokens": 138661604.0, + "step": 115320 + }, + { + "entropy": 1.7300834864377976, + "epoch": 0.3575129091981379, + "grad_norm": 3.3661861419677734, + "learning_rate": 4.231055268065456e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8640945121645928, + "num_tokens": 138675068.0, + "step": 115330 + }, + { + "entropy": 1.7648856535553932, + "epoch": 0.35754390832318755, + "grad_norm": 7.938429832458496, + "learning_rate": 4.230871845850023e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8690492272377014, + "num_tokens": 138687865.0, + "step": 115340 + }, + { + "entropy": 1.8311346605420113, + "epoch": 0.3575749074482373, + "grad_norm": 8.160514831542969, + "learning_rate": 4.230688447487358e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8510325223207473, + "num_tokens": 138700240.0, + "step": 115350 + }, + { + "entropy": 1.8927728280425071, + "epoch": 0.35760590657328695, + "grad_norm": 7.9980926513671875, + "learning_rate": 4.230505072972291e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8445928290486335, + "num_tokens": 138711415.0, + "step": 115360 + }, + { + "entropy": 1.9151074886322021, + "epoch": 0.35763690569833667, + "grad_norm": 8.005682945251465, + "learning_rate": 4.230321722299654e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.859444510936737, + "num_tokens": 138722459.0, + "step": 115370 + }, + { + "entropy": 1.8123747482895851, + "epoch": 0.35766790482338634, + "grad_norm": 3.6913387775421143, + "learning_rate": 4.23013839546428e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8595644026994705, + "num_tokens": 138734633.0, + "step": 115380 + }, + { + "entropy": 1.8173100531101227, + "epoch": 0.35769890394843606, + "grad_norm": 4.180068016052246, + "learning_rate": 4.2299550924610065e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8400069773197174, + "num_tokens": 138747367.0, + "step": 115390 + }, + { + "entropy": 1.9122267067432404, + "epoch": 0.35772990307348573, + "grad_norm": 8.708627700805664, + "learning_rate": 4.229771813284669e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8578638061881065, + "num_tokens": 138758011.0, + "step": 115400 + }, + { + "entropy": 1.8789207234978675, + "epoch": 0.35776090219853546, + "grad_norm": 8.253546714782715, + "learning_rate": 4.229588557930106e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8481252580881119, + "num_tokens": 138770045.0, + "step": 115410 + }, + { + "entropy": 1.8555074393749238, + "epoch": 0.3577919013235851, + "grad_norm": 9.735289573669434, + "learning_rate": 4.229405326392158e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.850931105017662, + "num_tokens": 138782324.0, + "step": 115420 + }, + { + "entropy": 1.857953730225563, + "epoch": 0.35782290044863485, + "grad_norm": 3.539726495742798, + "learning_rate": 4.229222118665667e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8570713594555854, + "num_tokens": 138793921.0, + "step": 115430 + }, + { + "entropy": 1.8376942738890647, + "epoch": 0.3578538995736845, + "grad_norm": 9.80213451385498, + "learning_rate": 4.229038934745475e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8545270070433617, + "num_tokens": 138806022.0, + "step": 115440 + }, + { + "entropy": 1.803570680320263, + "epoch": 0.35788489869873424, + "grad_norm": 9.964187622070312, + "learning_rate": 4.228855774626427e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8454685240983963, + "num_tokens": 138818578.0, + "step": 115450 + }, + { + "entropy": 1.7595643281936646, + "epoch": 0.3579158978237839, + "grad_norm": 8.82363224029541, + "learning_rate": 4.22867263830337e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8574300840497017, + "num_tokens": 138831318.0, + "step": 115460 + }, + { + "entropy": 1.8091361120343208, + "epoch": 0.35794689694883364, + "grad_norm": 7.342787742614746, + "learning_rate": 4.228489525771151e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8519743844866753, + "num_tokens": 138843358.0, + "step": 115470 + }, + { + "entropy": 1.878666016459465, + "epoch": 0.3579778960738833, + "grad_norm": 8.650647163391113, + "learning_rate": 4.22830643702462e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8367117345333099, + "num_tokens": 138854574.0, + "step": 115480 + }, + { + "entropy": 1.7836987793445587, + "epoch": 0.35800889519893303, + "grad_norm": 4.929512023925781, + "learning_rate": 4.228123372058628e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8506701856851577, + "num_tokens": 138867932.0, + "step": 115490 + }, + { + "entropy": 1.841169884800911, + "epoch": 0.3580398943239827, + "grad_norm": 8.031444549560547, + "learning_rate": 4.227940330868028e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8463330894708634, + "num_tokens": 138880884.0, + "step": 115500 + }, + { + "entropy": 1.8855138301849366, + "epoch": 0.3580708934490324, + "grad_norm": 7.73654317855835, + "learning_rate": 4.227757313447673e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8478637263178825, + "num_tokens": 138891775.0, + "step": 115510 + }, + { + "entropy": 1.8171266317367554, + "epoch": 0.3581018925740821, + "grad_norm": 4.404027462005615, + "learning_rate": 4.2275743197924185e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8575972750782966, + "num_tokens": 138904411.0, + "step": 115520 + }, + { + "entropy": 1.8890212267637252, + "epoch": 0.3581328916991318, + "grad_norm": 8.792807579040527, + "learning_rate": 4.227391349897123e-06, + "loss": 0.5362, + "mean_token_accuracy": 0.8435036420822144, + "num_tokens": 138917174.0, + "step": 115530 + }, + { + "entropy": 1.8580753847956657, + "epoch": 0.3581638908241815, + "grad_norm": 7.962569236755371, + "learning_rate": 4.227208403756644e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8595779970288276, + "num_tokens": 138928817.0, + "step": 115540 + }, + { + "entropy": 1.79182361215353, + "epoch": 0.3581948899492312, + "grad_norm": 8.742132186889648, + "learning_rate": 4.227025481365844e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8503703057765961, + "num_tokens": 138941364.0, + "step": 115550 + }, + { + "entropy": 1.9465411305427551, + "epoch": 0.3582258890742809, + "grad_norm": 9.272126197814941, + "learning_rate": 4.226842582719583e-06, + "loss": 0.5234, + "mean_token_accuracy": 0.8437400907278061, + "num_tokens": 138952471.0, + "step": 115560 + }, + { + "entropy": 1.8139133349061012, + "epoch": 0.35825688819933055, + "grad_norm": 3.899977684020996, + "learning_rate": 4.226659707812723e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8707442149519921, + "num_tokens": 138965676.0, + "step": 115570 + }, + { + "entropy": 1.8158785462379456, + "epoch": 0.35828788732438027, + "grad_norm": 9.818337440490723, + "learning_rate": 4.2264768566401325e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8463491559028625, + "num_tokens": 138977871.0, + "step": 115580 + }, + { + "entropy": 1.764494700729847, + "epoch": 0.35831888644942994, + "grad_norm": 8.205857276916504, + "learning_rate": 4.226294029196676e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8628710359334946, + "num_tokens": 138991007.0, + "step": 115590 + }, + { + "entropy": 1.8400152049958707, + "epoch": 0.35834988557447967, + "grad_norm": 6.771747589111328, + "learning_rate": 4.226111225477222e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8476285025477409, + "num_tokens": 139002964.0, + "step": 115600 + }, + { + "entropy": 1.899264845252037, + "epoch": 0.35838088469952933, + "grad_norm": 9.301812171936035, + "learning_rate": 4.225928445476641e-06, + "loss": 0.5283, + "mean_token_accuracy": 0.8411499798297882, + "num_tokens": 139014497.0, + "step": 115610 + }, + { + "entropy": 1.861107437312603, + "epoch": 0.35841188382457906, + "grad_norm": 3.8199284076690674, + "learning_rate": 4.2257456891898015e-06, + "loss": 0.518, + "mean_token_accuracy": 0.8456135064363479, + "num_tokens": 139026732.0, + "step": 115620 + }, + { + "entropy": 1.786482220888138, + "epoch": 0.3584428829496287, + "grad_norm": 7.765440940856934, + "learning_rate": 4.2255629566115795e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8641697198152543, + "num_tokens": 139039189.0, + "step": 115630 + }, + { + "entropy": 1.8395205929875373, + "epoch": 0.35847388207467845, + "grad_norm": 9.376654624938965, + "learning_rate": 4.225380247736847e-06, + "loss": 0.515, + "mean_token_accuracy": 0.8413670718669891, + "num_tokens": 139052067.0, + "step": 115640 + }, + { + "entropy": 1.9465155333280564, + "epoch": 0.3585048811997281, + "grad_norm": 8.243704795837402, + "learning_rate": 4.225197562560482e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8416017279028892, + "num_tokens": 139062764.0, + "step": 115650 + }, + { + "entropy": 1.842594537138939, + "epoch": 0.35853588032477784, + "grad_norm": 4.1204681396484375, + "learning_rate": 4.22501490107736e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.8348633021116256, + "num_tokens": 139075044.0, + "step": 115660 + }, + { + "entropy": 1.8628608852624893, + "epoch": 0.3585668794498275, + "grad_norm": 7.374316215515137, + "learning_rate": 4.2248322632823606e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8515262484550477, + "num_tokens": 139086635.0, + "step": 115670 + }, + { + "entropy": 1.8487340614199639, + "epoch": 0.35859787857487724, + "grad_norm": 8.635334014892578, + "learning_rate": 4.224649649170366e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8458402872085571, + "num_tokens": 139098547.0, + "step": 115680 + }, + { + "entropy": 1.9221700012683869, + "epoch": 0.3586288776999269, + "grad_norm": 6.838761806488037, + "learning_rate": 4.224467058736255e-06, + "loss": 0.5657, + "mean_token_accuracy": 0.8330105841159821, + "num_tokens": 139109818.0, + "step": 115690 + }, + { + "entropy": 1.8932587698101997, + "epoch": 0.35865987682497663, + "grad_norm": 9.387928009033203, + "learning_rate": 4.224284491974914e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.8380125910043716, + "num_tokens": 139121059.0, + "step": 115700 + }, + { + "entropy": 1.9230019852519036, + "epoch": 0.3586908759500263, + "grad_norm": 8.831636428833008, + "learning_rate": 4.224101948881227e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8391887709498406, + "num_tokens": 139132889.0, + "step": 115710 + }, + { + "entropy": 1.8802688077092171, + "epoch": 0.358721875075076, + "grad_norm": 3.8806021213531494, + "learning_rate": 4.22391942945008e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8473272129893303, + "num_tokens": 139145205.0, + "step": 115720 + }, + { + "entropy": 1.9340951204299928, + "epoch": 0.3587528742001257, + "grad_norm": 8.25346851348877, + "learning_rate": 4.2237369336763625e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8459508880972862, + "num_tokens": 139156135.0, + "step": 115730 + }, + { + "entropy": 1.900227214396, + "epoch": 0.3587838733251754, + "grad_norm": 8.245087623596191, + "learning_rate": 4.223554461554964e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8462023451924324, + "num_tokens": 139167831.0, + "step": 115740 + }, + { + "entropy": 1.8334951400756836, + "epoch": 0.3588148724502251, + "grad_norm": 4.438302040100098, + "learning_rate": 4.223372013080776e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8457862615585328, + "num_tokens": 139180382.0, + "step": 115750 + }, + { + "entropy": 1.8711395308375358, + "epoch": 0.3588458715752748, + "grad_norm": 8.4141845703125, + "learning_rate": 4.223189588248691e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8420770734548568, + "num_tokens": 139192007.0, + "step": 115760 + }, + { + "entropy": 1.892673698067665, + "epoch": 0.3588768707003245, + "grad_norm": 8.64199161529541, + "learning_rate": 4.223007187053604e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8470587819814682, + "num_tokens": 139203626.0, + "step": 115770 + }, + { + "entropy": 1.8397964909672737, + "epoch": 0.3589078698253742, + "grad_norm": 8.469186782836914, + "learning_rate": 4.222824809490409e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8453930094838142, + "num_tokens": 139215766.0, + "step": 115780 + }, + { + "entropy": 1.932980051636696, + "epoch": 0.3589388689504239, + "grad_norm": 6.354795455932617, + "learning_rate": 4.2226424555540065e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8456053122878074, + "num_tokens": 139227681.0, + "step": 115790 + }, + { + "entropy": 1.8004991948604583, + "epoch": 0.3589698680754736, + "grad_norm": 6.467407703399658, + "learning_rate": 4.2224601252392935e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8630038425326347, + "num_tokens": 139239994.0, + "step": 115800 + }, + { + "entropy": 1.8661329224705696, + "epoch": 0.35900086720052327, + "grad_norm": 4.90010929107666, + "learning_rate": 4.222277818541172e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8436091437935829, + "num_tokens": 139251297.0, + "step": 115810 + }, + { + "entropy": 1.8228035122156143, + "epoch": 0.35903186632557293, + "grad_norm": 6.741776943206787, + "learning_rate": 4.2220955354545435e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8479675367474556, + "num_tokens": 139263722.0, + "step": 115820 + }, + { + "entropy": 1.796131867170334, + "epoch": 0.35906286545062266, + "grad_norm": 6.297611713409424, + "learning_rate": 4.221913275974311e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8649747148156166, + "num_tokens": 139276144.0, + "step": 115830 + }, + { + "entropy": 1.873625811934471, + "epoch": 0.35909386457567233, + "grad_norm": 7.727682590484619, + "learning_rate": 4.221731040095381e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8530295416712761, + "num_tokens": 139287760.0, + "step": 115840 + }, + { + "entropy": 1.8328562811017037, + "epoch": 0.35912486370072205, + "grad_norm": 8.941142082214355, + "learning_rate": 4.221548827812659e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8467715248465538, + "num_tokens": 139300146.0, + "step": 115850 + }, + { + "entropy": 1.8431931123137475, + "epoch": 0.3591558628257717, + "grad_norm": 8.686592102050781, + "learning_rate": 4.221366639121054e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8481633648276329, + "num_tokens": 139312174.0, + "step": 115860 + }, + { + "entropy": 1.7692224755883217, + "epoch": 0.35918686195082145, + "grad_norm": 7.942878723144531, + "learning_rate": 4.221184474015477e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8561850890517235, + "num_tokens": 139325247.0, + "step": 115870 + }, + { + "entropy": 1.8758103132247925, + "epoch": 0.3592178610758711, + "grad_norm": 8.962884902954102, + "learning_rate": 4.221002332490837e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8550149217247963, + "num_tokens": 139336232.0, + "step": 115880 + }, + { + "entropy": 1.7588933199644088, + "epoch": 0.35924886020092084, + "grad_norm": 4.053327560424805, + "learning_rate": 4.220820214542049e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8556968107819557, + "num_tokens": 139350251.0, + "step": 115890 + }, + { + "entropy": 1.8734589472413063, + "epoch": 0.3592798593259705, + "grad_norm": 9.165533065795898, + "learning_rate": 4.220638120164026e-06, + "loss": 0.5239, + "mean_token_accuracy": 0.8462863549590111, + "num_tokens": 139361764.0, + "step": 115900 + }, + { + "entropy": 1.8266181737184524, + "epoch": 0.35931085845102023, + "grad_norm": 7.9356255531311035, + "learning_rate": 4.220456049351685e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8596974804997444, + "num_tokens": 139373578.0, + "step": 115910 + }, + { + "entropy": 1.8083546802401542, + "epoch": 0.3593418575760699, + "grad_norm": 3.8307013511657715, + "learning_rate": 4.220274002099943e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.849444305896759, + "num_tokens": 139386017.0, + "step": 115920 + }, + { + "entropy": 1.8357669502496718, + "epoch": 0.3593728567011196, + "grad_norm": 3.281771421432495, + "learning_rate": 4.220091978403719e-06, + "loss": 0.5271, + "mean_token_accuracy": 0.848252309858799, + "num_tokens": 139398172.0, + "step": 115930 + }, + { + "entropy": 1.818470537662506, + "epoch": 0.3594038558261693, + "grad_norm": 2.5742039680480957, + "learning_rate": 4.219909978257934e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8499492272734642, + "num_tokens": 139410862.0, + "step": 115940 + }, + { + "entropy": 1.8105147659778595, + "epoch": 0.359434854951219, + "grad_norm": 7.754059791564941, + "learning_rate": 4.219728001657508e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8644904047250748, + "num_tokens": 139422867.0, + "step": 115950 + }, + { + "entropy": 1.8963597849011422, + "epoch": 0.3594658540762687, + "grad_norm": 9.430159568786621, + "learning_rate": 4.219546048597369e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8429613292217255, + "num_tokens": 139433810.0, + "step": 115960 + }, + { + "entropy": 1.8394680365920066, + "epoch": 0.3594968532013184, + "grad_norm": 3.8290538787841797, + "learning_rate": 4.219364119072439e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8480119571089745, + "num_tokens": 139446229.0, + "step": 115970 + }, + { + "entropy": 1.8272418528795242, + "epoch": 0.3595278523263681, + "grad_norm": 7.950870990753174, + "learning_rate": 4.219182213077646e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8526568979024887, + "num_tokens": 139458579.0, + "step": 115980 + }, + { + "entropy": 1.887735801935196, + "epoch": 0.3595588514514178, + "grad_norm": 3.933110237121582, + "learning_rate": 4.219000330607917e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8449479952454567, + "num_tokens": 139470657.0, + "step": 115990 + }, + { + "entropy": 1.9549269363284112, + "epoch": 0.3595898505764675, + "grad_norm": 9.912589073181152, + "learning_rate": 4.218818471658183e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.841597031056881, + "num_tokens": 139481544.0, + "step": 116000 + }, + { + "entropy": 1.904000848531723, + "epoch": 0.3596208497015172, + "grad_norm": 6.567648887634277, + "learning_rate": 4.218636636223375e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8507451817393303, + "num_tokens": 139492483.0, + "step": 116010 + }, + { + "entropy": 1.9398685723543168, + "epoch": 0.35965184882656687, + "grad_norm": 4.198793411254883, + "learning_rate": 4.218454824298425e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.8451578319072723, + "num_tokens": 139504258.0, + "step": 116020 + }, + { + "entropy": 1.9017282500863075, + "epoch": 0.3596828479516166, + "grad_norm": 7.781552314758301, + "learning_rate": 4.218273035878269e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8435791626572609, + "num_tokens": 139515473.0, + "step": 116030 + }, + { + "entropy": 1.9017712712287902, + "epoch": 0.35971384707666626, + "grad_norm": 8.259926795959473, + "learning_rate": 4.218091270957841e-06, + "loss": 0.5256, + "mean_token_accuracy": 0.8393255040049553, + "num_tokens": 139526776.0, + "step": 116040 + }, + { + "entropy": 1.8337740018963813, + "epoch": 0.359744846201716, + "grad_norm": 9.133225440979004, + "learning_rate": 4.21790952953208e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8523485392332077, + "num_tokens": 139539083.0, + "step": 116050 + }, + { + "entropy": 1.7753349527716638, + "epoch": 0.35977584532676565, + "grad_norm": 7.402058124542236, + "learning_rate": 4.217727811595925e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.847336596250534, + "num_tokens": 139552085.0, + "step": 116060 + }, + { + "entropy": 1.8660298094153405, + "epoch": 0.3598068444518153, + "grad_norm": 7.4676337242126465, + "learning_rate": 4.217546117144314e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8392193883657455, + "num_tokens": 139564096.0, + "step": 116070 + }, + { + "entropy": 1.8619601786136628, + "epoch": 0.35983784357686505, + "grad_norm": 9.07433795928955, + "learning_rate": 4.217364446172193e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.839654740691185, + "num_tokens": 139576414.0, + "step": 116080 + }, + { + "entropy": 1.9037289932370185, + "epoch": 0.3598688427019147, + "grad_norm": 8.008439064025879, + "learning_rate": 4.217182798674502e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8401681199669838, + "num_tokens": 139587784.0, + "step": 116090 + }, + { + "entropy": 1.8978828579187392, + "epoch": 0.35989984182696444, + "grad_norm": 8.578210830688477, + "learning_rate": 4.2170011746461886e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8408385306596756, + "num_tokens": 139599171.0, + "step": 116100 + }, + { + "entropy": 1.84117241948843, + "epoch": 0.3599308409520141, + "grad_norm": 3.883363962173462, + "learning_rate": 4.216819574082197e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8472616419196128, + "num_tokens": 139611729.0, + "step": 116110 + }, + { + "entropy": 1.8370385035872459, + "epoch": 0.35996184007706383, + "grad_norm": 4.151669979095459, + "learning_rate": 4.2166379969774775e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.855852136015892, + "num_tokens": 139623044.0, + "step": 116120 + }, + { + "entropy": 1.8387348473072052, + "epoch": 0.3599928392021135, + "grad_norm": 8.288407325744629, + "learning_rate": 4.216456443326979e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8595815673470497, + "num_tokens": 139634713.0, + "step": 116130 + }, + { + "entropy": 1.9357994318008422, + "epoch": 0.3600238383271632, + "grad_norm": 10.54449462890625, + "learning_rate": 4.216274913125652e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.8542236015200615, + "num_tokens": 139645451.0, + "step": 116140 + }, + { + "entropy": 1.8762347102165222, + "epoch": 0.3600548374522129, + "grad_norm": 8.778923034667969, + "learning_rate": 4.216093406368449e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8462166652083397, + "num_tokens": 139657312.0, + "step": 116150 + }, + { + "entropy": 1.8865218967199326, + "epoch": 0.3600858365772626, + "grad_norm": 7.668148040771484, + "learning_rate": 4.2159119230503255e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.8440534695982933, + "num_tokens": 139668202.0, + "step": 116160 + }, + { + "entropy": 1.8358159840106965, + "epoch": 0.3601168357023123, + "grad_norm": 8.311078071594238, + "learning_rate": 4.215730463166237e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8523998752236366, + "num_tokens": 139680084.0, + "step": 116170 + }, + { + "entropy": 1.925374338030815, + "epoch": 0.360147834827362, + "grad_norm": 8.066094398498535, + "learning_rate": 4.21554902671114e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.8462306708097458, + "num_tokens": 139690862.0, + "step": 116180 + }, + { + "entropy": 1.8480087623000145, + "epoch": 0.3601788339524117, + "grad_norm": 4.514954090118408, + "learning_rate": 4.2153676136799934e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8582058116793633, + "num_tokens": 139703017.0, + "step": 116190 + }, + { + "entropy": 1.7714590035378932, + "epoch": 0.3602098330774614, + "grad_norm": 2.609532356262207, + "learning_rate": 4.215186224067758e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8530898556113243, + "num_tokens": 139716890.0, + "step": 116200 + }, + { + "entropy": 1.9455319941043854, + "epoch": 0.3602408322025111, + "grad_norm": 7.525179862976074, + "learning_rate": 4.215004857869394e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.8397414952516555, + "num_tokens": 139727863.0, + "step": 116210 + }, + { + "entropy": 1.766502921283245, + "epoch": 0.3602718313275608, + "grad_norm": 8.821333885192871, + "learning_rate": 4.214823515079867e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8470347732305527, + "num_tokens": 139740638.0, + "step": 116220 + }, + { + "entropy": 1.8291141256690024, + "epoch": 0.36030283045261047, + "grad_norm": 4.927423000335693, + "learning_rate": 4.214642195694141e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8610336139798165, + "num_tokens": 139753345.0, + "step": 116230 + }, + { + "entropy": 1.8271966442465781, + "epoch": 0.3603338295776602, + "grad_norm": 9.124969482421875, + "learning_rate": 4.214460899707181e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8442619845271111, + "num_tokens": 139765345.0, + "step": 116240 + }, + { + "entropy": 1.8535583779215812, + "epoch": 0.36036482870270986, + "grad_norm": 8.4541597366333, + "learning_rate": 4.214279627113957e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8529452204704284, + "num_tokens": 139777146.0, + "step": 116250 + }, + { + "entropy": 1.8414417818188666, + "epoch": 0.3603958278277596, + "grad_norm": 3.8833229541778564, + "learning_rate": 4.214098377909436e-06, + "loss": 0.421, + "mean_token_accuracy": 0.852535355091095, + "num_tokens": 139789410.0, + "step": 116260 + }, + { + "entropy": 1.8427911520004272, + "epoch": 0.36042682695280925, + "grad_norm": 3.8969404697418213, + "learning_rate": 4.213917152088591e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8565546840429306, + "num_tokens": 139801439.0, + "step": 116270 + }, + { + "entropy": 1.7546941101551057, + "epoch": 0.360457826077859, + "grad_norm": 8.13198184967041, + "learning_rate": 4.2137359496463936e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8690331175923347, + "num_tokens": 139814865.0, + "step": 116280 + }, + { + "entropy": 1.7768574252724647, + "epoch": 0.36048882520290865, + "grad_norm": 3.7505130767822266, + "learning_rate": 4.213554770577818e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.86257985830307, + "num_tokens": 139828042.0, + "step": 116290 + }, + { + "entropy": 1.8828290045261382, + "epoch": 0.3605198243279583, + "grad_norm": 6.885351181030273, + "learning_rate": 4.213373614877838e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8605077177286148, + "num_tokens": 139838856.0, + "step": 116300 + }, + { + "entropy": 1.7729797944426537, + "epoch": 0.36055082345300804, + "grad_norm": 9.330283164978027, + "learning_rate": 4.213192482541433e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8671557918190956, + "num_tokens": 139852638.0, + "step": 116310 + }, + { + "entropy": 1.9487037807703018, + "epoch": 0.3605818225780577, + "grad_norm": 7.210131645202637, + "learning_rate": 4.21301137356358e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.8386420547962189, + "num_tokens": 139863146.0, + "step": 116320 + }, + { + "entropy": 1.9695353388786316, + "epoch": 0.36061282170310743, + "grad_norm": 9.178292274475098, + "learning_rate": 4.21283028793926e-06, + "loss": 0.52, + "mean_token_accuracy": 0.8453469514846802, + "num_tokens": 139874014.0, + "step": 116330 + }, + { + "entropy": 1.8018761366605758, + "epoch": 0.3606438208281571, + "grad_norm": 7.328160285949707, + "learning_rate": 4.212649225663452e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8488622546195984, + "num_tokens": 139886434.0, + "step": 116340 + }, + { + "entropy": 1.9044070094823837, + "epoch": 0.3606748199532068, + "grad_norm": 7.514301776885986, + "learning_rate": 4.212468186731141e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.8392976179718972, + "num_tokens": 139897996.0, + "step": 116350 + }, + { + "entropy": 1.8434274643659592, + "epoch": 0.3607058190782565, + "grad_norm": 8.972174644470215, + "learning_rate": 4.212287171137313e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8550772503018379, + "num_tokens": 139910313.0, + "step": 116360 + }, + { + "entropy": 1.803036929666996, + "epoch": 0.3607368182033062, + "grad_norm": 8.392457008361816, + "learning_rate": 4.212106178876951e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8607822299003601, + "num_tokens": 139922530.0, + "step": 116370 + }, + { + "entropy": 1.8568144857883453, + "epoch": 0.3607678173283559, + "grad_norm": 3.606626510620117, + "learning_rate": 4.211925209945044e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8569135695695878, + "num_tokens": 139933805.0, + "step": 116380 + }, + { + "entropy": 1.9102680534124374, + "epoch": 0.3607988164534056, + "grad_norm": 8.40206241607666, + "learning_rate": 4.21174426433658e-06, + "loss": 0.5215, + "mean_token_accuracy": 0.8408423826098442, + "num_tokens": 139945504.0, + "step": 116390 + }, + { + "entropy": 1.8889756113290788, + "epoch": 0.3608298155784553, + "grad_norm": 7.147616386413574, + "learning_rate": 4.21156334204655e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8580622985959053, + "num_tokens": 139956323.0, + "step": 116400 + }, + { + "entropy": 1.7778198927640916, + "epoch": 0.360860814703505, + "grad_norm": 8.543440818786621, + "learning_rate": 4.211382443069949e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8652505591511727, + "num_tokens": 139968634.0, + "step": 116410 + }, + { + "entropy": 1.877928839623928, + "epoch": 0.3608918138285547, + "grad_norm": 2.7431368827819824, + "learning_rate": 4.2112015674017645e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8467942878603936, + "num_tokens": 139981044.0, + "step": 116420 + }, + { + "entropy": 1.8687953114509583, + "epoch": 0.3609228129536044, + "grad_norm": 8.985494613647461, + "learning_rate": 4.211020715036995e-06, + "loss": 0.5351, + "mean_token_accuracy": 0.844055813550949, + "num_tokens": 139992856.0, + "step": 116430 + }, + { + "entropy": 1.8689031660556794, + "epoch": 0.36095381207865407, + "grad_norm": 9.813769340515137, + "learning_rate": 4.210839885970638e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8516811087727547, + "num_tokens": 140003419.0, + "step": 116440 + }, + { + "entropy": 1.8415801107883454, + "epoch": 0.3609848112037038, + "grad_norm": 3.827608346939087, + "learning_rate": 4.210659080197691e-06, + "loss": 0.5246, + "mean_token_accuracy": 0.8390990689396858, + "num_tokens": 140014798.0, + "step": 116450 + }, + { + "entropy": 1.847266887128353, + "epoch": 0.36101581032875346, + "grad_norm": 8.154986381530762, + "learning_rate": 4.210478297713152e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8583290934562683, + "num_tokens": 140026777.0, + "step": 116460 + }, + { + "entropy": 1.872428523004055, + "epoch": 0.3610468094538032, + "grad_norm": 9.429468154907227, + "learning_rate": 4.210297538512023e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8427754417061806, + "num_tokens": 140038193.0, + "step": 116470 + }, + { + "entropy": 1.815330520272255, + "epoch": 0.36107780857885285, + "grad_norm": 7.44341516494751, + "learning_rate": 4.210116802589307e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8459246262907982, + "num_tokens": 140049896.0, + "step": 116480 + }, + { + "entropy": 1.80218645632267, + "epoch": 0.3611088077039026, + "grad_norm": 4.414290428161621, + "learning_rate": 4.209936089940008e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8553975149989128, + "num_tokens": 140062012.0, + "step": 116490 + }, + { + "entropy": 1.8422569379210472, + "epoch": 0.36113980682895225, + "grad_norm": 7.908370494842529, + "learning_rate": 4.20975540055913e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8528754249215126, + "num_tokens": 140074154.0, + "step": 116500 + }, + { + "entropy": 1.7738358169794082, + "epoch": 0.36117080595400197, + "grad_norm": 7.086633682250977, + "learning_rate": 4.2095747344416815e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8605238318443298, + "num_tokens": 140086516.0, + "step": 116510 + }, + { + "entropy": 1.8560999408364296, + "epoch": 0.36120180507905164, + "grad_norm": 8.640069007873535, + "learning_rate": 4.209394091582671e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8428186282515526, + "num_tokens": 140098344.0, + "step": 116520 + }, + { + "entropy": 1.8515635520219802, + "epoch": 0.36123280420410137, + "grad_norm": 9.717535972595215, + "learning_rate": 4.209213471977109e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.846831089258194, + "num_tokens": 140110320.0, + "step": 116530 + }, + { + "entropy": 1.8465173259377479, + "epoch": 0.36126380332915103, + "grad_norm": 7.840369701385498, + "learning_rate": 4.209032875620006e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8543838858604431, + "num_tokens": 140122072.0, + "step": 116540 + }, + { + "entropy": 1.793773990869522, + "epoch": 0.3612948024542007, + "grad_norm": 7.085291385650635, + "learning_rate": 4.2088523025063745e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8551607668399811, + "num_tokens": 140134814.0, + "step": 116550 + }, + { + "entropy": 1.8004359588027001, + "epoch": 0.3613258015792504, + "grad_norm": 2.7206969261169434, + "learning_rate": 4.208671752631231e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8510082602500916, + "num_tokens": 140147230.0, + "step": 116560 + }, + { + "entropy": 1.838956043124199, + "epoch": 0.3613568007043001, + "grad_norm": 4.567168235778809, + "learning_rate": 4.20849122598959e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8527957811951637, + "num_tokens": 140159657.0, + "step": 116570 + }, + { + "entropy": 1.8510146543383599, + "epoch": 0.3613877998293498, + "grad_norm": 8.271007537841797, + "learning_rate": 4.208310722576469e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8527455732226372, + "num_tokens": 140171588.0, + "step": 116580 + }, + { + "entropy": 1.817806077003479, + "epoch": 0.3614187989543995, + "grad_norm": 8.602225303649902, + "learning_rate": 4.208130242386889e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8533978506922721, + "num_tokens": 140184012.0, + "step": 116590 + }, + { + "entropy": 1.8294128969311714, + "epoch": 0.3614497980794492, + "grad_norm": 7.451015949249268, + "learning_rate": 4.207949785415868e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8549151733517647, + "num_tokens": 140196072.0, + "step": 116600 + }, + { + "entropy": 1.9123011171817779, + "epoch": 0.3614807972044989, + "grad_norm": 10.21450138092041, + "learning_rate": 4.207769351658429e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8501939132809639, + "num_tokens": 140206777.0, + "step": 116610 + }, + { + "entropy": 1.898598951101303, + "epoch": 0.3615117963295486, + "grad_norm": 7.918070316314697, + "learning_rate": 4.2075889411095965e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8529366075992584, + "num_tokens": 140217957.0, + "step": 116620 + }, + { + "entropy": 1.871064305305481, + "epoch": 0.3615427954545983, + "grad_norm": 3.4128713607788086, + "learning_rate": 4.2074085537643945e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8430089339613914, + "num_tokens": 140229754.0, + "step": 116630 + }, + { + "entropy": 1.8388782501220704, + "epoch": 0.361573794579648, + "grad_norm": 3.9377996921539307, + "learning_rate": 4.207228189617849e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.855095773935318, + "num_tokens": 140242273.0, + "step": 116640 + }, + { + "entropy": 1.7982016265392304, + "epoch": 0.36160479370469767, + "grad_norm": 4.408384323120117, + "learning_rate": 4.20704784866499e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8601232185959816, + "num_tokens": 140255003.0, + "step": 116650 + }, + { + "entropy": 1.822973382472992, + "epoch": 0.3616357928297474, + "grad_norm": 9.451082229614258, + "learning_rate": 4.206867530900845e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8590276911854744, + "num_tokens": 140267479.0, + "step": 116660 + }, + { + "entropy": 1.7897161670029162, + "epoch": 0.36166679195479706, + "grad_norm": 8.063048362731934, + "learning_rate": 4.206687236320445e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8569493010640145, + "num_tokens": 140280536.0, + "step": 116670 + }, + { + "entropy": 1.8633152276277543, + "epoch": 0.3616977910798468, + "grad_norm": 7.586259365081787, + "learning_rate": 4.206506964918824e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.84778813123703, + "num_tokens": 140292788.0, + "step": 116680 + }, + { + "entropy": 1.9379972368478775, + "epoch": 0.36172879020489646, + "grad_norm": 8.198206901550293, + "learning_rate": 4.206326716691015e-06, + "loss": 0.5271, + "mean_token_accuracy": 0.8326962515711784, + "num_tokens": 140303768.0, + "step": 116690 + }, + { + "entropy": 1.8760272949934005, + "epoch": 0.3617597893299462, + "grad_norm": 7.511582851409912, + "learning_rate": 4.206146491632053e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8527100086212158, + "num_tokens": 140314927.0, + "step": 116700 + }, + { + "entropy": 1.8044881626963616, + "epoch": 0.36179078845499585, + "grad_norm": 7.948654651641846, + "learning_rate": 4.205966289736976e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8563151493668556, + "num_tokens": 140327161.0, + "step": 116710 + }, + { + "entropy": 1.9257992446422576, + "epoch": 0.3618217875800456, + "grad_norm": 9.45364761352539, + "learning_rate": 4.205786111000822e-06, + "loss": 0.5259, + "mean_token_accuracy": 0.8418830558657646, + "num_tokens": 140337921.0, + "step": 116720 + }, + { + "entropy": 1.8222204566001892, + "epoch": 0.36185278670509524, + "grad_norm": 3.7886269092559814, + "learning_rate": 4.2056059554186305e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8379009708762168, + "num_tokens": 140350153.0, + "step": 116730 + }, + { + "entropy": 1.8184516310691834, + "epoch": 0.36188378583014497, + "grad_norm": 4.151296138763428, + "learning_rate": 4.2054258229854435e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8527412965893746, + "num_tokens": 140363079.0, + "step": 116740 + }, + { + "entropy": 1.8538252338767052, + "epoch": 0.36191478495519463, + "grad_norm": 6.772287845611572, + "learning_rate": 4.205245713696304e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8451321288943291, + "num_tokens": 140375101.0, + "step": 116750 + }, + { + "entropy": 1.9226402550935746, + "epoch": 0.36194578408024436, + "grad_norm": 8.788530349731445, + "learning_rate": 4.205065627546256e-06, + "loss": 0.5342, + "mean_token_accuracy": 0.8383546307682991, + "num_tokens": 140386168.0, + "step": 116760 + }, + { + "entropy": 1.7971034452319146, + "epoch": 0.36197678320529403, + "grad_norm": 9.055315017700195, + "learning_rate": 4.204885564530345e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8534369856119156, + "num_tokens": 140399104.0, + "step": 116770 + }, + { + "entropy": 1.903681069612503, + "epoch": 0.36200778233034375, + "grad_norm": 8.377460479736328, + "learning_rate": 4.204705524643619e-06, + "loss": 0.5469, + "mean_token_accuracy": 0.8296264916658401, + "num_tokens": 140409966.0, + "step": 116780 + }, + { + "entropy": 1.8770312041044235, + "epoch": 0.3620387814553934, + "grad_norm": 7.98540735244751, + "learning_rate": 4.204525507881126e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8401692688465119, + "num_tokens": 140423005.0, + "step": 116790 + }, + { + "entropy": 1.8793371990323067, + "epoch": 0.3620697805804431, + "grad_norm": 8.905379295349121, + "learning_rate": 4.204345514237917e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8462567001581192, + "num_tokens": 140434436.0, + "step": 116800 + }, + { + "entropy": 1.8640392243862152, + "epoch": 0.3621007797054928, + "grad_norm": 4.022253036499023, + "learning_rate": 4.204165543709043e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8513988614082336, + "num_tokens": 140446710.0, + "step": 116810 + }, + { + "entropy": 1.8475000470876695, + "epoch": 0.3621317788305425, + "grad_norm": 8.355897903442383, + "learning_rate": 4.2039855962895605e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8459990754723549, + "num_tokens": 140458545.0, + "step": 116820 + }, + { + "entropy": 1.8094294220209122, + "epoch": 0.3621627779555922, + "grad_norm": 2.8843464851379395, + "learning_rate": 4.203805671974519e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8503652885556221, + "num_tokens": 140471070.0, + "step": 116830 + }, + { + "entropy": 1.8952600628137588, + "epoch": 0.3621937770806419, + "grad_norm": 7.821008682250977, + "learning_rate": 4.203625770758979e-06, + "loss": 0.5673, + "mean_token_accuracy": 0.8331520974636077, + "num_tokens": 140482920.0, + "step": 116840 + }, + { + "entropy": 1.827965249121189, + "epoch": 0.3622247762056916, + "grad_norm": 9.462203025817871, + "learning_rate": 4.2034458926379965e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.854870118200779, + "num_tokens": 140495151.0, + "step": 116850 + }, + { + "entropy": 1.8622143000364304, + "epoch": 0.36225577533074127, + "grad_norm": 8.208244323730469, + "learning_rate": 4.20326603760663e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8542701035737992, + "num_tokens": 140507333.0, + "step": 116860 + }, + { + "entropy": 1.7744985669851303, + "epoch": 0.362286774455791, + "grad_norm": 8.417732238769531, + "learning_rate": 4.2030862056599415e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8632864966988564, + "num_tokens": 140520558.0, + "step": 116870 + }, + { + "entropy": 1.8826520085334777, + "epoch": 0.36231777358084066, + "grad_norm": 8.546829223632812, + "learning_rate": 4.202906396792993e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8448991179466248, + "num_tokens": 140532773.0, + "step": 116880 + }, + { + "entropy": 1.8119245707988738, + "epoch": 0.3623487727058904, + "grad_norm": 7.6031599044799805, + "learning_rate": 4.2027266110008475e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8597419142723084, + "num_tokens": 140544815.0, + "step": 116890 + }, + { + "entropy": 1.8004948943853378, + "epoch": 0.36237977183094006, + "grad_norm": 9.985618591308594, + "learning_rate": 4.202546848278572e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8449292674660682, + "num_tokens": 140557252.0, + "step": 116900 + }, + { + "entropy": 1.8618132412433623, + "epoch": 0.3624107709559898, + "grad_norm": 6.6439290046691895, + "learning_rate": 4.2023671086212295e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8493566080927849, + "num_tokens": 140568584.0, + "step": 116910 + }, + { + "entropy": 1.7272257059812546, + "epoch": 0.36244177008103945, + "grad_norm": 5.679489612579346, + "learning_rate": 4.202187392023891e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8584482952952385, + "num_tokens": 140582578.0, + "step": 116920 + }, + { + "entropy": 1.9361666440963745, + "epoch": 0.3624727692060892, + "grad_norm": 7.492152690887451, + "learning_rate": 4.202007698481626e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8505539804697037, + "num_tokens": 140593156.0, + "step": 116930 + }, + { + "entropy": 1.9066250026226044, + "epoch": 0.36250376833113884, + "grad_norm": 9.594344139099121, + "learning_rate": 4.201828027989504e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8503702595829964, + "num_tokens": 140604659.0, + "step": 116940 + }, + { + "entropy": 1.8485996812582015, + "epoch": 0.36253476745618857, + "grad_norm": 9.402889251708984, + "learning_rate": 4.201648380542599e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8464497730135918, + "num_tokens": 140616897.0, + "step": 116950 + }, + { + "entropy": 1.8569966793060302, + "epoch": 0.36256576658123824, + "grad_norm": 8.817802429199219, + "learning_rate": 4.201468756135983e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.839309224486351, + "num_tokens": 140628644.0, + "step": 116960 + }, + { + "entropy": 1.827142098546028, + "epoch": 0.36259676570628796, + "grad_norm": 7.352452278137207, + "learning_rate": 4.201289154764733e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8442728266119957, + "num_tokens": 140640707.0, + "step": 116970 + }, + { + "entropy": 1.807940025627613, + "epoch": 0.36262776483133763, + "grad_norm": 4.695886135101318, + "learning_rate": 4.201109576423926e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8374917089939118, + "num_tokens": 140653668.0, + "step": 116980 + }, + { + "entropy": 1.8596377670764923, + "epoch": 0.36265876395638735, + "grad_norm": 9.165081024169922, + "learning_rate": 4.20093002110864e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8492574363946914, + "num_tokens": 140665982.0, + "step": 116990 + }, + { + "entropy": 1.8663609504699707, + "epoch": 0.362689763081437, + "grad_norm": 7.703036785125732, + "learning_rate": 4.200750488813955e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8582181662321091, + "num_tokens": 140677607.0, + "step": 117000 + }, + { + "entropy": 1.808670112490654, + "epoch": 0.36272076220648675, + "grad_norm": 3.881373405456543, + "learning_rate": 4.200570979534951e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.856096900999546, + "num_tokens": 140690750.0, + "step": 117010 + }, + { + "entropy": 1.8351006895303725, + "epoch": 0.3627517613315364, + "grad_norm": 3.9870259761810303, + "learning_rate": 4.200391493266714e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8495851859450341, + "num_tokens": 140702826.0, + "step": 117020 + }, + { + "entropy": 1.8416362181305885, + "epoch": 0.36278276045658614, + "grad_norm": 7.493378639221191, + "learning_rate": 4.200212030004326e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8568256065249443, + "num_tokens": 140714882.0, + "step": 117030 + }, + { + "entropy": 1.8965220645070076, + "epoch": 0.3628137595816358, + "grad_norm": 7.964942455291748, + "learning_rate": 4.200032589742872e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8496366932988166, + "num_tokens": 140726128.0, + "step": 117040 + }, + { + "entropy": 1.835560789704323, + "epoch": 0.3628447587066855, + "grad_norm": 8.899863243103027, + "learning_rate": 4.1998531724774405e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8567850336432457, + "num_tokens": 140738179.0, + "step": 117050 + }, + { + "entropy": 1.8459725484251976, + "epoch": 0.3628757578317352, + "grad_norm": 6.545345306396484, + "learning_rate": 4.19967377820312e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8515329957008362, + "num_tokens": 140750630.0, + "step": 117060 + }, + { + "entropy": 1.9014078021049499, + "epoch": 0.36290675695678487, + "grad_norm": 4.256891250610352, + "learning_rate": 4.199494406915001e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8485517993569374, + "num_tokens": 140762145.0, + "step": 117070 + }, + { + "entropy": 1.8998582303524016, + "epoch": 0.3629377560818346, + "grad_norm": 7.5159430503845215, + "learning_rate": 4.1993150586081755e-06, + "loss": 0.478, + "mean_token_accuracy": 0.845410218834877, + "num_tokens": 140773817.0, + "step": 117080 + }, + { + "entropy": 1.8339273512363434, + "epoch": 0.36296875520688426, + "grad_norm": 3.6679487228393555, + "learning_rate": 4.1991357332777346e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8533756047487259, + "num_tokens": 140786301.0, + "step": 117090 + }, + { + "entropy": 1.8324194088578225, + "epoch": 0.362999754331934, + "grad_norm": 8.492642402648926, + "learning_rate": 4.198956430918775e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8549712345004081, + "num_tokens": 140798870.0, + "step": 117100 + }, + { + "entropy": 1.8803598403930664, + "epoch": 0.36303075345698366, + "grad_norm": 7.195762634277344, + "learning_rate": 4.1987771515263905e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.8416722387075424, + "num_tokens": 140811461.0, + "step": 117110 + }, + { + "entropy": 1.9139654606580734, + "epoch": 0.3630617525820334, + "grad_norm": 4.575145244598389, + "learning_rate": 4.198597895095681e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8445180609822274, + "num_tokens": 140822762.0, + "step": 117120 + }, + { + "entropy": 1.9604573130607605, + "epoch": 0.36309275170708305, + "grad_norm": 8.761122703552246, + "learning_rate": 4.198418661621745e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8457563772797585, + "num_tokens": 140833186.0, + "step": 117130 + }, + { + "entropy": 1.8488843753933906, + "epoch": 0.3631237508321328, + "grad_norm": 8.668008804321289, + "learning_rate": 4.198239451099681e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.848555526137352, + "num_tokens": 140845017.0, + "step": 117140 + }, + { + "entropy": 1.8061452731490135, + "epoch": 0.36315474995718244, + "grad_norm": 8.766512870788574, + "learning_rate": 4.198060263524593e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8509079724550247, + "num_tokens": 140857708.0, + "step": 117150 + }, + { + "entropy": 1.8900872439146041, + "epoch": 0.36318574908223217, + "grad_norm": 8.589384078979492, + "learning_rate": 4.197881098891584e-06, + "loss": 0.454, + "mean_token_accuracy": 0.84776521474123, + "num_tokens": 140869482.0, + "step": 117160 + }, + { + "entropy": 1.887955754995346, + "epoch": 0.36321674820728184, + "grad_norm": 8.895465850830078, + "learning_rate": 4.197701957195758e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8418793961405754, + "num_tokens": 140881457.0, + "step": 117170 + }, + { + "entropy": 1.9077037498354912, + "epoch": 0.36324774733233156, + "grad_norm": 8.192936897277832, + "learning_rate": 4.197522838432221e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8485002934932708, + "num_tokens": 140893109.0, + "step": 117180 + }, + { + "entropy": 1.9021349415183066, + "epoch": 0.36327874645738123, + "grad_norm": 8.755243301391602, + "learning_rate": 4.197343742596083e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8397643223404885, + "num_tokens": 140904533.0, + "step": 117190 + }, + { + "entropy": 1.8732059866189956, + "epoch": 0.36330974558243095, + "grad_norm": 8.205700874328613, + "learning_rate": 4.19716466968245e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8494512528181076, + "num_tokens": 140917332.0, + "step": 117200 + }, + { + "entropy": 1.8849352940917015, + "epoch": 0.3633407447074806, + "grad_norm": 3.967863082885742, + "learning_rate": 4.196985619686435e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8578123107552529, + "num_tokens": 140929314.0, + "step": 117210 + }, + { + "entropy": 1.8729075148701668, + "epoch": 0.36337174383253035, + "grad_norm": 8.107409477233887, + "learning_rate": 4.196806592603149e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8556957468390465, + "num_tokens": 140941339.0, + "step": 117220 + }, + { + "entropy": 1.9158906683325767, + "epoch": 0.36340274295758, + "grad_norm": 9.10888385772705, + "learning_rate": 4.196627588427705e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.849520568549633, + "num_tokens": 140952013.0, + "step": 117230 + }, + { + "entropy": 1.8629322737455367, + "epoch": 0.36343374208262974, + "grad_norm": 7.760976314544678, + "learning_rate": 4.196448607155221e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8583438560366631, + "num_tokens": 140963744.0, + "step": 117240 + }, + { + "entropy": 1.7615326926112176, + "epoch": 0.3634647412076794, + "grad_norm": 8.887621879577637, + "learning_rate": 4.19626964878081e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8631403967738152, + "num_tokens": 140976932.0, + "step": 117250 + }, + { + "entropy": 1.8309187158942222, + "epoch": 0.36349574033272913, + "grad_norm": 7.472461223602295, + "learning_rate": 4.196090713299592e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8512938529253006, + "num_tokens": 140989695.0, + "step": 117260 + }, + { + "entropy": 1.8179723098874092, + "epoch": 0.3635267394577788, + "grad_norm": 8.319780349731445, + "learning_rate": 4.195911800706686e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.855165833234787, + "num_tokens": 141001678.0, + "step": 117270 + }, + { + "entropy": 1.873963290452957, + "epoch": 0.3635577385828285, + "grad_norm": 8.117876052856445, + "learning_rate": 4.195732910997212e-06, + "loss": 0.5322, + "mean_token_accuracy": 0.8410425320267677, + "num_tokens": 141012566.0, + "step": 117280 + }, + { + "entropy": 1.8815976113080979, + "epoch": 0.3635887377078782, + "grad_norm": 9.272675514221191, + "learning_rate": 4.195554044166294e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8484034299850464, + "num_tokens": 141023991.0, + "step": 117290 + }, + { + "entropy": 1.8928146451711654, + "epoch": 0.36361973683292786, + "grad_norm": 10.41171646118164, + "learning_rate": 4.195375200209055e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8498426347970962, + "num_tokens": 141034844.0, + "step": 117300 + }, + { + "entropy": 1.9140311002731323, + "epoch": 0.3636507359579776, + "grad_norm": 9.667838096618652, + "learning_rate": 4.195196379120619e-06, + "loss": 0.5428, + "mean_token_accuracy": 0.8315603911876679, + "num_tokens": 141045930.0, + "step": 117310 + }, + { + "entropy": 1.817529346048832, + "epoch": 0.36368173508302726, + "grad_norm": 7.76594877243042, + "learning_rate": 4.195017580896114e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8598337545990944, + "num_tokens": 141058763.0, + "step": 117320 + }, + { + "entropy": 1.971425500512123, + "epoch": 0.363712734208077, + "grad_norm": 8.319217681884766, + "learning_rate": 4.194838805530668e-06, + "loss": 0.5262, + "mean_token_accuracy": 0.8371591001749039, + "num_tokens": 141068907.0, + "step": 117330 + }, + { + "entropy": 1.9198841854929924, + "epoch": 0.36374373333312665, + "grad_norm": 8.907166481018066, + "learning_rate": 4.19466005301941e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8449474439024925, + "num_tokens": 141080433.0, + "step": 117340 + }, + { + "entropy": 1.8737755030393601, + "epoch": 0.3637747324581764, + "grad_norm": 8.013126373291016, + "learning_rate": 4.194481323357473e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8571465671062469, + "num_tokens": 141092637.0, + "step": 117350 + }, + { + "entropy": 1.9522608071565628, + "epoch": 0.36380573158322604, + "grad_norm": 6.9084296226501465, + "learning_rate": 4.194302616539986e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8557817369699479, + "num_tokens": 141103184.0, + "step": 117360 + }, + { + "entropy": 1.849492047727108, + "epoch": 0.36383673070827577, + "grad_norm": 9.977517127990723, + "learning_rate": 4.194123932562086e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8567962691187858, + "num_tokens": 141115374.0, + "step": 117370 + }, + { + "entropy": 1.8683731004595756, + "epoch": 0.36386772983332544, + "grad_norm": 3.932969093322754, + "learning_rate": 4.193945271418908e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8401071175932884, + "num_tokens": 141127677.0, + "step": 117380 + }, + { + "entropy": 1.8759948313236237, + "epoch": 0.36389872895837516, + "grad_norm": 6.948093414306641, + "learning_rate": 4.193766633105587e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8567280665040016, + "num_tokens": 141139281.0, + "step": 117390 + }, + { + "entropy": 1.9030549958348275, + "epoch": 0.36392972808342483, + "grad_norm": 4.025951862335205, + "learning_rate": 4.1935880176172626e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8426888778805732, + "num_tokens": 141150437.0, + "step": 117400 + }, + { + "entropy": 1.790473000705242, + "epoch": 0.36396072720847455, + "grad_norm": 7.977076053619385, + "learning_rate": 4.193409424949075e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8608955994248391, + "num_tokens": 141163304.0, + "step": 117410 + }, + { + "entropy": 1.9095865696668626, + "epoch": 0.3639917263335242, + "grad_norm": 6.860559940338135, + "learning_rate": 4.193230855096164e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8461943343281746, + "num_tokens": 141174839.0, + "step": 117420 + }, + { + "entropy": 1.8187424436211586, + "epoch": 0.36402272545857395, + "grad_norm": 10.87106704711914, + "learning_rate": 4.193052308053674e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8446593895554543, + "num_tokens": 141187933.0, + "step": 117430 + }, + { + "entropy": 1.8248173132538796, + "epoch": 0.3640537245836236, + "grad_norm": 8.197173118591309, + "learning_rate": 4.192873783816748e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8538053125143051, + "num_tokens": 141200760.0, + "step": 117440 + }, + { + "entropy": 1.7480739682912827, + "epoch": 0.36408472370867334, + "grad_norm": 5.014430046081543, + "learning_rate": 4.192695282380531e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8659795328974724, + "num_tokens": 141215184.0, + "step": 117450 + }, + { + "entropy": 1.9292996317148208, + "epoch": 0.364115722833723, + "grad_norm": 8.716943740844727, + "learning_rate": 4.192516803740172e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8481010347604752, + "num_tokens": 141226311.0, + "step": 117460 + }, + { + "entropy": 1.8196698293089866, + "epoch": 0.36414672195877273, + "grad_norm": 6.581869602203369, + "learning_rate": 4.192338347890818e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8496568530797959, + "num_tokens": 141238353.0, + "step": 117470 + }, + { + "entropy": 1.8798624217510223, + "epoch": 0.3641777210838224, + "grad_norm": 8.914942741394043, + "learning_rate": 4.192159914827618e-06, + "loss": 0.5287, + "mean_token_accuracy": 0.8403584629297256, + "num_tokens": 141250412.0, + "step": 117480 + }, + { + "entropy": 1.9054614737629891, + "epoch": 0.3642087202088721, + "grad_norm": 8.384186744689941, + "learning_rate": 4.191981504545725e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8418497174978257, + "num_tokens": 141261399.0, + "step": 117490 + }, + { + "entropy": 1.8841155782341956, + "epoch": 0.3642397193339218, + "grad_norm": 9.407716751098633, + "learning_rate": 4.191803117040292e-06, + "loss": 0.5278, + "mean_token_accuracy": 0.8366957977414131, + "num_tokens": 141273160.0, + "step": 117500 + }, + { + "entropy": 1.82247234582901, + "epoch": 0.3642707184589715, + "grad_norm": 10.961734771728516, + "learning_rate": 4.191624752306471e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8399570837616921, + "num_tokens": 141285805.0, + "step": 117510 + }, + { + "entropy": 1.8197978034615516, + "epoch": 0.3643017175840212, + "grad_norm": 9.942026138305664, + "learning_rate": 4.191446410339419e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8496342837810517, + "num_tokens": 141297836.0, + "step": 117520 + }, + { + "entropy": 1.8120769903063774, + "epoch": 0.3643327167090709, + "grad_norm": 4.632111549377441, + "learning_rate": 4.191268091134293e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.855706425011158, + "num_tokens": 141309962.0, + "step": 117530 + }, + { + "entropy": 1.9124917179346084, + "epoch": 0.3643637158341206, + "grad_norm": 8.424860000610352, + "learning_rate": 4.191089794686252e-06, + "loss": 0.5288, + "mean_token_accuracy": 0.8378676295280456, + "num_tokens": 141320584.0, + "step": 117540 + }, + { + "entropy": 1.871908935904503, + "epoch": 0.36439471495917025, + "grad_norm": 3.5765280723571777, + "learning_rate": 4.190911520990456e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8616762563586235, + "num_tokens": 141331690.0, + "step": 117550 + }, + { + "entropy": 1.827473521232605, + "epoch": 0.36442571408422, + "grad_norm": 8.839676856994629, + "learning_rate": 4.190733270042066e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.856446447968483, + "num_tokens": 141344147.0, + "step": 117560 + }, + { + "entropy": 1.9029761299490928, + "epoch": 0.36445671320926964, + "grad_norm": 10.330524444580078, + "learning_rate": 4.190555041836245e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8440795317292213, + "num_tokens": 141355707.0, + "step": 117570 + }, + { + "entropy": 1.8510171189904212, + "epoch": 0.36448771233431937, + "grad_norm": 10.09470272064209, + "learning_rate": 4.190376836368157e-06, + "loss": 0.5337, + "mean_token_accuracy": 0.8413840815424919, + "num_tokens": 141367817.0, + "step": 117580 + }, + { + "entropy": 1.9088376432657241, + "epoch": 0.36451871145936904, + "grad_norm": 7.446246147155762, + "learning_rate": 4.190198653632968e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.850058002769947, + "num_tokens": 141378606.0, + "step": 117590 + }, + { + "entropy": 1.7923038378357887, + "epoch": 0.36454971058441876, + "grad_norm": 4.018001556396484, + "learning_rate": 4.190020493625845e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8695195659995079, + "num_tokens": 141390643.0, + "step": 117600 + }, + { + "entropy": 1.9072920486330986, + "epoch": 0.36458070970946843, + "grad_norm": 7.586421489715576, + "learning_rate": 4.1898423563419565e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8380471915006638, + "num_tokens": 141401942.0, + "step": 117610 + }, + { + "entropy": 1.8963149681687355, + "epoch": 0.36461170883451816, + "grad_norm": Infinity, + "learning_rate": 4.1896642417764735e-06, + "loss": 0.5418, + "mean_token_accuracy": 0.8512624606490136, + "num_tokens": 141414153.0, + "step": 117620 + }, + { + "entropy": 1.8850282415747643, + "epoch": 0.3646427079595678, + "grad_norm": 10.361580848693848, + "learning_rate": 4.189486149924567e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8521965146064758, + "num_tokens": 141425510.0, + "step": 117630 + }, + { + "entropy": 1.8708653166890143, + "epoch": 0.36467370708461755, + "grad_norm": 9.141669273376465, + "learning_rate": 4.189308080781409e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.8514803513884545, + "num_tokens": 141436975.0, + "step": 117640 + }, + { + "entropy": 1.86157566010952, + "epoch": 0.3647047062096672, + "grad_norm": 7.51423454284668, + "learning_rate": 4.189130034342174e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8555971041321755, + "num_tokens": 141449229.0, + "step": 117650 + }, + { + "entropy": 1.8771350249648093, + "epoch": 0.36473570533471694, + "grad_norm": 4.558736801147461, + "learning_rate": 4.18895201060204e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.8356175437569618, + "num_tokens": 141460708.0, + "step": 117660 + }, + { + "entropy": 1.884423953294754, + "epoch": 0.3647667044597666, + "grad_norm": 7.785421848297119, + "learning_rate": 4.188774009556181e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.845032088458538, + "num_tokens": 141472534.0, + "step": 117670 + }, + { + "entropy": 1.8946044102311135, + "epoch": 0.36479770358481634, + "grad_norm": 8.261967658996582, + "learning_rate": 4.188596031199778e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8496976479887962, + "num_tokens": 141483714.0, + "step": 117680 + }, + { + "entropy": 1.7952809534966945, + "epoch": 0.364828702709866, + "grad_norm": 7.456904888153076, + "learning_rate": 4.188418075528011e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8519541397690773, + "num_tokens": 141497189.0, + "step": 117690 + }, + { + "entropy": 1.9262281000614165, + "epoch": 0.36485970183491573, + "grad_norm": 8.199373245239258, + "learning_rate": 4.188240142536061e-06, + "loss": 0.5315, + "mean_token_accuracy": 0.8272052019834518, + "num_tokens": 141508512.0, + "step": 117700 + }, + { + "entropy": 1.8607816338539123, + "epoch": 0.3648907009599654, + "grad_norm": 8.653282165527344, + "learning_rate": 4.18806223221911e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.8485778480768204, + "num_tokens": 141521091.0, + "step": 117710 + }, + { + "entropy": 1.9451293349266052, + "epoch": 0.3649217000850151, + "grad_norm": 3.9272289276123047, + "learning_rate": 4.1878843445723445e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8386497780680656, + "num_tokens": 141532558.0, + "step": 117720 + }, + { + "entropy": 1.843569315969944, + "epoch": 0.3649526992100648, + "grad_norm": 9.565539360046387, + "learning_rate": 4.187706479590949e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8429495915770531, + "num_tokens": 141545391.0, + "step": 117730 + }, + { + "entropy": 1.8172935619950294, + "epoch": 0.3649836983351145, + "grad_norm": 3.4644248485565186, + "learning_rate": 4.18752863727011e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8586100369691849, + "num_tokens": 141557495.0, + "step": 117740 + }, + { + "entropy": 1.8373050913214684, + "epoch": 0.3650146974601642, + "grad_norm": 9.560324668884277, + "learning_rate": 4.187350817605018e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8568969145417213, + "num_tokens": 141569946.0, + "step": 117750 + }, + { + "entropy": 1.8132469549775123, + "epoch": 0.3650456965852139, + "grad_norm": 10.291223526000977, + "learning_rate": 4.187173020590862e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8503494530916214, + "num_tokens": 141583063.0, + "step": 117760 + }, + { + "entropy": 1.8738684132695198, + "epoch": 0.3650766957102636, + "grad_norm": 7.412810802459717, + "learning_rate": 4.186995246222834e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8561365365982055, + "num_tokens": 141595030.0, + "step": 117770 + }, + { + "entropy": 1.9303662449121475, + "epoch": 0.36510769483531325, + "grad_norm": 7.82151985168457, + "learning_rate": 4.186817494496126e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.84500802308321, + "num_tokens": 141606726.0, + "step": 117780 + }, + { + "entropy": 1.8065059944987296, + "epoch": 0.36513869396036297, + "grad_norm": 7.31696081161499, + "learning_rate": 4.186639765405935e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8545883074402809, + "num_tokens": 141620009.0, + "step": 117790 + }, + { + "entropy": 1.8446330174803733, + "epoch": 0.36516969308541264, + "grad_norm": 8.894010543823242, + "learning_rate": 4.186462058947453e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8492456763982773, + "num_tokens": 141631772.0, + "step": 117800 + }, + { + "entropy": 1.773499608039856, + "epoch": 0.36520069221046236, + "grad_norm": 8.691332817077637, + "learning_rate": 4.186284375115881e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8507530987262726, + "num_tokens": 141646041.0, + "step": 117810 + }, + { + "entropy": 1.8811022609472274, + "epoch": 0.36523169133551203, + "grad_norm": 10.04056453704834, + "learning_rate": 4.186106713906415e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8536274120211601, + "num_tokens": 141657936.0, + "step": 117820 + }, + { + "entropy": 1.8523863427340985, + "epoch": 0.36526269046056176, + "grad_norm": 9.139519691467285, + "learning_rate": 4.1859290753142566e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8525974780321122, + "num_tokens": 141670704.0, + "step": 117830 + }, + { + "entropy": 1.8781090155243874, + "epoch": 0.3652936895856114, + "grad_norm": 11.72330379486084, + "learning_rate": 4.185751459334607e-06, + "loss": 0.5319, + "mean_token_accuracy": 0.8432107910513877, + "num_tokens": 141682045.0, + "step": 117840 + }, + { + "entropy": 1.7702817119657994, + "epoch": 0.36532468871066115, + "grad_norm": 7.2551188468933105, + "learning_rate": 4.185573865962669e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8568816781044006, + "num_tokens": 141696085.0, + "step": 117850 + }, + { + "entropy": 1.9511079430580138, + "epoch": 0.3653556878357108, + "grad_norm": 9.1026029586792, + "learning_rate": 4.185396295193647e-06, + "loss": 0.5668, + "mean_token_accuracy": 0.8349045425653457, + "num_tokens": 141707166.0, + "step": 117860 + }, + { + "entropy": 1.8783316642045975, + "epoch": 0.36538668696076054, + "grad_norm": 8.064900398254395, + "learning_rate": 4.185218747022747e-06, + "loss": 0.5292, + "mean_token_accuracy": 0.8456951096653939, + "num_tokens": 141719014.0, + "step": 117870 + }, + { + "entropy": 1.8309227600693703, + "epoch": 0.3654176860858102, + "grad_norm": 8.56662654876709, + "learning_rate": 4.185041221445176e-06, + "loss": 0.5239, + "mean_token_accuracy": 0.8421471819281579, + "num_tokens": 141732151.0, + "step": 117880 + }, + { + "entropy": 1.8460636466741562, + "epoch": 0.36544868521085994, + "grad_norm": 10.314547538757324, + "learning_rate": 4.184863718456143e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8483897626399994, + "num_tokens": 141744646.0, + "step": 117890 + }, + { + "entropy": 1.8618800938129425, + "epoch": 0.3654796843359096, + "grad_norm": 8.248429298400879, + "learning_rate": 4.184686238050858e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8518645599484443, + "num_tokens": 141756281.0, + "step": 117900 + }, + { + "entropy": 1.8451855972409248, + "epoch": 0.36551068346095933, + "grad_norm": 7.507343769073486, + "learning_rate": 4.184508780224532e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8565283268690109, + "num_tokens": 141768488.0, + "step": 117910 + }, + { + "entropy": 1.8886141359806061, + "epoch": 0.365541682586009, + "grad_norm": 8.9036865234375, + "learning_rate": 4.1843313449723795e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8407893583178521, + "num_tokens": 141780324.0, + "step": 117920 + }, + { + "entropy": 1.8497443065047263, + "epoch": 0.3655726817110587, + "grad_norm": 4.070679187774658, + "learning_rate": 4.184153932289612e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8470659956336022, + "num_tokens": 141791515.0, + "step": 117930 + }, + { + "entropy": 1.8582046449184417, + "epoch": 0.3656036808361084, + "grad_norm": 8.340042114257812, + "learning_rate": 4.183976542171449e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8539603292942047, + "num_tokens": 141803922.0, + "step": 117940 + }, + { + "entropy": 1.8717338189482688, + "epoch": 0.3656346799611581, + "grad_norm": 3.3975770473480225, + "learning_rate": 4.183799174613104e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8473453000187874, + "num_tokens": 141815267.0, + "step": 117950 + }, + { + "entropy": 1.821525427699089, + "epoch": 0.3656656790862078, + "grad_norm": 6.9430251121521, + "learning_rate": 4.183621829609798e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8513611420989037, + "num_tokens": 141827085.0, + "step": 117960 + }, + { + "entropy": 1.8612057372927666, + "epoch": 0.3656966782112575, + "grad_norm": 8.761266708374023, + "learning_rate": 4.1834445071567505e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8551434949040413, + "num_tokens": 141839369.0, + "step": 117970 + }, + { + "entropy": 1.7933984741568565, + "epoch": 0.3657276773363072, + "grad_norm": 8.286885261535645, + "learning_rate": 4.183267207249182e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8545448705554008, + "num_tokens": 141852553.0, + "step": 117980 + }, + { + "entropy": 1.702750214934349, + "epoch": 0.3657586764613569, + "grad_norm": 6.650256633758545, + "learning_rate": 4.183089929882318e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8643985167145729, + "num_tokens": 141867491.0, + "step": 117990 + }, + { + "entropy": 1.8753923997282982, + "epoch": 0.36578967558640657, + "grad_norm": 6.874874114990234, + "learning_rate": 4.182912675051379e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8641251236200332, + "num_tokens": 141879390.0, + "step": 118000 + }, + { + "entropy": 1.926875615119934, + "epoch": 0.3658206747114563, + "grad_norm": 6.646308422088623, + "learning_rate": 4.182735442751594e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.8443400964140892, + "num_tokens": 141891040.0, + "step": 118010 + }, + { + "entropy": 1.8918773487210274, + "epoch": 0.36585167383650596, + "grad_norm": 6.332447528839111, + "learning_rate": 4.182558232978188e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8434045284986496, + "num_tokens": 141902986.0, + "step": 118020 + }, + { + "entropy": 1.9184204563498497, + "epoch": 0.36588267296155563, + "grad_norm": 7.81022834777832, + "learning_rate": 4.182381045726391e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8520988777279854, + "num_tokens": 141914126.0, + "step": 118030 + }, + { + "entropy": 1.9360093891620636, + "epoch": 0.36591367208660536, + "grad_norm": 8.875545501708984, + "learning_rate": 4.182203880991431e-06, + "loss": 0.5721, + "mean_token_accuracy": 0.8364198818802834, + "num_tokens": 141925360.0, + "step": 118040 + }, + { + "entropy": 1.7633348166942597, + "epoch": 0.365944671211655, + "grad_norm": 3.641873836517334, + "learning_rate": 4.182026738768541e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8556516453623771, + "num_tokens": 141939081.0, + "step": 118050 + }, + { + "entropy": 1.9223826959729196, + "epoch": 0.36597567033670475, + "grad_norm": 7.793611526489258, + "learning_rate": 4.181849619052955e-06, + "loss": 0.5248, + "mean_token_accuracy": 0.8365512430667877, + "num_tokens": 141950887.0, + "step": 118060 + }, + { + "entropy": 1.910523234307766, + "epoch": 0.3660066694617544, + "grad_norm": 3.47296404838562, + "learning_rate": 4.181672521839904e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8425001531839371, + "num_tokens": 141962913.0, + "step": 118070 + }, + { + "entropy": 1.8669334262609483, + "epoch": 0.36603766858680414, + "grad_norm": 7.858112812042236, + "learning_rate": 4.1814954471246254e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8510735929012299, + "num_tokens": 141974965.0, + "step": 118080 + }, + { + "entropy": 1.8188461996614933, + "epoch": 0.3660686677118538, + "grad_norm": 9.689606666564941, + "learning_rate": 4.181318394902356e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8512760177254677, + "num_tokens": 141987859.0, + "step": 118090 + }, + { + "entropy": 1.8165872499346734, + "epoch": 0.36609966683690354, + "grad_norm": 8.783658981323242, + "learning_rate": 4.181141365168336e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8553719744086266, + "num_tokens": 142000311.0, + "step": 118100 + }, + { + "entropy": 1.747716872394085, + "epoch": 0.3661306659619532, + "grad_norm": 3.489741802215576, + "learning_rate": 4.1809643579178005e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8664152339100838, + "num_tokens": 142013950.0, + "step": 118110 + }, + { + "entropy": 1.8506772994995118, + "epoch": 0.36616166508700293, + "grad_norm": 7.3857316970825195, + "learning_rate": 4.180787373145996e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.863612025976181, + "num_tokens": 142025425.0, + "step": 118120 + }, + { + "entropy": 1.8373650014400482, + "epoch": 0.3661926642120526, + "grad_norm": 8.373377799987793, + "learning_rate": 4.180610410848162e-06, + "loss": 0.5219, + "mean_token_accuracy": 0.8401133880019188, + "num_tokens": 142038565.0, + "step": 118130 + }, + { + "entropy": 1.893815578520298, + "epoch": 0.3662236633371023, + "grad_norm": 3.8234965801239014, + "learning_rate": 4.1804334710195425e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8400477096438408, + "num_tokens": 142050090.0, + "step": 118140 + }, + { + "entropy": 1.825938382744789, + "epoch": 0.366254662462152, + "grad_norm": 4.16547966003418, + "learning_rate": 4.180256553655385e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8617624938488007, + "num_tokens": 142062662.0, + "step": 118150 + }, + { + "entropy": 1.801938709616661, + "epoch": 0.3662856615872017, + "grad_norm": 3.787094831466675, + "learning_rate": 4.180079658750934e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8575030252337456, + "num_tokens": 142075475.0, + "step": 118160 + }, + { + "entropy": 1.830231237411499, + "epoch": 0.3663166607122514, + "grad_norm": 3.527515172958374, + "learning_rate": 4.179902786301441e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8546881586313247, + "num_tokens": 142087897.0, + "step": 118170 + }, + { + "entropy": 1.8849106505513191, + "epoch": 0.3663476598373011, + "grad_norm": 7.191827774047852, + "learning_rate": 4.179725936302153e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8488413885235786, + "num_tokens": 142099236.0, + "step": 118180 + }, + { + "entropy": 1.9353514075279237, + "epoch": 0.3663786589623508, + "grad_norm": 9.70740032196045, + "learning_rate": 4.1795491087483225e-06, + "loss": 0.5316, + "mean_token_accuracy": 0.843434551358223, + "num_tokens": 142110114.0, + "step": 118190 + }, + { + "entropy": 1.7677051618695259, + "epoch": 0.3664096580874005, + "grad_norm": 6.214950084686279, + "learning_rate": 4.1793723036352e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8696540012955666, + "num_tokens": 142123202.0, + "step": 118200 + }, + { + "entropy": 1.8973610028624535, + "epoch": 0.36644065721245017, + "grad_norm": 7.601381301879883, + "learning_rate": 4.179195520958042e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.8459401935338974, + "num_tokens": 142134423.0, + "step": 118210 + }, + { + "entropy": 1.877578319609165, + "epoch": 0.3664716563374999, + "grad_norm": 6.392949104309082, + "learning_rate": 4.179018760712103e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8500817194581032, + "num_tokens": 142145725.0, + "step": 118220 + }, + { + "entropy": 1.8080515787005424, + "epoch": 0.36650265546254956, + "grad_norm": 2.9493696689605713, + "learning_rate": 4.178842022892638e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8623009279370308, + "num_tokens": 142157978.0, + "step": 118230 + }, + { + "entropy": 1.8072045862674713, + "epoch": 0.3665336545875993, + "grad_norm": 8.654043197631836, + "learning_rate": 4.178665307494907e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8556650042533874, + "num_tokens": 142170679.0, + "step": 118240 + }, + { + "entropy": 1.8863827347755433, + "epoch": 0.36656465371264896, + "grad_norm": 8.228194236755371, + "learning_rate": 4.178488614514169e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.843590895831585, + "num_tokens": 142182700.0, + "step": 118250 + }, + { + "entropy": 1.8511935338377952, + "epoch": 0.3665956528376987, + "grad_norm": 8.606576919555664, + "learning_rate": 4.1783119439456844e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8457527369260788, + "num_tokens": 142194831.0, + "step": 118260 + }, + { + "entropy": 1.8353765562176705, + "epoch": 0.36662665196274835, + "grad_norm": 9.248912811279297, + "learning_rate": 4.178135295784717e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8504055172204972, + "num_tokens": 142207211.0, + "step": 118270 + }, + { + "entropy": 1.883970457315445, + "epoch": 0.366657651087798, + "grad_norm": 7.958624362945557, + "learning_rate": 4.17795867002653e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.8472073882818222, + "num_tokens": 142218770.0, + "step": 118280 + }, + { + "entropy": 1.866176003217697, + "epoch": 0.36668865021284774, + "grad_norm": 9.430354118347168, + "learning_rate": 4.177782066666388e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.8443237110972405, + "num_tokens": 142230034.0, + "step": 118290 + }, + { + "entropy": 1.8722920432686805, + "epoch": 0.3667196493378974, + "grad_norm": 8.045259475708008, + "learning_rate": 4.177605485699558e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8541638612747192, + "num_tokens": 142242294.0, + "step": 118300 + }, + { + "entropy": 1.8110121667385102, + "epoch": 0.36675064846294714, + "grad_norm": 3.3392722606658936, + "learning_rate": 4.177428927121307e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8573125541210175, + "num_tokens": 142255463.0, + "step": 118310 + }, + { + "entropy": 1.826652705669403, + "epoch": 0.3667816475879968, + "grad_norm": 7.730444431304932, + "learning_rate": 4.177252390926905e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8576511397957802, + "num_tokens": 142267885.0, + "step": 118320 + }, + { + "entropy": 1.8853152304887772, + "epoch": 0.36681264671304653, + "grad_norm": 7.873587131500244, + "learning_rate": 4.1770758771116235e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8457479134202004, + "num_tokens": 142279026.0, + "step": 118330 + }, + { + "entropy": 1.7946649104356767, + "epoch": 0.3668436458380962, + "grad_norm": 3.6855735778808594, + "learning_rate": 4.176899385670734e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8608448430895805, + "num_tokens": 142292139.0, + "step": 118340 + }, + { + "entropy": 1.8492568552494049, + "epoch": 0.3668746449631459, + "grad_norm": 9.059473991394043, + "learning_rate": 4.1767229165995095e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8556591719388962, + "num_tokens": 142304080.0, + "step": 118350 + }, + { + "entropy": 1.8320197984576225, + "epoch": 0.3669056440881956, + "grad_norm": 7.440866470336914, + "learning_rate": 4.176546469893225e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8550046220421791, + "num_tokens": 142315495.0, + "step": 118360 + }, + { + "entropy": 1.9020672082901, + "epoch": 0.3669366432132453, + "grad_norm": 7.690849781036377, + "learning_rate": 4.176370045547157e-06, + "loss": 0.5199, + "mean_token_accuracy": 0.8459215462207794, + "num_tokens": 142326196.0, + "step": 118370 + }, + { + "entropy": 1.8766709178686143, + "epoch": 0.366967642338295, + "grad_norm": 7.724151134490967, + "learning_rate": 4.176193643556584e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8508606314659118, + "num_tokens": 142337696.0, + "step": 118380 + }, + { + "entropy": 1.8881130993366242, + "epoch": 0.3669986414633447, + "grad_norm": 8.17661190032959, + "learning_rate": 4.176017263916784e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8483958140015602, + "num_tokens": 142349838.0, + "step": 118390 + }, + { + "entropy": 1.9459423005580903, + "epoch": 0.3670296405883944, + "grad_norm": 8.967854499816895, + "learning_rate": 4.17584090662304e-06, + "loss": 0.5726, + "mean_token_accuracy": 0.8301825195550918, + "num_tokens": 142360518.0, + "step": 118400 + }, + { + "entropy": 1.889550267159939, + "epoch": 0.3670606397134441, + "grad_norm": 7.768711566925049, + "learning_rate": 4.175664571670631e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.8391052410006523, + "num_tokens": 142372416.0, + "step": 118410 + }, + { + "entropy": 1.8851889297366142, + "epoch": 0.3670916388384938, + "grad_norm": 7.86566686630249, + "learning_rate": 4.175488259054841e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.857041896879673, + "num_tokens": 142384483.0, + "step": 118420 + }, + { + "entropy": 1.8617109119892121, + "epoch": 0.3671226379635435, + "grad_norm": 3.7308735847473145, + "learning_rate": 4.175311968770956e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8486345887184144, + "num_tokens": 142396492.0, + "step": 118430 + }, + { + "entropy": 1.8434831380844117, + "epoch": 0.36715363708859317, + "grad_norm": 3.5595266819000244, + "learning_rate": 4.175135700814261e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8539713755249977, + "num_tokens": 142408880.0, + "step": 118440 + }, + { + "entropy": 1.8063078075647354, + "epoch": 0.3671846362136429, + "grad_norm": 7.024228096008301, + "learning_rate": 4.174959455180043e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8617870509624481, + "num_tokens": 142422170.0, + "step": 118450 + }, + { + "entropy": 1.9420251905918122, + "epoch": 0.36721563533869256, + "grad_norm": 7.5710039138793945, + "learning_rate": 4.174783231863592e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8481424629688263, + "num_tokens": 142432968.0, + "step": 118460 + }, + { + "entropy": 1.8173451155424118, + "epoch": 0.3672466344637423, + "grad_norm": 7.755475997924805, + "learning_rate": 4.174607030860197e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8645861312747002, + "num_tokens": 142446003.0, + "step": 118470 + }, + { + "entropy": 1.8867697641253471, + "epoch": 0.36727763358879195, + "grad_norm": 8.526572227478027, + "learning_rate": 4.174430852165151e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8466065526008606, + "num_tokens": 142457251.0, + "step": 118480 + }, + { + "entropy": 1.9442676544189452, + "epoch": 0.3673086327138417, + "grad_norm": 8.537225723266602, + "learning_rate": 4.1742546957737465e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8544994726777076, + "num_tokens": 142468748.0, + "step": 118490 + }, + { + "entropy": 1.846812443435192, + "epoch": 0.36733963183889135, + "grad_norm": 7.822627544403076, + "learning_rate": 4.174078561681279e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8543079942464828, + "num_tokens": 142480975.0, + "step": 118500 + }, + { + "entropy": 1.87229622900486, + "epoch": 0.36737063096394107, + "grad_norm": 4.54270076751709, + "learning_rate": 4.173902449883043e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8584425121545791, + "num_tokens": 142492336.0, + "step": 118510 + }, + { + "entropy": 1.889968466758728, + "epoch": 0.36740163008899074, + "grad_norm": 3.9654746055603027, + "learning_rate": 4.173726360374335e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.850946743786335, + "num_tokens": 142504212.0, + "step": 118520 + }, + { + "entropy": 1.8861878275871278, + "epoch": 0.3674326292140404, + "grad_norm": 11.20728588104248, + "learning_rate": 4.173550293150456e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8468590840697289, + "num_tokens": 142516169.0, + "step": 118530 + }, + { + "entropy": 1.842411059141159, + "epoch": 0.36746362833909013, + "grad_norm": 8.36995792388916, + "learning_rate": 4.1733742482067035e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8587794110178948, + "num_tokens": 142528006.0, + "step": 118540 + }, + { + "entropy": 1.873109185695648, + "epoch": 0.3674946274641398, + "grad_norm": 8.865880012512207, + "learning_rate": 4.173198225538381e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8478357747197152, + "num_tokens": 142540361.0, + "step": 118550 + }, + { + "entropy": 1.861200602352619, + "epoch": 0.3675256265891895, + "grad_norm": 7.625993728637695, + "learning_rate": 4.173022225140791e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8592444315552712, + "num_tokens": 142553408.0, + "step": 118560 + }, + { + "entropy": 1.8547646909952165, + "epoch": 0.3675566257142392, + "grad_norm": 8.53388500213623, + "learning_rate": 4.172846247009236e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8481455728411674, + "num_tokens": 142566455.0, + "step": 118570 + }, + { + "entropy": 1.8456396833062172, + "epoch": 0.3675876248392889, + "grad_norm": 3.9087069034576416, + "learning_rate": 4.1726702911390225e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8550665810704231, + "num_tokens": 142578809.0, + "step": 118580 + }, + { + "entropy": 1.8159615725278855, + "epoch": 0.3676186239643386, + "grad_norm": 4.247369289398193, + "learning_rate": 4.172494357525458e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8549291148781777, + "num_tokens": 142591777.0, + "step": 118590 + }, + { + "entropy": 1.84511306732893, + "epoch": 0.3676496230893883, + "grad_norm": 4.138094425201416, + "learning_rate": 4.17231844616385e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8563702628016472, + "num_tokens": 142605209.0, + "step": 118600 + }, + { + "entropy": 1.9339896500110627, + "epoch": 0.367680622214438, + "grad_norm": 7.239840030670166, + "learning_rate": 4.17214255704951e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8529000446200371, + "num_tokens": 142616618.0, + "step": 118610 + }, + { + "entropy": 1.9016248881816864, + "epoch": 0.3677116213394877, + "grad_norm": 10.453241348266602, + "learning_rate": 4.171966690177746e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8428125187754631, + "num_tokens": 142628132.0, + "step": 118620 + }, + { + "entropy": 1.9518603295087815, + "epoch": 0.3677426204645374, + "grad_norm": 8.346458435058594, + "learning_rate": 4.171790845543873e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8541428744792938, + "num_tokens": 142639425.0, + "step": 118630 + }, + { + "entropy": 1.8354076534509658, + "epoch": 0.3677736195895871, + "grad_norm": 10.148712158203125, + "learning_rate": 4.171615023143204e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.855410099029541, + "num_tokens": 142651425.0, + "step": 118640 + }, + { + "entropy": 1.9070971041917801, + "epoch": 0.36780461871463677, + "grad_norm": 8.946581840515137, + "learning_rate": 4.1714392229710536e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8442860797047615, + "num_tokens": 142662410.0, + "step": 118650 + }, + { + "entropy": 1.8595642820000648, + "epoch": 0.3678356178396865, + "grad_norm": 9.433381080627441, + "learning_rate": 4.17126344502274e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8475638180971146, + "num_tokens": 142674406.0, + "step": 118660 + }, + { + "entropy": 1.951467391848564, + "epoch": 0.36786661696473616, + "grad_norm": 7.69748592376709, + "learning_rate": 4.17108768929358e-06, + "loss": 0.5543, + "mean_token_accuracy": 0.8346621260046959, + "num_tokens": 142685520.0, + "step": 118670 + }, + { + "entropy": 1.8777378484606744, + "epoch": 0.3678976160897859, + "grad_norm": 3.798490285873413, + "learning_rate": 4.170911955778893e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.851957768201828, + "num_tokens": 142696814.0, + "step": 118680 + }, + { + "entropy": 1.8550070881843568, + "epoch": 0.36792861521483555, + "grad_norm": 6.965719699859619, + "learning_rate": 4.170736244474e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8539418116211891, + "num_tokens": 142709691.0, + "step": 118690 + }, + { + "entropy": 1.8812433794140815, + "epoch": 0.3679596143398853, + "grad_norm": 8.439732551574707, + "learning_rate": 4.170560555374224e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8429150357842445, + "num_tokens": 142721133.0, + "step": 118700 + }, + { + "entropy": 1.8941943876445293, + "epoch": 0.36799061346493495, + "grad_norm": 3.9443578720092773, + "learning_rate": 4.1703848884748875e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.84272540807724, + "num_tokens": 142734302.0, + "step": 118710 + }, + { + "entropy": 1.8599026456475258, + "epoch": 0.36802161258998467, + "grad_norm": 9.261316299438477, + "learning_rate": 4.170209243771315e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8592417910695076, + "num_tokens": 142745854.0, + "step": 118720 + }, + { + "entropy": 1.8526142731308937, + "epoch": 0.36805261171503434, + "grad_norm": 8.83598804473877, + "learning_rate": 4.1700336212588336e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.850617028772831, + "num_tokens": 142758084.0, + "step": 118730 + }, + { + "entropy": 1.9292904302477836, + "epoch": 0.36808361084008406, + "grad_norm": 8.976339340209961, + "learning_rate": 4.169858020932772e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8495315477252007, + "num_tokens": 142769212.0, + "step": 118740 + }, + { + "entropy": 1.9031863331794738, + "epoch": 0.36811460996513373, + "grad_norm": 9.556485176086426, + "learning_rate": 4.169682442788456e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.8372549533843994, + "num_tokens": 142780283.0, + "step": 118750 + }, + { + "entropy": 1.903608924150467, + "epoch": 0.36814560909018346, + "grad_norm": 8.73618221282959, + "learning_rate": 4.16950688682122e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.841778826713562, + "num_tokens": 142791778.0, + "step": 118760 + }, + { + "entropy": 1.8728568017482758, + "epoch": 0.3681766082152331, + "grad_norm": 7.941903591156006, + "learning_rate": 4.1693313530263924e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8445895060896873, + "num_tokens": 142804320.0, + "step": 118770 + }, + { + "entropy": 1.822492091357708, + "epoch": 0.3682076073402828, + "grad_norm": 4.276817798614502, + "learning_rate": 4.1691558413993075e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8638914600014687, + "num_tokens": 142816773.0, + "step": 118780 + }, + { + "entropy": 1.846034236252308, + "epoch": 0.3682386064653325, + "grad_norm": 9.146262168884277, + "learning_rate": 4.168980351935301e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8438366368412972, + "num_tokens": 142828530.0, + "step": 118790 + }, + { + "entropy": 1.9032179087400436, + "epoch": 0.3682696055903822, + "grad_norm": 8.37747573852539, + "learning_rate": 4.168804884629708e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8452135339379311, + "num_tokens": 142839769.0, + "step": 118800 + }, + { + "entropy": 1.9008402451872826, + "epoch": 0.3683006047154319, + "grad_norm": 7.905856609344482, + "learning_rate": 4.1686294394778654e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8504106491804123, + "num_tokens": 142851380.0, + "step": 118810 + }, + { + "entropy": 1.8856339365243913, + "epoch": 0.3683316038404816, + "grad_norm": 6.864739894866943, + "learning_rate": 4.168454016475113e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8480422243475914, + "num_tokens": 142863896.0, + "step": 118820 + }, + { + "entropy": 1.872384211421013, + "epoch": 0.3683626029655313, + "grad_norm": 8.543014526367188, + "learning_rate": 4.168278615616788e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8365708619356156, + "num_tokens": 142876025.0, + "step": 118830 + }, + { + "entropy": 1.8250932022929192, + "epoch": 0.368393602090581, + "grad_norm": 3.684549570083618, + "learning_rate": 4.168103236898236e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8542123079299927, + "num_tokens": 142888811.0, + "step": 118840 + }, + { + "entropy": 1.9026092737913132, + "epoch": 0.3684246012156307, + "grad_norm": 7.081031322479248, + "learning_rate": 4.167927880314796e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8558070927858352, + "num_tokens": 142900186.0, + "step": 118850 + }, + { + "entropy": 1.9256568044424056, + "epoch": 0.36845560034068037, + "grad_norm": 7.484577178955078, + "learning_rate": 4.167752545861815e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8413769558072091, + "num_tokens": 142912507.0, + "step": 118860 + }, + { + "entropy": 1.7556425005197525, + "epoch": 0.3684865994657301, + "grad_norm": 8.272164344787598, + "learning_rate": 4.167577233534637e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8660473376512527, + "num_tokens": 142927368.0, + "step": 118870 + }, + { + "entropy": 1.9561314284801483, + "epoch": 0.36851759859077976, + "grad_norm": 8.683992385864258, + "learning_rate": 4.167401943328609e-06, + "loss": 0.5259, + "mean_token_accuracy": 0.8412436872720719, + "num_tokens": 142938818.0, + "step": 118880 + }, + { + "entropy": 1.854485747218132, + "epoch": 0.3685485977158295, + "grad_norm": 4.319868087768555, + "learning_rate": 4.167226675239079e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8507488906383515, + "num_tokens": 142951007.0, + "step": 118890 + }, + { + "entropy": 1.7686067208647729, + "epoch": 0.36857959684087915, + "grad_norm": 7.7503581047058105, + "learning_rate": 4.167051429261398e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8598582178354264, + "num_tokens": 142964704.0, + "step": 118900 + }, + { + "entropy": 1.898981074988842, + "epoch": 0.3686105959659289, + "grad_norm": 6.987488269805908, + "learning_rate": 4.166876205390915e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8474484965205192, + "num_tokens": 142976441.0, + "step": 118910 + }, + { + "entropy": 1.8347061678767205, + "epoch": 0.36864159509097855, + "grad_norm": 7.668900489807129, + "learning_rate": 4.166701003622984e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8599162891507148, + "num_tokens": 142989393.0, + "step": 118920 + }, + { + "entropy": 1.8358816392719746, + "epoch": 0.36867259421602827, + "grad_norm": 8.171165466308594, + "learning_rate": 4.166525823952959e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8551807060837746, + "num_tokens": 143001626.0, + "step": 118930 + }, + { + "entropy": 1.7875201195478438, + "epoch": 0.36870359334107794, + "grad_norm": 4.123532295227051, + "learning_rate": 4.166350666376194e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8510026276111603, + "num_tokens": 143014393.0, + "step": 118940 + }, + { + "entropy": 1.9123052790760995, + "epoch": 0.36873459246612766, + "grad_norm": 9.459202766418457, + "learning_rate": 4.1661755308880446e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8408966720104217, + "num_tokens": 143025392.0, + "step": 118950 + }, + { + "entropy": 1.8176440641283989, + "epoch": 0.36876559159117733, + "grad_norm": 7.813144683837891, + "learning_rate": 4.166000417483871e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8539251640439034, + "num_tokens": 143038069.0, + "step": 118960 + }, + { + "entropy": 1.8900042787194251, + "epoch": 0.36879659071622706, + "grad_norm": 3.9542362689971924, + "learning_rate": 4.16582532615903e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8539608538150787, + "num_tokens": 143049603.0, + "step": 118970 + }, + { + "entropy": 1.83581335991621, + "epoch": 0.3688275898412767, + "grad_norm": 3.050680160522461, + "learning_rate": 4.165650256908884e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8514253705739975, + "num_tokens": 143062623.0, + "step": 118980 + }, + { + "entropy": 1.7662447020411491, + "epoch": 0.36885858896632645, + "grad_norm": 5.801089763641357, + "learning_rate": 4.165475209728794e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8633951663970947, + "num_tokens": 143075670.0, + "step": 118990 + }, + { + "entropy": 1.8178039237856864, + "epoch": 0.3688895880913761, + "grad_norm": 3.8043034076690674, + "learning_rate": 4.165300184614124e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8604209214448929, + "num_tokens": 143088556.0, + "step": 119000 + }, + { + "entropy": 1.841179284453392, + "epoch": 0.36892058721642584, + "grad_norm": 7.6272292137146, + "learning_rate": 4.1651251815602385e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8553322166204452, + "num_tokens": 143100756.0, + "step": 119010 + }, + { + "entropy": 1.8461539566516876, + "epoch": 0.3689515863414755, + "grad_norm": 9.249650955200195, + "learning_rate": 4.1649502005625024e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.85612553358078, + "num_tokens": 143112605.0, + "step": 119020 + }, + { + "entropy": 1.902879323065281, + "epoch": 0.3689825854665252, + "grad_norm": 7.911184310913086, + "learning_rate": 4.164775241616285e-06, + "loss": 0.5353, + "mean_token_accuracy": 0.835160045325756, + "num_tokens": 143124403.0, + "step": 119030 + }, + { + "entropy": 1.9350175976753234, + "epoch": 0.3690135845915749, + "grad_norm": 9.767508506774902, + "learning_rate": 4.164600304716953e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.8444019317626953, + "num_tokens": 143135579.0, + "step": 119040 + }, + { + "entropy": 1.797468328475952, + "epoch": 0.3690445837166246, + "grad_norm": 8.605935096740723, + "learning_rate": 4.164425389859878e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8529829382896423, + "num_tokens": 143148800.0, + "step": 119050 + }, + { + "entropy": 1.8765917882323264, + "epoch": 0.3690755828416743, + "grad_norm": 6.596861839294434, + "learning_rate": 4.164250497040431e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8517785176634789, + "num_tokens": 143160367.0, + "step": 119060 + }, + { + "entropy": 1.8928829789161683, + "epoch": 0.36910658196672397, + "grad_norm": 7.310998916625977, + "learning_rate": 4.164075626253985e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8375116720795631, + "num_tokens": 143172139.0, + "step": 119070 + }, + { + "entropy": 1.8701190561056138, + "epoch": 0.3691375810917737, + "grad_norm": 9.065203666687012, + "learning_rate": 4.163900777495915e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8435375481843949, + "num_tokens": 143184281.0, + "step": 119080 + }, + { + "entropy": 1.8811526045203208, + "epoch": 0.36916858021682336, + "grad_norm": 3.9514009952545166, + "learning_rate": 4.1637259507615935e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8543221414089203, + "num_tokens": 143196848.0, + "step": 119090 + }, + { + "entropy": 1.8915190115571021, + "epoch": 0.3691995793418731, + "grad_norm": 9.159911155700684, + "learning_rate": 4.163551146046401e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8477994874119759, + "num_tokens": 143207993.0, + "step": 119100 + }, + { + "entropy": 1.9243160128593444, + "epoch": 0.36923057846692275, + "grad_norm": 8.874519348144531, + "learning_rate": 4.163376363345714e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8604633152484894, + "num_tokens": 143218596.0, + "step": 119110 + }, + { + "entropy": 1.9454612672328948, + "epoch": 0.3692615775919725, + "grad_norm": 9.145526885986328, + "learning_rate": 4.163201602654912e-06, + "loss": 0.5398, + "mean_token_accuracy": 0.8401903316378594, + "num_tokens": 143230338.0, + "step": 119120 + }, + { + "entropy": 1.896520721912384, + "epoch": 0.36929257671702215, + "grad_norm": 8.464309692382812, + "learning_rate": 4.163026863969376e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8491705074906349, + "num_tokens": 143242088.0, + "step": 119130 + }, + { + "entropy": 1.943563875555992, + "epoch": 0.36932357584207187, + "grad_norm": 7.9828033447265625, + "learning_rate": 4.162852147284489e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8447094395756721, + "num_tokens": 143253099.0, + "step": 119140 + }, + { + "entropy": 1.9148243635892868, + "epoch": 0.36935457496712154, + "grad_norm": 8.171924591064453, + "learning_rate": 4.162677452595636e-06, + "loss": 0.5437, + "mean_token_accuracy": 0.846064169704914, + "num_tokens": 143264931.0, + "step": 119150 + }, + { + "entropy": 1.9138666808605194, + "epoch": 0.36938557409217126, + "grad_norm": 9.261260986328125, + "learning_rate": 4.162502779898198e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8433482199907303, + "num_tokens": 143275598.0, + "step": 119160 + }, + { + "entropy": 1.8780044361948967, + "epoch": 0.36941657321722093, + "grad_norm": 8.321491241455078, + "learning_rate": 4.162328129187566e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8502790480852127, + "num_tokens": 143288132.0, + "step": 119170 + }, + { + "entropy": 1.9145966663956642, + "epoch": 0.36944757234227066, + "grad_norm": 7.993964672088623, + "learning_rate": 4.162153500459123e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8421851694583893, + "num_tokens": 143300040.0, + "step": 119180 + }, + { + "entropy": 1.8805766895413398, + "epoch": 0.3694785714673203, + "grad_norm": 8.223487854003906, + "learning_rate": 4.161978893708263e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8541955664753914, + "num_tokens": 143311847.0, + "step": 119190 + }, + { + "entropy": 1.93892260491848, + "epoch": 0.36950957059237005, + "grad_norm": 9.87500286102295, + "learning_rate": 4.161804308930374e-06, + "loss": 0.548, + "mean_token_accuracy": 0.8372417896986007, + "num_tokens": 143324118.0, + "step": 119200 + }, + { + "entropy": 1.892756275832653, + "epoch": 0.3695405697174197, + "grad_norm": 7.675792694091797, + "learning_rate": 4.161629746120847e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.849764832854271, + "num_tokens": 143335130.0, + "step": 119210 + }, + { + "entropy": 1.9276916295289994, + "epoch": 0.36957156884246944, + "grad_norm": 8.37989330291748, + "learning_rate": 4.161455205275077e-06, + "loss": 0.5287, + "mean_token_accuracy": 0.8419989302754403, + "num_tokens": 143346444.0, + "step": 119220 + }, + { + "entropy": 1.846215435862541, + "epoch": 0.3696025679675191, + "grad_norm": 8.054888725280762, + "learning_rate": 4.161280686388458e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8463585451245308, + "num_tokens": 143358709.0, + "step": 119230 + }, + { + "entropy": 1.8919729992747307, + "epoch": 0.36963356709256884, + "grad_norm": 8.983673095703125, + "learning_rate": 4.161106189456385e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8458945691585541, + "num_tokens": 143371208.0, + "step": 119240 + }, + { + "entropy": 1.9590601921081543, + "epoch": 0.3696645662176185, + "grad_norm": 9.514650344848633, + "learning_rate": 4.160931714474256e-06, + "loss": 0.5253, + "mean_token_accuracy": 0.8396427750587463, + "num_tokens": 143381837.0, + "step": 119250 + }, + { + "entropy": 1.8351401954889297, + "epoch": 0.36969556534266823, + "grad_norm": 7.223183631896973, + "learning_rate": 4.1607572614374696e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.863115793466568, + "num_tokens": 143393900.0, + "step": 119260 + }, + { + "entropy": 1.7584014266729355, + "epoch": 0.3697265644677179, + "grad_norm": 3.2340762615203857, + "learning_rate": 4.160582830341426e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8620016917586326, + "num_tokens": 143408131.0, + "step": 119270 + }, + { + "entropy": 1.9106255337595939, + "epoch": 0.36975756359276757, + "grad_norm": 8.975741386413574, + "learning_rate": 4.160408421181527e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8459363922476768, + "num_tokens": 143419497.0, + "step": 119280 + }, + { + "entropy": 1.9070470571517943, + "epoch": 0.3697885627178173, + "grad_norm": 9.103129386901855, + "learning_rate": 4.160234033953172e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.848831920325756, + "num_tokens": 143430671.0, + "step": 119290 + }, + { + "entropy": 1.8115619271993637, + "epoch": 0.36981956184286696, + "grad_norm": 9.283622741699219, + "learning_rate": 4.160059668651768e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8591952756047249, + "num_tokens": 143442619.0, + "step": 119300 + }, + { + "entropy": 1.8160853952169418, + "epoch": 0.3698505609679167, + "grad_norm": 3.4729106426239014, + "learning_rate": 4.15988532527272e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8528380259871483, + "num_tokens": 143454996.0, + "step": 119310 + }, + { + "entropy": 1.9566965460777284, + "epoch": 0.36988156009296635, + "grad_norm": 8.186453819274902, + "learning_rate": 4.159711003811434e-06, + "loss": 0.5251, + "mean_token_accuracy": 0.8404311820864677, + "num_tokens": 143466235.0, + "step": 119320 + }, + { + "entropy": 1.770469543337822, + "epoch": 0.3699125592180161, + "grad_norm": 4.339020729064941, + "learning_rate": 4.1595367042633196e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8623701587319375, + "num_tokens": 143479903.0, + "step": 119330 + }, + { + "entropy": 1.8619060233235358, + "epoch": 0.36994355834306575, + "grad_norm": 7.124758243560791, + "learning_rate": 4.159362426623783e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8501885548233986, + "num_tokens": 143492204.0, + "step": 119340 + }, + { + "entropy": 1.781793449819088, + "epoch": 0.3699745574681155, + "grad_norm": 6.788565635681152, + "learning_rate": 4.159188170888238e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8653283238410949, + "num_tokens": 143504832.0, + "step": 119350 + }, + { + "entropy": 1.8910044834017754, + "epoch": 0.37000555659316514, + "grad_norm": 9.051020622253418, + "learning_rate": 4.159013937052096e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8442646443843842, + "num_tokens": 143516104.0, + "step": 119360 + }, + { + "entropy": 1.9117089003324508, + "epoch": 0.37003655571821487, + "grad_norm": 6.4968390464782715, + "learning_rate": 4.158839725110769e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.852028714120388, + "num_tokens": 143527838.0, + "step": 119370 + }, + { + "entropy": 1.80558120906353, + "epoch": 0.37006755484326453, + "grad_norm": 8.500289916992188, + "learning_rate": 4.158665535059673e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8530550315976143, + "num_tokens": 143541211.0, + "step": 119380 + }, + { + "entropy": 1.8069082364439963, + "epoch": 0.37009855396831426, + "grad_norm": 6.210768699645996, + "learning_rate": 4.158491366894224e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8599958032369613, + "num_tokens": 143553620.0, + "step": 119390 + }, + { + "entropy": 1.7626231014728546, + "epoch": 0.3701295530933639, + "grad_norm": 8.250438690185547, + "learning_rate": 4.158317220609839e-06, + "loss": 0.405, + "mean_token_accuracy": 0.860263791680336, + "num_tokens": 143567049.0, + "step": 119400 + }, + { + "entropy": 1.888748326897621, + "epoch": 0.37016055221841365, + "grad_norm": 9.457777976989746, + "learning_rate": 4.158143096201936e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8503834918141365, + "num_tokens": 143578643.0, + "step": 119410 + }, + { + "entropy": 1.910116845369339, + "epoch": 0.3701915513434633, + "grad_norm": 6.411842346191406, + "learning_rate": 4.157968993665937e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8387831076979637, + "num_tokens": 143590634.0, + "step": 119420 + }, + { + "entropy": 1.8767773866653443, + "epoch": 0.37022255046851305, + "grad_norm": 9.648838996887207, + "learning_rate": 4.157794912997263e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.843813206255436, + "num_tokens": 143602357.0, + "step": 119430 + }, + { + "entropy": 1.9029111877083777, + "epoch": 0.3702535495935627, + "grad_norm": 8.24268913269043, + "learning_rate": 4.157620854191336e-06, + "loss": 0.5262, + "mean_token_accuracy": 0.8355750843882561, + "num_tokens": 143613750.0, + "step": 119440 + }, + { + "entropy": 1.960300487279892, + "epoch": 0.37028454871861244, + "grad_norm": 7.390927791595459, + "learning_rate": 4.1574468172435805e-06, + "loss": 0.5408, + "mean_token_accuracy": 0.8368476778268814, + "num_tokens": 143624528.0, + "step": 119450 + }, + { + "entropy": 1.9035738706588745, + "epoch": 0.3703155478436621, + "grad_norm": 7.557567596435547, + "learning_rate": 4.157272802149423e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8415401116013527, + "num_tokens": 143636080.0, + "step": 119460 + }, + { + "entropy": 1.9506605803966521, + "epoch": 0.37034654696871183, + "grad_norm": 7.982449054718018, + "learning_rate": 4.157098808904288e-06, + "loss": 0.5286, + "mean_token_accuracy": 0.8373087286949158, + "num_tokens": 143646846.0, + "step": 119470 + }, + { + "entropy": 1.8253500118851662, + "epoch": 0.3703775460937615, + "grad_norm": 4.673305034637451, + "learning_rate": 4.156924837503605e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8508133858442306, + "num_tokens": 143659349.0, + "step": 119480 + }, + { + "entropy": 1.8085382550954818, + "epoch": 0.3704085452188112, + "grad_norm": 3.9146220684051514, + "learning_rate": 4.156750887942804e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8524488687515259, + "num_tokens": 143672699.0, + "step": 119490 + }, + { + "entropy": 1.8396509125828744, + "epoch": 0.3704395443438609, + "grad_norm": 9.417864799499512, + "learning_rate": 4.156576960217317e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8548582971096039, + "num_tokens": 143684766.0, + "step": 119500 + }, + { + "entropy": 1.8102483958005906, + "epoch": 0.37047054346891056, + "grad_norm": 8.302236557006836, + "learning_rate": 4.156403054322573e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8558089345693588, + "num_tokens": 143696483.0, + "step": 119510 + }, + { + "entropy": 1.8374242320656777, + "epoch": 0.3705015425939603, + "grad_norm": 8.9642972946167, + "learning_rate": 4.156229170254007e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8551341906189919, + "num_tokens": 143709585.0, + "step": 119520 + }, + { + "entropy": 1.7632934302091599, + "epoch": 0.37053254171900996, + "grad_norm": 8.926246643066406, + "learning_rate": 4.156055308007056e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.858980093896389, + "num_tokens": 143722796.0, + "step": 119530 + }, + { + "entropy": 1.7985145077109337, + "epoch": 0.3705635408440597, + "grad_norm": 3.6693923473358154, + "learning_rate": 4.155881467577151e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8529761865735054, + "num_tokens": 143735885.0, + "step": 119540 + }, + { + "entropy": 1.8460396811366082, + "epoch": 0.37059453996910935, + "grad_norm": 8.484942436218262, + "learning_rate": 4.1557076489597354e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8540690630674362, + "num_tokens": 143747844.0, + "step": 119550 + }, + { + "entropy": 1.8268393889069556, + "epoch": 0.3706255390941591, + "grad_norm": 10.119608879089355, + "learning_rate": 4.155533852150245e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8529582008719444, + "num_tokens": 143760082.0, + "step": 119560 + }, + { + "entropy": 1.8313589990139008, + "epoch": 0.37065653821920874, + "grad_norm": 10.065152168273926, + "learning_rate": 4.155360077144119e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8478084012866021, + "num_tokens": 143772556.0, + "step": 119570 + }, + { + "entropy": 1.7658352941274642, + "epoch": 0.37068753734425847, + "grad_norm": 4.34824800491333, + "learning_rate": 4.155186323936802e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8594838708639145, + "num_tokens": 143786868.0, + "step": 119580 + }, + { + "entropy": 1.9014542356133461, + "epoch": 0.37071853646930814, + "grad_norm": 8.814896583557129, + "learning_rate": 4.155012592523735e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.844634436070919, + "num_tokens": 143799003.0, + "step": 119590 + }, + { + "entropy": 1.9206546247005463, + "epoch": 0.37074953559435786, + "grad_norm": 7.594090461730957, + "learning_rate": 4.154838882900362e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8516721650958061, + "num_tokens": 143809724.0, + "step": 119600 + }, + { + "entropy": 1.920139655470848, + "epoch": 0.37078053471940753, + "grad_norm": 6.954849720001221, + "learning_rate": 4.154665195062129e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8434002518653869, + "num_tokens": 143821147.0, + "step": 119610 + }, + { + "entropy": 1.8785637602210046, + "epoch": 0.37081153384445725, + "grad_norm": 7.791470527648926, + "learning_rate": 4.154491529004484e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8430327326059341, + "num_tokens": 143832843.0, + "step": 119620 + }, + { + "entropy": 1.8350661814212799, + "epoch": 0.3708425329695069, + "grad_norm": 9.10515308380127, + "learning_rate": 4.1543178847228734e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8528729796409606, + "num_tokens": 143844895.0, + "step": 119630 + }, + { + "entropy": 1.8383238837122917, + "epoch": 0.37087353209455665, + "grad_norm": 7.8374834060668945, + "learning_rate": 4.154144262212747e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8570844009518623, + "num_tokens": 143856936.0, + "step": 119640 + }, + { + "entropy": 1.8358725041151047, + "epoch": 0.3709045312196063, + "grad_norm": 10.492681503295898, + "learning_rate": 4.153970661469557e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8490477934479713, + "num_tokens": 143869530.0, + "step": 119650 + }, + { + "entropy": 1.889818374812603, + "epoch": 0.37093553034465604, + "grad_norm": 8.570408821105957, + "learning_rate": 4.153797082488753e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8489762708544731, + "num_tokens": 143880824.0, + "step": 119660 + }, + { + "entropy": 1.827413833141327, + "epoch": 0.3709665294697057, + "grad_norm": 8.959956169128418, + "learning_rate": 4.153623525265792e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8504405677318573, + "num_tokens": 143893214.0, + "step": 119670 + }, + { + "entropy": 1.8852188110351562, + "epoch": 0.37099752859475543, + "grad_norm": 10.366182327270508, + "learning_rate": 4.153449989796128e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8433555126190185, + "num_tokens": 143904446.0, + "step": 119680 + }, + { + "entropy": 1.9464125022292138, + "epoch": 0.3710285277198051, + "grad_norm": 10.189809799194336, + "learning_rate": 4.153276476075214e-06, + "loss": 0.5475, + "mean_token_accuracy": 0.8357294231653214, + "num_tokens": 143915556.0, + "step": 119690 + }, + { + "entropy": 1.825826181471348, + "epoch": 0.3710595268448548, + "grad_norm": 8.288381576538086, + "learning_rate": 4.1531029840985115e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8584688425064086, + "num_tokens": 143928024.0, + "step": 119700 + }, + { + "entropy": 1.8943134620785713, + "epoch": 0.3710905259699045, + "grad_norm": 7.004955768585205, + "learning_rate": 4.152929513861477e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8523566782474518, + "num_tokens": 143940068.0, + "step": 119710 + }, + { + "entropy": 1.8529562443494796, + "epoch": 0.3711215250949542, + "grad_norm": 3.9424831867218018, + "learning_rate": 4.152756065359572e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.859981244802475, + "num_tokens": 143952445.0, + "step": 119720 + }, + { + "entropy": 1.8910843506455421, + "epoch": 0.3711525242200039, + "grad_norm": 9.168560028076172, + "learning_rate": 4.1525826385882565e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8453802585601806, + "num_tokens": 143964559.0, + "step": 119730 + }, + { + "entropy": 1.8304832607507706, + "epoch": 0.3711835233450536, + "grad_norm": 7.770691871643066, + "learning_rate": 4.152409233542995e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8464579433202744, + "num_tokens": 143976473.0, + "step": 119740 + }, + { + "entropy": 1.8643721297383309, + "epoch": 0.3712145224701033, + "grad_norm": 9.255126953125, + "learning_rate": 4.15223585021925e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8462480574846267, + "num_tokens": 143987917.0, + "step": 119750 + }, + { + "entropy": 1.9292099431157113, + "epoch": 0.37124552159515295, + "grad_norm": 10.51768684387207, + "learning_rate": 4.1520624886124886e-06, + "loss": 0.5321, + "mean_token_accuracy": 0.8432277202606201, + "num_tokens": 144000155.0, + "step": 119760 + }, + { + "entropy": 1.8904988139867782, + "epoch": 0.3712765207202027, + "grad_norm": 6.968792915344238, + "learning_rate": 4.151889148718177e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8615191966295243, + "num_tokens": 144012142.0, + "step": 119770 + }, + { + "entropy": 1.9146782085299492, + "epoch": 0.37130751984525234, + "grad_norm": 8.367558479309082, + "learning_rate": 4.151715830531783e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8470437869429588, + "num_tokens": 144023416.0, + "step": 119780 + }, + { + "entropy": 1.8538981348276138, + "epoch": 0.37133851897030207, + "grad_norm": 8.344622611999512, + "learning_rate": 4.151542534048775e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8540352538228035, + "num_tokens": 144035624.0, + "step": 119790 + }, + { + "entropy": 1.9207333639264106, + "epoch": 0.37136951809535174, + "grad_norm": 7.7748942375183105, + "learning_rate": 4.151369259264627e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8414986193180084, + "num_tokens": 144047105.0, + "step": 119800 + }, + { + "entropy": 1.9819932192564012, + "epoch": 0.37140051722040146, + "grad_norm": 9.306598663330078, + "learning_rate": 4.151196006174808e-06, + "loss": 0.5185, + "mean_token_accuracy": 0.845866845548153, + "num_tokens": 144058111.0, + "step": 119810 + }, + { + "entropy": 1.94317606985569, + "epoch": 0.37143151634545113, + "grad_norm": 8.76233196258545, + "learning_rate": 4.151022774774793e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8466348692774772, + "num_tokens": 144068820.0, + "step": 119820 + }, + { + "entropy": 1.8876767694950103, + "epoch": 0.37146251547050085, + "grad_norm": 7.308814525604248, + "learning_rate": 4.150849565060057e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8494908258318901, + "num_tokens": 144080689.0, + "step": 119830 + }, + { + "entropy": 1.8951317608356475, + "epoch": 0.3714935145955505, + "grad_norm": 9.297974586486816, + "learning_rate": 4.1506763770260735e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8519510939717293, + "num_tokens": 144092194.0, + "step": 119840 + }, + { + "entropy": 1.9246593773365022, + "epoch": 0.37152451372060025, + "grad_norm": 7.682300090789795, + "learning_rate": 4.150503210668323e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.84807148873806, + "num_tokens": 144103270.0, + "step": 119850 + }, + { + "entropy": 1.911957061290741, + "epoch": 0.3715555128456499, + "grad_norm": 6.910633087158203, + "learning_rate": 4.150330065982283e-06, + "loss": 0.5411, + "mean_token_accuracy": 0.8236610382795334, + "num_tokens": 144115512.0, + "step": 119860 + }, + { + "entropy": 1.7777829378843308, + "epoch": 0.37158651197069964, + "grad_norm": 8.125778198242188, + "learning_rate": 4.1501569429634335e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8673719853162766, + "num_tokens": 144128091.0, + "step": 119870 + }, + { + "entropy": 1.8605340436100959, + "epoch": 0.3716175110957493, + "grad_norm": 9.269027709960938, + "learning_rate": 4.149983841607256e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8562753826379776, + "num_tokens": 144139595.0, + "step": 119880 + }, + { + "entropy": 1.8629012137651444, + "epoch": 0.37164851022079903, + "grad_norm": 8.08617877960205, + "learning_rate": 4.149810761909232e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8640567243099213, + "num_tokens": 144151997.0, + "step": 119890 + }, + { + "entropy": 1.9436404347419738, + "epoch": 0.3716795093458487, + "grad_norm": 9.821636199951172, + "learning_rate": 4.149637703864848e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.848535805940628, + "num_tokens": 144162535.0, + "step": 119900 + }, + { + "entropy": 1.890118558704853, + "epoch": 0.3717105084708984, + "grad_norm": 8.111401557922363, + "learning_rate": 4.149464667469588e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.846766683459282, + "num_tokens": 144174680.0, + "step": 119910 + }, + { + "entropy": 1.8793727621436118, + "epoch": 0.3717415075959481, + "grad_norm": 9.598828315734863, + "learning_rate": 4.149291652718937e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8412741586565972, + "num_tokens": 144186534.0, + "step": 119920 + }, + { + "entropy": 1.8049995869398117, + "epoch": 0.3717725067209978, + "grad_norm": 4.190101623535156, + "learning_rate": 4.149118659608385e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8464117422699928, + "num_tokens": 144200420.0, + "step": 119930 + }, + { + "entropy": 1.9019189208745957, + "epoch": 0.3718035058460475, + "grad_norm": 9.553948402404785, + "learning_rate": 4.14894568813342e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8418327271938324, + "num_tokens": 144212107.0, + "step": 119940 + }, + { + "entropy": 1.9129536800086497, + "epoch": 0.3718345049710972, + "grad_norm": 8.531197547912598, + "learning_rate": 4.1487727382895345e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8501315474510193, + "num_tokens": 144223594.0, + "step": 119950 + }, + { + "entropy": 1.896179696917534, + "epoch": 0.3718655040961469, + "grad_norm": 8.905537605285645, + "learning_rate": 4.148599810072218e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.8426758080720902, + "num_tokens": 144235324.0, + "step": 119960 + }, + { + "entropy": 1.9160192415118218, + "epoch": 0.3718965032211966, + "grad_norm": 9.644220352172852, + "learning_rate": 4.148426903476966e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8502436727285385, + "num_tokens": 144246974.0, + "step": 119970 + }, + { + "entropy": 1.8606981292366982, + "epoch": 0.3719275023462463, + "grad_norm": 3.5319061279296875, + "learning_rate": 4.148254018499271e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8643532186746598, + "num_tokens": 144259328.0, + "step": 119980 + }, + { + "entropy": 1.8820769846439362, + "epoch": 0.371958501471296, + "grad_norm": 10.861024856567383, + "learning_rate": 4.1480811551346305e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8372126758098603, + "num_tokens": 144270870.0, + "step": 119990 + }, + { + "entropy": 1.8569459572434426, + "epoch": 0.37198950059634567, + "grad_norm": 8.557576179504395, + "learning_rate": 4.14790831337854e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8561231017112731, + "num_tokens": 144282985.0, + "step": 120000 + }, + { + "entropy": 1.8007396288216113, + "epoch": 0.37202049972139534, + "grad_norm": 8.916121482849121, + "learning_rate": 4.147735493226499e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8646645426750184, + "num_tokens": 144296275.0, + "step": 120010 + }, + { + "entropy": 1.8762442633509635, + "epoch": 0.37205149884644506, + "grad_norm": 10.017187118530273, + "learning_rate": 4.147562694674007e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8562163576483727, + "num_tokens": 144307269.0, + "step": 120020 + }, + { + "entropy": 1.945700818300247, + "epoch": 0.37208249797149473, + "grad_norm": 9.23812198638916, + "learning_rate": 4.147389917716566e-06, + "loss": 0.5345, + "mean_token_accuracy": 0.840525084733963, + "num_tokens": 144318155.0, + "step": 120030 + }, + { + "entropy": 1.9071541115641595, + "epoch": 0.37211349709654445, + "grad_norm": 6.633033752441406, + "learning_rate": 4.1472171623496765e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8516213580965996, + "num_tokens": 144330070.0, + "step": 120040 + }, + { + "entropy": 1.8605321899056435, + "epoch": 0.3721444962215941, + "grad_norm": 7.755608081817627, + "learning_rate": 4.147044428568844e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8484739094972611, + "num_tokens": 144342082.0, + "step": 120050 + }, + { + "entropy": 1.8393002033233643, + "epoch": 0.37217549534664385, + "grad_norm": 3.898113489151001, + "learning_rate": 4.146871716369573e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8561500370502472, + "num_tokens": 144353537.0, + "step": 120060 + }, + { + "entropy": 1.9425443708896637, + "epoch": 0.3722064944716935, + "grad_norm": 7.680639743804932, + "learning_rate": 4.146699025747368e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8522611662745476, + "num_tokens": 144364505.0, + "step": 120070 + }, + { + "entropy": 1.8757713958621025, + "epoch": 0.37223749359674324, + "grad_norm": 8.204666137695312, + "learning_rate": 4.14652635669774e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8506177857518196, + "num_tokens": 144375941.0, + "step": 120080 + }, + { + "entropy": 1.9103585094213487, + "epoch": 0.3722684927217929, + "grad_norm": 9.457573890686035, + "learning_rate": 4.146353709216196e-06, + "loss": 0.5536, + "mean_token_accuracy": 0.8371702343225479, + "num_tokens": 144386996.0, + "step": 120090 + }, + { + "entropy": 1.7752160847187042, + "epoch": 0.37229949184684263, + "grad_norm": 9.947811126708984, + "learning_rate": 4.146181083298246e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8600396499037742, + "num_tokens": 144400645.0, + "step": 120100 + }, + { + "entropy": 1.8245454356074333, + "epoch": 0.3723304909718923, + "grad_norm": 10.876919746398926, + "learning_rate": 4.146008478939402e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8561910197138787, + "num_tokens": 144412719.0, + "step": 120110 + }, + { + "entropy": 1.8593166559934615, + "epoch": 0.372361490096942, + "grad_norm": 7.8572587966918945, + "learning_rate": 4.145835896135177e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8545159190893173, + "num_tokens": 144424962.0, + "step": 120120 + }, + { + "entropy": 1.9504047334194183, + "epoch": 0.3723924892219917, + "grad_norm": 8.473873138427734, + "learning_rate": 4.1456633348810855e-06, + "loss": 0.5375, + "mean_token_accuracy": 0.8311512500047684, + "num_tokens": 144436324.0, + "step": 120130 + }, + { + "entropy": 1.8700911119580268, + "epoch": 0.3724234883470414, + "grad_norm": 4.052376747131348, + "learning_rate": 4.145490795172642e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8466508388519287, + "num_tokens": 144447723.0, + "step": 120140 + }, + { + "entropy": 1.8484340474009513, + "epoch": 0.3724544874720911, + "grad_norm": 3.679072141647339, + "learning_rate": 4.145318277005364e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8553288698196411, + "num_tokens": 144459623.0, + "step": 120150 + }, + { + "entropy": 1.8352990061044694, + "epoch": 0.3724854865971408, + "grad_norm": 10.292163848876953, + "learning_rate": 4.1451457803747705e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8461218729615212, + "num_tokens": 144471544.0, + "step": 120160 + }, + { + "entropy": 1.9243695348501206, + "epoch": 0.3725164857221905, + "grad_norm": 8.172408103942871, + "learning_rate": 4.1449733052763785e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8539075449109077, + "num_tokens": 144482528.0, + "step": 120170 + }, + { + "entropy": 1.8598498031497002, + "epoch": 0.3725474848472402, + "grad_norm": 7.883946895599365, + "learning_rate": 4.144800851705711e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8527406945824623, + "num_tokens": 144493717.0, + "step": 120180 + }, + { + "entropy": 1.8283675089478493, + "epoch": 0.3725784839722899, + "grad_norm": 8.100120544433594, + "learning_rate": 4.1446284196582885e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8576804473996162, + "num_tokens": 144505962.0, + "step": 120190 + }, + { + "entropy": 1.910760723054409, + "epoch": 0.3726094830973396, + "grad_norm": 4.125510215759277, + "learning_rate": 4.144456009129636e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.846273559331894, + "num_tokens": 144517019.0, + "step": 120200 + }, + { + "entropy": 1.83195867985487, + "epoch": 0.37264048222238927, + "grad_norm": 7.632051944732666, + "learning_rate": 4.144283620115277e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8484606102108956, + "num_tokens": 144529928.0, + "step": 120210 + }, + { + "entropy": 1.913992887735367, + "epoch": 0.372671481347439, + "grad_norm": 8.024348258972168, + "learning_rate": 4.144111252610736e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.856801763176918, + "num_tokens": 144541124.0, + "step": 120220 + }, + { + "entropy": 1.7974503114819527, + "epoch": 0.37270248047248866, + "grad_norm": 4.225975036621094, + "learning_rate": 4.143938906611542e-06, + "loss": 0.4, + "mean_token_accuracy": 0.852984607219696, + "num_tokens": 144554372.0, + "step": 120230 + }, + { + "entropy": 1.8398534119129182, + "epoch": 0.3727334795975384, + "grad_norm": 7.547396183013916, + "learning_rate": 4.143766582113225e-06, + "loss": 0.45, + "mean_token_accuracy": 0.848320834338665, + "num_tokens": 144567059.0, + "step": 120240 + }, + { + "entropy": 1.79450566470623, + "epoch": 0.37276447872258806, + "grad_norm": 7.509959697723389, + "learning_rate": 4.143594279111312e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8508245050907135, + "num_tokens": 144579668.0, + "step": 120250 + }, + { + "entropy": 1.9366710186004639, + "epoch": 0.3727954778476377, + "grad_norm": 7.8771209716796875, + "learning_rate": 4.143421997601335e-06, + "loss": 0.5205, + "mean_token_accuracy": 0.8401233479380608, + "num_tokens": 144590869.0, + "step": 120260 + }, + { + "entropy": 1.7975213006138802, + "epoch": 0.37282647697268745, + "grad_norm": 10.612153053283691, + "learning_rate": 4.1432497375788275e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8648232862353324, + "num_tokens": 144603350.0, + "step": 120270 + }, + { + "entropy": 1.882333090901375, + "epoch": 0.3728574760977371, + "grad_norm": 9.56601619720459, + "learning_rate": 4.143077499039321e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8536757841706276, + "num_tokens": 144614705.0, + "step": 120280 + }, + { + "entropy": 1.8980722784996034, + "epoch": 0.37288847522278684, + "grad_norm": 8.230175018310547, + "learning_rate": 4.142905281978353e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8509363621473313, + "num_tokens": 144625834.0, + "step": 120290 + }, + { + "entropy": 1.881382980942726, + "epoch": 0.3729194743478365, + "grad_norm": 8.221864700317383, + "learning_rate": 4.14273308639146e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8483862280845642, + "num_tokens": 144636679.0, + "step": 120300 + }, + { + "entropy": 1.879370318353176, + "epoch": 0.37295047347288623, + "grad_norm": 7.582620620727539, + "learning_rate": 4.142560912274176e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8436816439032555, + "num_tokens": 144648082.0, + "step": 120310 + }, + { + "entropy": 1.8741892874240875, + "epoch": 0.3729814725979359, + "grad_norm": 8.145028114318848, + "learning_rate": 4.142388759622044e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8376132413744927, + "num_tokens": 144659565.0, + "step": 120320 + }, + { + "entropy": 1.8751545161008836, + "epoch": 0.37301247172298563, + "grad_norm": 8.928181648254395, + "learning_rate": 4.1422166284306016e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8437012255191803, + "num_tokens": 144670975.0, + "step": 120330 + }, + { + "entropy": 1.8685587406158448, + "epoch": 0.3730434708480353, + "grad_norm": 9.061539649963379, + "learning_rate": 4.142044518695391e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.852770148217678, + "num_tokens": 144682788.0, + "step": 120340 + }, + { + "entropy": 1.7655042618513108, + "epoch": 0.373074469973085, + "grad_norm": 6.830787181854248, + "learning_rate": 4.141872430411956e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8638226523995399, + "num_tokens": 144695970.0, + "step": 120350 + }, + { + "entropy": 1.8461510226130486, + "epoch": 0.3731054690981347, + "grad_norm": 6.8738298416137695, + "learning_rate": 4.14170036357584e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8553184866905212, + "num_tokens": 144707528.0, + "step": 120360 + }, + { + "entropy": 1.903184102475643, + "epoch": 0.3731364682231844, + "grad_norm": 7.4232025146484375, + "learning_rate": 4.1415283181825895e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8506990998983384, + "num_tokens": 144718816.0, + "step": 120370 + }, + { + "entropy": 1.9077295437455177, + "epoch": 0.3731674673482341, + "grad_norm": 8.14012622833252, + "learning_rate": 4.1413562942277484e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.8307053163647652, + "num_tokens": 144730281.0, + "step": 120380 + }, + { + "entropy": 1.8349354296922684, + "epoch": 0.3731984664732838, + "grad_norm": 8.339512825012207, + "learning_rate": 4.141184291706868e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8565431639552117, + "num_tokens": 144742205.0, + "step": 120390 + }, + { + "entropy": 1.8753593668341637, + "epoch": 0.3732294655983335, + "grad_norm": 9.116205215454102, + "learning_rate": 4.141012310615495e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8513768821954727, + "num_tokens": 144753297.0, + "step": 120400 + }, + { + "entropy": 1.8986528918147088, + "epoch": 0.3732604647233832, + "grad_norm": 7.670609951019287, + "learning_rate": 4.140840350949181e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8434357807040215, + "num_tokens": 144764661.0, + "step": 120410 + }, + { + "entropy": 1.7499748080968858, + "epoch": 0.37329146384843287, + "grad_norm": 8.888646125793457, + "learning_rate": 4.140668412703478e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.862440787255764, + "num_tokens": 144778571.0, + "step": 120420 + }, + { + "entropy": 1.76868616938591, + "epoch": 0.3733224629734826, + "grad_norm": 9.065146446228027, + "learning_rate": 4.14049649587394e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8667063027620315, + "num_tokens": 144791265.0, + "step": 120430 + }, + { + "entropy": 1.8670168176293374, + "epoch": 0.37335346209853226, + "grad_norm": 12.608692169189453, + "learning_rate": 4.140324600456119e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8426472395658493, + "num_tokens": 144802626.0, + "step": 120440 + }, + { + "entropy": 1.7094080224633217, + "epoch": 0.373384461223582, + "grad_norm": 9.83113956451416, + "learning_rate": 4.140152726445574e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8663820832967758, + "num_tokens": 144816924.0, + "step": 120450 + }, + { + "entropy": 1.88383856266737, + "epoch": 0.37341546034863166, + "grad_norm": 8.397429466247559, + "learning_rate": 4.1399808738378585e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8410907715559006, + "num_tokens": 144829203.0, + "step": 120460 + }, + { + "entropy": 1.768805581331253, + "epoch": 0.3734464594736814, + "grad_norm": 4.13175106048584, + "learning_rate": 4.139809042628535e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8638075411319732, + "num_tokens": 144841621.0, + "step": 120470 + }, + { + "entropy": 1.869265778362751, + "epoch": 0.37347745859873105, + "grad_norm": 10.438434600830078, + "learning_rate": 4.139637232813159e-06, + "loss": 0.5482, + "mean_token_accuracy": 0.830617117881775, + "num_tokens": 144853808.0, + "step": 120480 + }, + { + "entropy": 1.8952225580811501, + "epoch": 0.3735084577237808, + "grad_norm": 7.919762134552002, + "learning_rate": 4.139465444387294e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.851693132519722, + "num_tokens": 144865079.0, + "step": 120490 + }, + { + "entropy": 1.7868720442056656, + "epoch": 0.37353945684883044, + "grad_norm": 8.96243953704834, + "learning_rate": 4.139293677346502e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8544852897524834, + "num_tokens": 144878075.0, + "step": 120500 + }, + { + "entropy": 1.8284976735711098, + "epoch": 0.3735704559738801, + "grad_norm": 8.363344192504883, + "learning_rate": 4.139121931686345e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.85543752014637, + "num_tokens": 144890326.0, + "step": 120510 + }, + { + "entropy": 1.870447462797165, + "epoch": 0.37360145509892984, + "grad_norm": 7.410390853881836, + "learning_rate": 4.13895020740239e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8517989814281464, + "num_tokens": 144902157.0, + "step": 120520 + }, + { + "entropy": 1.800754214823246, + "epoch": 0.3736324542239795, + "grad_norm": 4.441434383392334, + "learning_rate": 4.138778504490201e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8548402667045594, + "num_tokens": 144915102.0, + "step": 120530 + }, + { + "entropy": 1.8087909892201424, + "epoch": 0.37366345334902923, + "grad_norm": 7.617093086242676, + "learning_rate": 4.138606822945347e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8564250499010087, + "num_tokens": 144927518.0, + "step": 120540 + }, + { + "entropy": 1.8769372016191483, + "epoch": 0.3736944524740789, + "grad_norm": 8.41362476348877, + "learning_rate": 4.138435162763395e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8450376495718956, + "num_tokens": 144939489.0, + "step": 120550 + }, + { + "entropy": 1.8020145073533058, + "epoch": 0.3737254515991286, + "grad_norm": 9.323637008666992, + "learning_rate": 4.138263523939916e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8448305040597915, + "num_tokens": 144952661.0, + "step": 120560 + }, + { + "entropy": 1.84602689743042, + "epoch": 0.3737564507241783, + "grad_norm": 8.894927024841309, + "learning_rate": 4.138091906470481e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8503866240382194, + "num_tokens": 144964533.0, + "step": 120570 + }, + { + "entropy": 1.7196414768695831, + "epoch": 0.373787449849228, + "grad_norm": 4.194610118865967, + "learning_rate": 4.137920310350664e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8610586509108543, + "num_tokens": 144978621.0, + "step": 120580 + }, + { + "entropy": 1.8428490251302718, + "epoch": 0.3738184489742777, + "grad_norm": 4.064561367034912, + "learning_rate": 4.137748735576036e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8580682083964348, + "num_tokens": 144990822.0, + "step": 120590 + }, + { + "entropy": 1.8954029515385629, + "epoch": 0.3738494480993274, + "grad_norm": 8.305089950561523, + "learning_rate": 4.137577182142173e-06, + "loss": 0.511, + "mean_token_accuracy": 0.8431953400373459, + "num_tokens": 145002259.0, + "step": 120600 + }, + { + "entropy": 1.7457600995898246, + "epoch": 0.3738804472243771, + "grad_norm": 9.313241958618164, + "learning_rate": 4.137405650044653e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8654039755463601, + "num_tokens": 145016219.0, + "step": 120610 + }, + { + "entropy": 1.8425143510103226, + "epoch": 0.3739114463494268, + "grad_norm": 7.391794204711914, + "learning_rate": 4.137234139279052e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8488540187478065, + "num_tokens": 145028857.0, + "step": 120620 + }, + { + "entropy": 1.7952088013291359, + "epoch": 0.37394244547447647, + "grad_norm": 9.129136085510254, + "learning_rate": 4.13706264984095e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8728530630469322, + "num_tokens": 145041106.0, + "step": 120630 + }, + { + "entropy": 1.9029112622141837, + "epoch": 0.3739734445995262, + "grad_norm": 7.712813377380371, + "learning_rate": 4.136891181725925e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8385331735014916, + "num_tokens": 145052581.0, + "step": 120640 + }, + { + "entropy": 1.8401472687721252, + "epoch": 0.37400444372457586, + "grad_norm": 9.776946067810059, + "learning_rate": 4.136719734929561e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8453155905008316, + "num_tokens": 145064652.0, + "step": 120650 + }, + { + "entropy": 1.815567010641098, + "epoch": 0.3740354428496256, + "grad_norm": 9.336750984191895, + "learning_rate": 4.136548309447441e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8552684485912323, + "num_tokens": 145077064.0, + "step": 120660 + }, + { + "entropy": 1.8268222376704215, + "epoch": 0.37406644197467526, + "grad_norm": 8.70843505859375, + "learning_rate": 4.136376905275147e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8503636911511421, + "num_tokens": 145089585.0, + "step": 120670 + }, + { + "entropy": 1.8132446378469467, + "epoch": 0.374097441099725, + "grad_norm": 8.954428672790527, + "learning_rate": 4.1362055224082654e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8630944639444351, + "num_tokens": 145101852.0, + "step": 120680 + }, + { + "entropy": 1.8896322906017304, + "epoch": 0.37412844022477465, + "grad_norm": 6.831114768981934, + "learning_rate": 4.136034160842382e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.84442989975214, + "num_tokens": 145113578.0, + "step": 120690 + }, + { + "entropy": 1.8678228601813316, + "epoch": 0.3741594393498244, + "grad_norm": 11.830345153808594, + "learning_rate": 4.135862820573086e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8500665947794914, + "num_tokens": 145125041.0, + "step": 120700 + }, + { + "entropy": 1.7717063777148723, + "epoch": 0.37419043847487404, + "grad_norm": 2.549680233001709, + "learning_rate": 4.135691501595966e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8615061908960342, + "num_tokens": 145138854.0, + "step": 120710 + }, + { + "entropy": 1.8354929715394974, + "epoch": 0.37422143759992377, + "grad_norm": 3.5742509365081787, + "learning_rate": 4.1355202039066126e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8558676704764366, + "num_tokens": 145150497.0, + "step": 120720 + }, + { + "entropy": 1.8448586538434029, + "epoch": 0.37425243672497344, + "grad_norm": 8.689122200012207, + "learning_rate": 4.135348927500618e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8484915763139724, + "num_tokens": 145162443.0, + "step": 120730 + }, + { + "entropy": 1.8981183648109436, + "epoch": 0.37428343585002316, + "grad_norm": 8.72005844116211, + "learning_rate": 4.135177672373573e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8429804682731629, + "num_tokens": 145174166.0, + "step": 120740 + }, + { + "entropy": 1.9157382816076278, + "epoch": 0.37431443497507283, + "grad_norm": 3.737830400466919, + "learning_rate": 4.135006438521074e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8430665746331215, + "num_tokens": 145185172.0, + "step": 120750 + }, + { + "entropy": 1.9020906642079354, + "epoch": 0.3743454341001225, + "grad_norm": 7.51852560043335, + "learning_rate": 4.134835225938717e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8433557212352752, + "num_tokens": 145196378.0, + "step": 120760 + }, + { + "entropy": 1.8677937388420105, + "epoch": 0.3743764332251722, + "grad_norm": 3.965975284576416, + "learning_rate": 4.134664034622098e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8616374880075455, + "num_tokens": 145207527.0, + "step": 120770 + }, + { + "entropy": 1.8505346715450286, + "epoch": 0.3744074323502219, + "grad_norm": 4.419517993927002, + "learning_rate": 4.134492864566814e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8657907411456108, + "num_tokens": 145219338.0, + "step": 120780 + }, + { + "entropy": 1.7792392835021018, + "epoch": 0.3744384314752716, + "grad_norm": 4.039781093597412, + "learning_rate": 4.134321715768466e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8500527799129486, + "num_tokens": 145232885.0, + "step": 120790 + }, + { + "entropy": 1.870184451341629, + "epoch": 0.3744694306003213, + "grad_norm": 3.6062183380126953, + "learning_rate": 4.134150588222653e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8536598861217499, + "num_tokens": 145244298.0, + "step": 120800 + }, + { + "entropy": 1.7577032819390297, + "epoch": 0.374500429725371, + "grad_norm": 3.414278030395508, + "learning_rate": 4.13397948192498e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8604967936873436, + "num_tokens": 145257392.0, + "step": 120810 + }, + { + "entropy": 1.8007031008601189, + "epoch": 0.3745314288504207, + "grad_norm": 4.170672416687012, + "learning_rate": 4.133808396871046e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8616224005818367, + "num_tokens": 145270196.0, + "step": 120820 + }, + { + "entropy": 1.9025542974472045, + "epoch": 0.3745624279754704, + "grad_norm": 7.770366191864014, + "learning_rate": 4.133637333056459e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8512863472104073, + "num_tokens": 145281494.0, + "step": 120830 + }, + { + "entropy": 1.8452581293880939, + "epoch": 0.37459342710052007, + "grad_norm": 8.3046293258667, + "learning_rate": 4.1334662904768234e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8516708582639694, + "num_tokens": 145294224.0, + "step": 120840 + }, + { + "entropy": 1.7171327024698257, + "epoch": 0.3746244262255698, + "grad_norm": 3.3472342491149902, + "learning_rate": 4.1332952691277465e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8607262581586838, + "num_tokens": 145308338.0, + "step": 120850 + }, + { + "entropy": 1.9564589619636537, + "epoch": 0.37465542535061946, + "grad_norm": 7.92958402633667, + "learning_rate": 4.133124269004837e-06, + "loss": 0.5199, + "mean_token_accuracy": 0.8395091354846954, + "num_tokens": 145319683.0, + "step": 120860 + }, + { + "entropy": 1.8489731594920158, + "epoch": 0.3746864244756692, + "grad_norm": 8.664722442626953, + "learning_rate": 4.132953290103704e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8534695088863373, + "num_tokens": 145332281.0, + "step": 120870 + }, + { + "entropy": 1.8806229501962661, + "epoch": 0.37471742360071886, + "grad_norm": 9.805295944213867, + "learning_rate": 4.132782332419957e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8593287214636802, + "num_tokens": 145343989.0, + "step": 120880 + }, + { + "entropy": 1.8413976445794105, + "epoch": 0.3747484227257686, + "grad_norm": 3.8515052795410156, + "learning_rate": 4.13261139594921e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8467993855476379, + "num_tokens": 145356520.0, + "step": 120890 + }, + { + "entropy": 1.9287862733006478, + "epoch": 0.37477942185081825, + "grad_norm": 8.918717384338379, + "learning_rate": 4.132440480687076e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.854530693590641, + "num_tokens": 145367592.0, + "step": 120900 + }, + { + "entropy": 1.909818661212921, + "epoch": 0.374810420975868, + "grad_norm": 11.631319999694824, + "learning_rate": 4.13226958662917e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8506747603416442, + "num_tokens": 145378561.0, + "step": 120910 + }, + { + "entropy": 1.8912340737879276, + "epoch": 0.37484142010091764, + "grad_norm": 9.222434997558594, + "learning_rate": 4.132098713771106e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8446800738573075, + "num_tokens": 145390493.0, + "step": 120920 + }, + { + "entropy": 1.891737161576748, + "epoch": 0.37487241922596737, + "grad_norm": 9.400554656982422, + "learning_rate": 4.131927862108504e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8476646468043327, + "num_tokens": 145401638.0, + "step": 120930 + }, + { + "entropy": 1.8229492217302323, + "epoch": 0.37490341835101704, + "grad_norm": 8.848627090454102, + "learning_rate": 4.13175703163698e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8519455164670944, + "num_tokens": 145414244.0, + "step": 120940 + }, + { + "entropy": 1.8781214132905006, + "epoch": 0.37493441747606676, + "grad_norm": 9.196588516235352, + "learning_rate": 4.131586222352156e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8479282841086387, + "num_tokens": 145425934.0, + "step": 120950 + }, + { + "entropy": 1.8320988327264787, + "epoch": 0.37496541660111643, + "grad_norm": 8.844073295593262, + "learning_rate": 4.131415434249652e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8547179445624351, + "num_tokens": 145438172.0, + "step": 120960 + }, + { + "entropy": 1.8629610985517502, + "epoch": 0.37499641572616615, + "grad_norm": 4.892116546630859, + "learning_rate": 4.131244667325089e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8539141744375229, + "num_tokens": 145449705.0, + "step": 120970 + }, + { + "entropy": 1.8123175263404847, + "epoch": 0.3750274148512158, + "grad_norm": 3.7633779048919678, + "learning_rate": 4.131073921574093e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8505129277706146, + "num_tokens": 145463148.0, + "step": 120980 + }, + { + "entropy": 1.9299910217523575, + "epoch": 0.3750584139762655, + "grad_norm": 10.863778114318848, + "learning_rate": 4.130903196992287e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.8390725657343865, + "num_tokens": 145474645.0, + "step": 120990 + }, + { + "entropy": 1.8196517676115036, + "epoch": 0.3750894131013152, + "grad_norm": 4.3333210945129395, + "learning_rate": 4.1307324935752964e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8509810373187066, + "num_tokens": 145487342.0, + "step": 121000 + }, + { + "entropy": 1.8983831241726876, + "epoch": 0.3751204122263649, + "grad_norm": 3.389133930206299, + "learning_rate": 4.130561811318752e-06, + "loss": 0.5603, + "mean_token_accuracy": 0.839329156279564, + "num_tokens": 145499254.0, + "step": 121010 + }, + { + "entropy": 1.7906238347291947, + "epoch": 0.3751514113514146, + "grad_norm": 8.32974624633789, + "learning_rate": 4.1303911502182784e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8561747640371322, + "num_tokens": 145511933.0, + "step": 121020 + }, + { + "entropy": 1.943038222193718, + "epoch": 0.3751824104764643, + "grad_norm": 8.438972473144531, + "learning_rate": 4.130220510269507e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8507806181907653, + "num_tokens": 145522873.0, + "step": 121030 + }, + { + "entropy": 1.9753375679254532, + "epoch": 0.375213409601514, + "grad_norm": 6.82116174697876, + "learning_rate": 4.1300498914680705e-06, + "loss": 0.5384, + "mean_token_accuracy": 0.8418274581432342, + "num_tokens": 145533983.0, + "step": 121040 + }, + { + "entropy": 1.8580348297953606, + "epoch": 0.37524440872656367, + "grad_norm": 9.025100708007812, + "learning_rate": 4.129879293809599e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.840544268488884, + "num_tokens": 145546293.0, + "step": 121050 + }, + { + "entropy": 1.8665331095457076, + "epoch": 0.3752754078516134, + "grad_norm": 4.244259834289551, + "learning_rate": 4.129708717289727e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.856557646393776, + "num_tokens": 145557776.0, + "step": 121060 + }, + { + "entropy": 1.8694847792387008, + "epoch": 0.37530640697666307, + "grad_norm": 8.377169609069824, + "learning_rate": 4.12953816190409e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.857828502357006, + "num_tokens": 145569154.0, + "step": 121070 + }, + { + "entropy": 1.7808043763041497, + "epoch": 0.3753374061017128, + "grad_norm": 4.1414570808410645, + "learning_rate": 4.1293676276483235e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8570607796311378, + "num_tokens": 145583082.0, + "step": 121080 + }, + { + "entropy": 1.9220749616622925, + "epoch": 0.37536840522676246, + "grad_norm": 8.00906753540039, + "learning_rate": 4.1291971145180645e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8506890282034874, + "num_tokens": 145594524.0, + "step": 121090 + }, + { + "entropy": 1.9267291516065597, + "epoch": 0.3753994043518122, + "grad_norm": 8.382123947143555, + "learning_rate": 4.129026622508953e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8558269619941712, + "num_tokens": 145605078.0, + "step": 121100 + }, + { + "entropy": 1.7520041272044182, + "epoch": 0.37543040347686185, + "grad_norm": 8.029170989990234, + "learning_rate": 4.128856151616628e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8598437607288361, + "num_tokens": 145618913.0, + "step": 121110 + }, + { + "entropy": 1.8347510501742363, + "epoch": 0.3754614026019116, + "grad_norm": 9.550764083862305, + "learning_rate": 4.128685701836731e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8477082312107086, + "num_tokens": 145631433.0, + "step": 121120 + }, + { + "entropy": 1.8798089861869811, + "epoch": 0.37549240172696124, + "grad_norm": 8.3656005859375, + "learning_rate": 4.128515273164905e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8539829984307289, + "num_tokens": 145642408.0, + "step": 121130 + }, + { + "entropy": 1.7175431214272976, + "epoch": 0.37552340085201097, + "grad_norm": 8.398488998413086, + "learning_rate": 4.128344865596793e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8567556262016296, + "num_tokens": 145656156.0, + "step": 121140 + }, + { + "entropy": 1.7876494243741035, + "epoch": 0.37555439997706064, + "grad_norm": 9.021903038024902, + "learning_rate": 4.12817447912804e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8566848263144493, + "num_tokens": 145667977.0, + "step": 121150 + }, + { + "entropy": 1.7991645127534865, + "epoch": 0.37558539910211036, + "grad_norm": 8.89357852935791, + "learning_rate": 4.128004113754292e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8592987477779388, + "num_tokens": 145680773.0, + "step": 121160 + }, + { + "entropy": 1.8806771039962769, + "epoch": 0.37561639822716003, + "grad_norm": 7.232542514801025, + "learning_rate": 4.127833769471198e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.8432414412498475, + "num_tokens": 145692228.0, + "step": 121170 + }, + { + "entropy": 1.8988723203539848, + "epoch": 0.37564739735220976, + "grad_norm": 9.036412239074707, + "learning_rate": 4.127663446274406e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8560154557228088, + "num_tokens": 145703505.0, + "step": 121180 + }, + { + "entropy": 1.876578164100647, + "epoch": 0.3756783964772594, + "grad_norm": 8.154003143310547, + "learning_rate": 4.1274931441595645e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8526700213551521, + "num_tokens": 145714806.0, + "step": 121190 + }, + { + "entropy": 1.831437037885189, + "epoch": 0.37570939560230915, + "grad_norm": 8.418209075927734, + "learning_rate": 4.1273228631223275e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8543977767229081, + "num_tokens": 145726566.0, + "step": 121200 + }, + { + "entropy": 1.7670956924557686, + "epoch": 0.3757403947273588, + "grad_norm": 8.465004920959473, + "learning_rate": 4.1271526031583445e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8540250107645988, + "num_tokens": 145739210.0, + "step": 121210 + }, + { + "entropy": 1.7944332778453826, + "epoch": 0.37577139385240854, + "grad_norm": 8.313122749328613, + "learning_rate": 4.126982364263272e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8499791830778122, + "num_tokens": 145751513.0, + "step": 121220 + }, + { + "entropy": 1.8116312876343728, + "epoch": 0.3758023929774582, + "grad_norm": 3.7453999519348145, + "learning_rate": 4.126812146432764e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8603421568870544, + "num_tokens": 145763916.0, + "step": 121230 + }, + { + "entropy": 1.9280547127127647, + "epoch": 0.3758333921025079, + "grad_norm": 8.784612655639648, + "learning_rate": 4.126641949662477e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8478071898221969, + "num_tokens": 145774772.0, + "step": 121240 + }, + { + "entropy": 1.9599653363227845, + "epoch": 0.3758643912275576, + "grad_norm": 9.786565780639648, + "learning_rate": 4.126471773948068e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8443946361541748, + "num_tokens": 145785993.0, + "step": 121250 + }, + { + "entropy": 1.8358087345957756, + "epoch": 0.3758953903526073, + "grad_norm": 8.388903617858887, + "learning_rate": 4.126301619285196e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8493312045931816, + "num_tokens": 145798151.0, + "step": 121260 + }, + { + "entropy": 1.8409698456525803, + "epoch": 0.375926389477657, + "grad_norm": 9.696656227111816, + "learning_rate": 4.126131485669522e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8447024121880531, + "num_tokens": 145810340.0, + "step": 121270 + }, + { + "entropy": 1.9255991280078888, + "epoch": 0.37595738860270667, + "grad_norm": 9.015790939331055, + "learning_rate": 4.125961373096706e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.8415984019637108, + "num_tokens": 145821505.0, + "step": 121280 + }, + { + "entropy": 1.9274761855602265, + "epoch": 0.3759883877277564, + "grad_norm": 7.261139392852783, + "learning_rate": 4.125791281562412e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.853644534945488, + "num_tokens": 145832550.0, + "step": 121290 + }, + { + "entropy": 1.838864254951477, + "epoch": 0.37601938685280606, + "grad_norm": 4.556879043579102, + "learning_rate": 4.125621211062301e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8556353718042373, + "num_tokens": 145844415.0, + "step": 121300 + }, + { + "entropy": 1.9384997293353081, + "epoch": 0.3760503859778558, + "grad_norm": 6.320328712463379, + "learning_rate": 4.125451161592041e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.8399920925498009, + "num_tokens": 145855907.0, + "step": 121310 + }, + { + "entropy": 1.8460743427276611, + "epoch": 0.37608138510290545, + "grad_norm": 8.746512413024902, + "learning_rate": 4.125281133147298e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.865053367614746, + "num_tokens": 145867251.0, + "step": 121320 + }, + { + "entropy": 1.8502772450447083, + "epoch": 0.3761123842279552, + "grad_norm": 4.59258508682251, + "learning_rate": 4.125111125723738e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8603545278310776, + "num_tokens": 145879356.0, + "step": 121330 + }, + { + "entropy": 1.8849507763981819, + "epoch": 0.37614338335300485, + "grad_norm": 7.458291530609131, + "learning_rate": 4.124941139317031e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8583846285939216, + "num_tokens": 145891689.0, + "step": 121340 + }, + { + "entropy": 1.8615009844303132, + "epoch": 0.37617438247805457, + "grad_norm": 8.805126190185547, + "learning_rate": 4.124771173922848e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8532016187906265, + "num_tokens": 145903559.0, + "step": 121350 + }, + { + "entropy": 1.81989781036973, + "epoch": 0.37620538160310424, + "grad_norm": 7.289465427398682, + "learning_rate": 4.1246012295368575e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8597825333476067, + "num_tokens": 145916356.0, + "step": 121360 + }, + { + "entropy": 1.8147448897361755, + "epoch": 0.37623638072815396, + "grad_norm": 7.9857497215271, + "learning_rate": 4.124431306154735e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8589125216007233, + "num_tokens": 145928998.0, + "step": 121370 + }, + { + "entropy": 1.9180129885673523, + "epoch": 0.37626737985320363, + "grad_norm": 9.49499797821045, + "learning_rate": 4.124261403772152e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8353124216198922, + "num_tokens": 145940800.0, + "step": 121380 + }, + { + "entropy": 1.920377929508686, + "epoch": 0.37629837897825336, + "grad_norm": 9.39726448059082, + "learning_rate": 4.124091522384785e-06, + "loss": 0.5386, + "mean_token_accuracy": 0.834184144437313, + "num_tokens": 145952846.0, + "step": 121390 + }, + { + "entropy": 1.9227221325039863, + "epoch": 0.376329378103303, + "grad_norm": 8.036456108093262, + "learning_rate": 4.1239216619883106e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.85260841101408, + "num_tokens": 145964633.0, + "step": 121400 + }, + { + "entropy": 1.880432690680027, + "epoch": 0.37636037722835275, + "grad_norm": 9.900726318359375, + "learning_rate": 4.123751822578405e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8541592597961426, + "num_tokens": 145976465.0, + "step": 121410 + }, + { + "entropy": 1.845744264125824, + "epoch": 0.3763913763534024, + "grad_norm": 8.883930206298828, + "learning_rate": 4.123582004150748e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8476935580372811, + "num_tokens": 145988619.0, + "step": 121420 + }, + { + "entropy": 1.8941200897097588, + "epoch": 0.37642237547845214, + "grad_norm": 7.9326910972595215, + "learning_rate": 4.123412206701019e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8494398832321167, + "num_tokens": 145999813.0, + "step": 121430 + }, + { + "entropy": 1.950559636950493, + "epoch": 0.3764533746035018, + "grad_norm": 7.880441665649414, + "learning_rate": 4.1232424302249e-06, + "loss": 0.5265, + "mean_token_accuracy": 0.8358272299170494, + "num_tokens": 146010044.0, + "step": 121440 + }, + { + "entropy": 1.8937064185738564, + "epoch": 0.37648437372855154, + "grad_norm": 7.3297438621521, + "learning_rate": 4.123072674718073e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8427557229995728, + "num_tokens": 146021669.0, + "step": 121450 + }, + { + "entropy": 1.7439885810017586, + "epoch": 0.3765153728536012, + "grad_norm": 2.4287400245666504, + "learning_rate": 4.122902940176222e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8698983415961266, + "num_tokens": 146035437.0, + "step": 121460 + }, + { + "entropy": 1.8418650522828102, + "epoch": 0.37654637197865093, + "grad_norm": 8.513551712036133, + "learning_rate": 4.122733226595032e-06, + "loss": 0.434, + "mean_token_accuracy": 0.856321020424366, + "num_tokens": 146047669.0, + "step": 121470 + }, + { + "entropy": 1.897158458828926, + "epoch": 0.3765773711037006, + "grad_norm": 8.884608268737793, + "learning_rate": 4.122563533970189e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8471438035368919, + "num_tokens": 146058873.0, + "step": 121480 + }, + { + "entropy": 1.8239916279911994, + "epoch": 0.37660837022875027, + "grad_norm": 7.655562877655029, + "learning_rate": 4.122393862297381e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8447648495435714, + "num_tokens": 146071178.0, + "step": 121490 + }, + { + "entropy": 1.8941697224974632, + "epoch": 0.3766393693538, + "grad_norm": 3.823326349258423, + "learning_rate": 4.122224211572297e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8591320559382438, + "num_tokens": 146082489.0, + "step": 121500 + }, + { + "entropy": 1.8416035562753676, + "epoch": 0.37667036847884966, + "grad_norm": 7.697302341461182, + "learning_rate": 4.122054581790626e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8484307572245597, + "num_tokens": 146094870.0, + "step": 121510 + }, + { + "entropy": 1.7722488626837731, + "epoch": 0.3767013676038994, + "grad_norm": 6.725828170776367, + "learning_rate": 4.121884972948061e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8639165312051773, + "num_tokens": 146108574.0, + "step": 121520 + }, + { + "entropy": 1.8973381593823433, + "epoch": 0.37673236672894905, + "grad_norm": 8.68867301940918, + "learning_rate": 4.121715385040292e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8411045700311661, + "num_tokens": 146120800.0, + "step": 121530 + }, + { + "entropy": 1.929002758860588, + "epoch": 0.3767633658539988, + "grad_norm": 7.320843696594238, + "learning_rate": 4.1215458180630136e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8451986953616142, + "num_tokens": 146132050.0, + "step": 121540 + }, + { + "entropy": 1.850935024023056, + "epoch": 0.37679436497904845, + "grad_norm": 3.584969997406006, + "learning_rate": 4.121376272011922e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8525848358869552, + "num_tokens": 146144075.0, + "step": 121550 + }, + { + "entropy": 1.8184596046805381, + "epoch": 0.37682536410409817, + "grad_norm": 8.334314346313477, + "learning_rate": 4.121206746882713e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8496242105960846, + "num_tokens": 146156909.0, + "step": 121560 + }, + { + "entropy": 1.845918568968773, + "epoch": 0.37685636322914784, + "grad_norm": 7.1232805252075195, + "learning_rate": 4.121037242671082e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.8491025045514107, + "num_tokens": 146169187.0, + "step": 121570 + }, + { + "entropy": 1.8747231513261795, + "epoch": 0.37688736235419756, + "grad_norm": 8.403467178344727, + "learning_rate": 4.12086775937273e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8543328627943992, + "num_tokens": 146181009.0, + "step": 121580 + }, + { + "entropy": 1.9061955615878106, + "epoch": 0.37691836147924723, + "grad_norm": 9.928679466247559, + "learning_rate": 4.120698296983356e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8431771412491799, + "num_tokens": 146192524.0, + "step": 121590 + }, + { + "entropy": 1.8643248841166495, + "epoch": 0.37694936060429696, + "grad_norm": 6.862342834472656, + "learning_rate": 4.120528855498663e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8685376390814781, + "num_tokens": 146204768.0, + "step": 121600 + }, + { + "entropy": 1.8592729791998863, + "epoch": 0.3769803597293466, + "grad_norm": 8.456074714660645, + "learning_rate": 4.12035943491435e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8440333098173142, + "num_tokens": 146217180.0, + "step": 121610 + }, + { + "entropy": 1.801856230199337, + "epoch": 0.37701135885439635, + "grad_norm": 4.020124435424805, + "learning_rate": 4.120190035226123e-06, + "loss": 0.408, + "mean_token_accuracy": 0.859604449570179, + "num_tokens": 146230138.0, + "step": 121620 + }, + { + "entropy": 1.8506782591342925, + "epoch": 0.377042357979446, + "grad_norm": 7.4072041511535645, + "learning_rate": 4.120020656429685e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8540009498596192, + "num_tokens": 146242786.0, + "step": 121630 + }, + { + "entropy": 1.914864283800125, + "epoch": 0.37707335710449574, + "grad_norm": 7.623665809631348, + "learning_rate": 4.119851298520746e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8540341034531593, + "num_tokens": 146253407.0, + "step": 121640 + }, + { + "entropy": 1.9302624300122262, + "epoch": 0.3771043562295454, + "grad_norm": 9.16896915435791, + "learning_rate": 4.119681961495008e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8566766142845154, + "num_tokens": 146264715.0, + "step": 121650 + }, + { + "entropy": 1.8666496634483338, + "epoch": 0.37713535535459514, + "grad_norm": 3.110356092453003, + "learning_rate": 4.119512645348184e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8514965951442719, + "num_tokens": 146276968.0, + "step": 121660 + }, + { + "entropy": 1.8799085780978202, + "epoch": 0.3771663544796448, + "grad_norm": 4.057291030883789, + "learning_rate": 4.119343350075981e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8615951225161552, + "num_tokens": 146288576.0, + "step": 121670 + }, + { + "entropy": 1.818842075765133, + "epoch": 0.37719735360469453, + "grad_norm": 10.600461959838867, + "learning_rate": 4.119174075674112e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.852581399679184, + "num_tokens": 146300167.0, + "step": 121680 + }, + { + "entropy": 1.8106214493513106, + "epoch": 0.3772283527297442, + "grad_norm": 3.652275323867798, + "learning_rate": 4.119004822138288e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.854650741815567, + "num_tokens": 146312797.0, + "step": 121690 + }, + { + "entropy": 1.783032974600792, + "epoch": 0.3772593518547939, + "grad_norm": 2.3988730907440186, + "learning_rate": 4.118835589464222e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8633334279060364, + "num_tokens": 146325854.0, + "step": 121700 + }, + { + "entropy": 1.8655279122292996, + "epoch": 0.3772903509798436, + "grad_norm": 3.8433516025543213, + "learning_rate": 4.118666377647631e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8409992963075638, + "num_tokens": 146338473.0, + "step": 121710 + }, + { + "entropy": 1.9374163687229156, + "epoch": 0.3773213501048933, + "grad_norm": 7.3235249519348145, + "learning_rate": 4.118497186684229e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8448733389377594, + "num_tokens": 146349721.0, + "step": 121720 + }, + { + "entropy": 1.8571631446480752, + "epoch": 0.377352349229943, + "grad_norm": 4.183323860168457, + "learning_rate": 4.118328016569734e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8527215197682381, + "num_tokens": 146362284.0, + "step": 121730 + }, + { + "entropy": 1.9317335307598114, + "epoch": 0.37738334835499265, + "grad_norm": 7.954257965087891, + "learning_rate": 4.118158867299864e-06, + "loss": 0.537, + "mean_token_accuracy": 0.835789829492569, + "num_tokens": 146373357.0, + "step": 121740 + }, + { + "entropy": 1.8835597395896913, + "epoch": 0.3774143474800424, + "grad_norm": 8.603527069091797, + "learning_rate": 4.117989738870338e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8524842366576195, + "num_tokens": 146384815.0, + "step": 121750 + }, + { + "entropy": 1.8527083411812781, + "epoch": 0.37744534660509205, + "grad_norm": 15.195161819458008, + "learning_rate": 4.117820631276879e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8620884269475937, + "num_tokens": 146396638.0, + "step": 121760 + }, + { + "entropy": 1.8090012684464454, + "epoch": 0.37747634573014177, + "grad_norm": 9.310328483581543, + "learning_rate": 4.117651544515207e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8572055116295815, + "num_tokens": 146409088.0, + "step": 121770 + }, + { + "entropy": 1.8714495450258255, + "epoch": 0.37750734485519144, + "grad_norm": 8.089369773864746, + "learning_rate": 4.117482478581047e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8630315214395523, + "num_tokens": 146421044.0, + "step": 121780 + }, + { + "entropy": 1.8696634009480477, + "epoch": 0.37753834398024116, + "grad_norm": 4.10288667678833, + "learning_rate": 4.117313433470123e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8499879941344262, + "num_tokens": 146432640.0, + "step": 121790 + }, + { + "entropy": 1.8695901393890382, + "epoch": 0.37756934310529083, + "grad_norm": 7.528837203979492, + "learning_rate": 4.11714440917816e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8490506067872048, + "num_tokens": 146444477.0, + "step": 121800 + }, + { + "entropy": 1.8624702721834183, + "epoch": 0.37760034223034056, + "grad_norm": 7.586561679840088, + "learning_rate": 4.116975405700887e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.853815546631813, + "num_tokens": 146457039.0, + "step": 121810 + }, + { + "entropy": 1.917418046295643, + "epoch": 0.3776313413553902, + "grad_norm": 7.109292030334473, + "learning_rate": 4.116806423034029e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.854109600186348, + "num_tokens": 146469106.0, + "step": 121820 + }, + { + "entropy": 1.8877900682389737, + "epoch": 0.37766234048043995, + "grad_norm": 2.658243417739868, + "learning_rate": 4.116637461173319e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8463023707270623, + "num_tokens": 146482224.0, + "step": 121830 + }, + { + "entropy": 1.9093093752861023, + "epoch": 0.3776933396054896, + "grad_norm": 7.953685283660889, + "learning_rate": 4.116468520114486e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8459949418902397, + "num_tokens": 146494016.0, + "step": 121840 + }, + { + "entropy": 1.8482676953077317, + "epoch": 0.37772433873053934, + "grad_norm": 5.989774703979492, + "learning_rate": 4.116299599853262e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8606431394815445, + "num_tokens": 146507063.0, + "step": 121850 + }, + { + "entropy": 1.6848430618643762, + "epoch": 0.377755337855589, + "grad_norm": 9.330160140991211, + "learning_rate": 4.11613070038538e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.874959084391594, + "num_tokens": 146521265.0, + "step": 121860 + }, + { + "entropy": 1.8134713634848594, + "epoch": 0.37778633698063874, + "grad_norm": 9.139564514160156, + "learning_rate": 4.115961821706575e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8542681246995926, + "num_tokens": 146533625.0, + "step": 121870 + }, + { + "entropy": 1.8329403929412365, + "epoch": 0.3778173361056884, + "grad_norm": 2.2994585037231445, + "learning_rate": 4.115792963812582e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8562861248850823, + "num_tokens": 146546747.0, + "step": 121880 + }, + { + "entropy": 1.8927996829152107, + "epoch": 0.37784833523073813, + "grad_norm": 8.54615306854248, + "learning_rate": 4.115624126699139e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8517010971903801, + "num_tokens": 146559042.0, + "step": 121890 + }, + { + "entropy": 1.878790758550167, + "epoch": 0.3778793343557878, + "grad_norm": 10.510698318481445, + "learning_rate": 4.1154553103619835e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8425199687480927, + "num_tokens": 146570240.0, + "step": 121900 + }, + { + "entropy": 1.8623479381203651, + "epoch": 0.3779103334808375, + "grad_norm": 8.871630668640137, + "learning_rate": 4.115286514796853e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8559099569916725, + "num_tokens": 146583049.0, + "step": 121910 + }, + { + "entropy": 1.8767142370343208, + "epoch": 0.3779413326058872, + "grad_norm": 3.710120677947998, + "learning_rate": 4.115117739999491e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.847675909101963, + "num_tokens": 146595103.0, + "step": 121920 + }, + { + "entropy": 1.960743510723114, + "epoch": 0.3779723317309369, + "grad_norm": 7.444425582885742, + "learning_rate": 4.114948985965637e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.8415925487875938, + "num_tokens": 146605922.0, + "step": 121930 + }, + { + "entropy": 1.8839008912444115, + "epoch": 0.3780033308559866, + "grad_norm": 9.009261131286621, + "learning_rate": 4.114780252691036e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8500188961625099, + "num_tokens": 146617318.0, + "step": 121940 + }, + { + "entropy": 1.89123537838459, + "epoch": 0.3780343299810363, + "grad_norm": 7.541473865509033, + "learning_rate": 4.11461154017143e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8585240036249161, + "num_tokens": 146628643.0, + "step": 121950 + }, + { + "entropy": 1.9025135144591332, + "epoch": 0.378065329106086, + "grad_norm": 6.690996170043945, + "learning_rate": 4.114442848402565e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8481958448886872, + "num_tokens": 146640271.0, + "step": 121960 + }, + { + "entropy": 1.956543865799904, + "epoch": 0.3780963282311357, + "grad_norm": 8.179557800292969, + "learning_rate": 4.1142741773801894e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8471057340502739, + "num_tokens": 146650293.0, + "step": 121970 + }, + { + "entropy": 1.8972681686282158, + "epoch": 0.37812732735618537, + "grad_norm": 8.407889366149902, + "learning_rate": 4.1141055271000485e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.845854164659977, + "num_tokens": 146662226.0, + "step": 121980 + }, + { + "entropy": 1.9091035678982735, + "epoch": 0.37815832648123504, + "grad_norm": 7.07802152633667, + "learning_rate": 4.113936897557893e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8480123952031136, + "num_tokens": 146674239.0, + "step": 121990 + }, + { + "entropy": 1.906779918074608, + "epoch": 0.37818932560628477, + "grad_norm": 10.43696403503418, + "learning_rate": 4.113768288749473e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8478205010294915, + "num_tokens": 146686512.0, + "step": 122000 + }, + { + "entropy": 1.9320502176880836, + "epoch": 0.37822032473133443, + "grad_norm": 9.740495681762695, + "learning_rate": 4.113599700670539e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8459899872541428, + "num_tokens": 146697859.0, + "step": 122010 + }, + { + "entropy": 1.9591059625148772, + "epoch": 0.37825132385638416, + "grad_norm": 8.180789947509766, + "learning_rate": 4.113431133316846e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8437874868512154, + "num_tokens": 146709083.0, + "step": 122020 + }, + { + "entropy": 1.802576979994774, + "epoch": 0.3782823229814338, + "grad_norm": 9.705636024475098, + "learning_rate": 4.113262586684146e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8546473920345307, + "num_tokens": 146722550.0, + "step": 122030 + }, + { + "entropy": 1.8501103572547435, + "epoch": 0.37831332210648355, + "grad_norm": 7.220241069793701, + "learning_rate": 4.113094060768193e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8407644376158714, + "num_tokens": 146736130.0, + "step": 122040 + }, + { + "entropy": 1.847523419559002, + "epoch": 0.3783443212315332, + "grad_norm": 7.426035404205322, + "learning_rate": 4.112925555564747e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8541855216026306, + "num_tokens": 146748100.0, + "step": 122050 + }, + { + "entropy": 1.7743535205721854, + "epoch": 0.37837532035658294, + "grad_norm": 4.075657367706299, + "learning_rate": 4.112757071069562e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8489448487758636, + "num_tokens": 146760893.0, + "step": 122060 + }, + { + "entropy": 1.853407160937786, + "epoch": 0.3784063194816326, + "grad_norm": 10.630528450012207, + "learning_rate": 4.1125886072784e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8410938188433648, + "num_tokens": 146773477.0, + "step": 122070 + }, + { + "entropy": 1.831977953016758, + "epoch": 0.37843731860668234, + "grad_norm": 9.587197303771973, + "learning_rate": 4.112420164187019e-06, + "loss": 0.461, + "mean_token_accuracy": 0.856691500544548, + "num_tokens": 146784991.0, + "step": 122080 + }, + { + "entropy": 1.8454594373703004, + "epoch": 0.378468317731732, + "grad_norm": 8.84549331665039, + "learning_rate": 4.11225174179118e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8553299933671952, + "num_tokens": 146797158.0, + "step": 122090 + }, + { + "entropy": 1.889687070250511, + "epoch": 0.37849931685678173, + "grad_norm": 8.103165626525879, + "learning_rate": 4.1120833400866465e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8467686668038368, + "num_tokens": 146808540.0, + "step": 122100 + }, + { + "entropy": 1.7636478379368783, + "epoch": 0.3785303159818314, + "grad_norm": 8.541621208190918, + "learning_rate": 4.111914959069182e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8589817076921463, + "num_tokens": 146822024.0, + "step": 122110 + }, + { + "entropy": 1.8948402941226958, + "epoch": 0.3785613151068811, + "grad_norm": 7.753279685974121, + "learning_rate": 4.111746598734551e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8480223119258881, + "num_tokens": 146833733.0, + "step": 122120 + }, + { + "entropy": 1.8317367523908614, + "epoch": 0.3785923142319308, + "grad_norm": 9.554505348205566, + "learning_rate": 4.11157825907852e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.853913576900959, + "num_tokens": 146846482.0, + "step": 122130 + }, + { + "entropy": 1.8885809898376464, + "epoch": 0.3786233133569805, + "grad_norm": 8.104804039001465, + "learning_rate": 4.111409940096856e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8428619146347046, + "num_tokens": 146857645.0, + "step": 122140 + }, + { + "entropy": 1.9016202330589294, + "epoch": 0.3786543124820302, + "grad_norm": 8.457221031188965, + "learning_rate": 4.111241641785328e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8547043219208718, + "num_tokens": 146868691.0, + "step": 122150 + }, + { + "entropy": 1.8326540157198905, + "epoch": 0.3786853116070799, + "grad_norm": 8.81411361694336, + "learning_rate": 4.111073364139704e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8510780692100525, + "num_tokens": 146881779.0, + "step": 122160 + }, + { + "entropy": 1.8807669505476952, + "epoch": 0.3787163107321296, + "grad_norm": 6.89314603805542, + "learning_rate": 4.110905107155758e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8529252395033836, + "num_tokens": 146893089.0, + "step": 122170 + }, + { + "entropy": 1.8773617178201676, + "epoch": 0.3787473098571793, + "grad_norm": 9.011570930480957, + "learning_rate": 4.110736870829259e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8588549971580506, + "num_tokens": 146904814.0, + "step": 122180 + }, + { + "entropy": 1.8098601162433625, + "epoch": 0.378778308982229, + "grad_norm": 4.172004222869873, + "learning_rate": 4.110568655155984e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8595098748803138, + "num_tokens": 146917001.0, + "step": 122190 + }, + { + "entropy": 1.9448596596717835, + "epoch": 0.3788093081072787, + "grad_norm": 9.438422203063965, + "learning_rate": 4.110400460131704e-06, + "loss": 0.5126, + "mean_token_accuracy": 0.8462162002921104, + "num_tokens": 146928045.0, + "step": 122200 + }, + { + "entropy": 1.8534026801586152, + "epoch": 0.37884030723232837, + "grad_norm": 7.917478561401367, + "learning_rate": 4.110232285752197e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8487755730748177, + "num_tokens": 146939455.0, + "step": 122210 + }, + { + "entropy": 1.88424232006073, + "epoch": 0.3788713063573781, + "grad_norm": 9.62708568572998, + "learning_rate": 4.110064132013238e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8592986583709716, + "num_tokens": 146951083.0, + "step": 122220 + }, + { + "entropy": 1.9062071546912194, + "epoch": 0.37890230548242776, + "grad_norm": 7.392994403839111, + "learning_rate": 4.109895998910608e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8547699064016342, + "num_tokens": 146962391.0, + "step": 122230 + }, + { + "entropy": 1.8592015653848648, + "epoch": 0.37893330460747743, + "grad_norm": 8.539904594421387, + "learning_rate": 4.109727886440085e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8498929470777512, + "num_tokens": 146974481.0, + "step": 122240 + }, + { + "entropy": 1.8191335678100586, + "epoch": 0.37896430373252715, + "grad_norm": 8.98744010925293, + "learning_rate": 4.109559794597449e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8560615256428719, + "num_tokens": 146986678.0, + "step": 122250 + }, + { + "entropy": 1.8230956450104714, + "epoch": 0.3789953028575768, + "grad_norm": 5.308132171630859, + "learning_rate": 4.109391723378483e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8582112580537796, + "num_tokens": 146999024.0, + "step": 122260 + }, + { + "entropy": 1.890698865056038, + "epoch": 0.37902630198262655, + "grad_norm": 8.043314933776855, + "learning_rate": 4.109223672778969e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.851884001493454, + "num_tokens": 147011194.0, + "step": 122270 + }, + { + "entropy": 1.8781690523028374, + "epoch": 0.3790573011076762, + "grad_norm": 3.952792167663574, + "learning_rate": 4.1090556427946905e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8536352217197418, + "num_tokens": 147022928.0, + "step": 122280 + }, + { + "entropy": 1.8398768171668052, + "epoch": 0.37908830023272594, + "grad_norm": 4.229760646820068, + "learning_rate": 4.108887633421435e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8582636058330536, + "num_tokens": 147035202.0, + "step": 122290 + }, + { + "entropy": 1.8784176617860795, + "epoch": 0.3791192993577756, + "grad_norm": 4.562318801879883, + "learning_rate": 4.10871964465499e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8587144047021866, + "num_tokens": 147047405.0, + "step": 122300 + }, + { + "entropy": 1.8330136485397817, + "epoch": 0.37915029848282533, + "grad_norm": 7.896421432495117, + "learning_rate": 4.108551676491141e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8706734910607338, + "num_tokens": 147060914.0, + "step": 122310 + }, + { + "entropy": 1.9075976356863975, + "epoch": 0.379181297607875, + "grad_norm": 9.339140892028809, + "learning_rate": 4.108383728925676e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8520648717880249, + "num_tokens": 147072427.0, + "step": 122320 + }, + { + "entropy": 1.8779551640152932, + "epoch": 0.3792122967329247, + "grad_norm": 8.027301788330078, + "learning_rate": 4.108215801954389e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8487935498356819, + "num_tokens": 147085063.0, + "step": 122330 + }, + { + "entropy": 1.9310110673308372, + "epoch": 0.3792432958579744, + "grad_norm": 8.90977668762207, + "learning_rate": 4.108047895573069e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8506239086389542, + "num_tokens": 147096264.0, + "step": 122340 + }, + { + "entropy": 1.9274428904056549, + "epoch": 0.3792742949830241, + "grad_norm": 8.3607759475708, + "learning_rate": 4.107880009777509e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8582290455698967, + "num_tokens": 147108064.0, + "step": 122350 + }, + { + "entropy": 1.931325614452362, + "epoch": 0.3793052941080738, + "grad_norm": 7.63694429397583, + "learning_rate": 4.1077121445635036e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8499775350093841, + "num_tokens": 147120037.0, + "step": 122360 + }, + { + "entropy": 1.8099196195602416, + "epoch": 0.3793362932331235, + "grad_norm": 3.8295421600341797, + "learning_rate": 4.107544299926848e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8536288380622864, + "num_tokens": 147132561.0, + "step": 122370 + }, + { + "entropy": 1.8952490270137787, + "epoch": 0.3793672923581732, + "grad_norm": 3.9062981605529785, + "learning_rate": 4.107376475863337e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.845606592297554, + "num_tokens": 147144257.0, + "step": 122380 + }, + { + "entropy": 1.8194826439023017, + "epoch": 0.3793982914832229, + "grad_norm": 2.3818023204803467, + "learning_rate": 4.10720867236877e-06, + "loss": 0.5514, + "mean_token_accuracy": 0.8439658433198929, + "num_tokens": 147158022.0, + "step": 122390 + }, + { + "entropy": 1.8628859788179397, + "epoch": 0.3794292906082726, + "grad_norm": 9.438950538635254, + "learning_rate": 4.107040889438944e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8520619332790375, + "num_tokens": 147171213.0, + "step": 122400 + }, + { + "entropy": 1.8283814415335655, + "epoch": 0.3794602897333223, + "grad_norm": 8.620655059814453, + "learning_rate": 4.10687312706966e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8559184551239014, + "num_tokens": 147183776.0, + "step": 122410 + }, + { + "entropy": 1.9575203657150269, + "epoch": 0.37949128885837197, + "grad_norm": 9.457103729248047, + "learning_rate": 4.106705385256718e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8479184478521347, + "num_tokens": 147194326.0, + "step": 122420 + }, + { + "entropy": 1.832918418943882, + "epoch": 0.3795222879834217, + "grad_norm": 4.032721519470215, + "learning_rate": 4.106537663995922e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.857351616024971, + "num_tokens": 147207032.0, + "step": 122430 + }, + { + "entropy": 1.7993885815143584, + "epoch": 0.37955328710847136, + "grad_norm": 4.224924087524414, + "learning_rate": 4.106369963283075e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8557244941592217, + "num_tokens": 147220135.0, + "step": 122440 + }, + { + "entropy": 1.8836615636944771, + "epoch": 0.3795842862335211, + "grad_norm": 7.664379596710205, + "learning_rate": 4.106202283113981e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8581904530525207, + "num_tokens": 147232750.0, + "step": 122450 + }, + { + "entropy": 1.9607893019914626, + "epoch": 0.37961528535857075, + "grad_norm": 8.335966110229492, + "learning_rate": 4.106034623484447e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.8552387475967407, + "num_tokens": 147244068.0, + "step": 122460 + }, + { + "entropy": 1.8356236606836318, + "epoch": 0.3796462844836205, + "grad_norm": 6.946023941040039, + "learning_rate": 4.105866984390278e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8576385736465454, + "num_tokens": 147256513.0, + "step": 122470 + }, + { + "entropy": 1.9411771401762963, + "epoch": 0.37967728360867015, + "grad_norm": 7.646897792816162, + "learning_rate": 4.105699365827284e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8406426265835762, + "num_tokens": 147267715.0, + "step": 122480 + }, + { + "entropy": 1.8944382056593896, + "epoch": 0.3797082827337198, + "grad_norm": 2.604159355163574, + "learning_rate": 4.105531767791274e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8485357999801636, + "num_tokens": 147279090.0, + "step": 122490 + }, + { + "entropy": 1.8582454279065133, + "epoch": 0.37973928185876954, + "grad_norm": 7.28471565246582, + "learning_rate": 4.105364190278059e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8588834837079048, + "num_tokens": 147291484.0, + "step": 122500 + }, + { + "entropy": 1.9929537415504455, + "epoch": 0.3797702809838192, + "grad_norm": 10.385540962219238, + "learning_rate": 4.105196633283452e-06, + "loss": 0.5533, + "mean_token_accuracy": 0.8405931279063225, + "num_tokens": 147302573.0, + "step": 122510 + }, + { + "entropy": 1.8923503875732421, + "epoch": 0.37980128010886893, + "grad_norm": 8.61189079284668, + "learning_rate": 4.1050290968032635e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8514659553766251, + "num_tokens": 147314906.0, + "step": 122520 + }, + { + "entropy": 1.8648876428604126, + "epoch": 0.3798322792339186, + "grad_norm": 8.03481388092041, + "learning_rate": 4.10486158083331e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8421670094132423, + "num_tokens": 147327317.0, + "step": 122530 + }, + { + "entropy": 1.780473567545414, + "epoch": 0.3798632783589683, + "grad_norm": 8.313706398010254, + "learning_rate": 4.104694085369405e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8570443943142891, + "num_tokens": 147340984.0, + "step": 122540 + }, + { + "entropy": 1.9261644035577774, + "epoch": 0.379894277484018, + "grad_norm": 7.854968070983887, + "learning_rate": 4.104526610407367e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8521699935197831, + "num_tokens": 147352717.0, + "step": 122550 + }, + { + "entropy": 1.8588500022888184, + "epoch": 0.3799252766090677, + "grad_norm": 3.6017353534698486, + "learning_rate": 4.104359155943014e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8543045312166214, + "num_tokens": 147365365.0, + "step": 122560 + }, + { + "entropy": 1.8084479227662087, + "epoch": 0.3799562757341174, + "grad_norm": 3.5251076221466064, + "learning_rate": 4.104191721972163e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8611663445830345, + "num_tokens": 147378567.0, + "step": 122570 + }, + { + "entropy": 1.849086406826973, + "epoch": 0.3799872748591671, + "grad_norm": 4.066585063934326, + "learning_rate": 4.104024308490636e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8478951990604401, + "num_tokens": 147390692.0, + "step": 122580 + }, + { + "entropy": 1.900680673122406, + "epoch": 0.3800182739842168, + "grad_norm": 10.499907493591309, + "learning_rate": 4.103856915494254e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8512833088636398, + "num_tokens": 147402469.0, + "step": 122590 + }, + { + "entropy": 1.8157877206802369, + "epoch": 0.3800492731092665, + "grad_norm": 8.330013275146484, + "learning_rate": 4.1036895429788395e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8638851836323738, + "num_tokens": 147415369.0, + "step": 122600 + }, + { + "entropy": 1.9261372715234757, + "epoch": 0.3800802722343162, + "grad_norm": 8.978737831115723, + "learning_rate": 4.103522190940217e-06, + "loss": 0.5264, + "mean_token_accuracy": 0.8390441820025444, + "num_tokens": 147426432.0, + "step": 122610 + }, + { + "entropy": 1.9050227865576743, + "epoch": 0.3801112713593659, + "grad_norm": 8.126078605651855, + "learning_rate": 4.10335485937421e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.844034643471241, + "num_tokens": 147437847.0, + "step": 122620 + }, + { + "entropy": 1.9048843890428544, + "epoch": 0.38014227048441557, + "grad_norm": 7.828693389892578, + "learning_rate": 4.103187548276646e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8543620184063911, + "num_tokens": 147449041.0, + "step": 122630 + }, + { + "entropy": 1.8402218729257585, + "epoch": 0.3801732696094653, + "grad_norm": 8.167677879333496, + "learning_rate": 4.103020257643353e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8610726460814476, + "num_tokens": 147461307.0, + "step": 122640 + }, + { + "entropy": 1.8394718445837497, + "epoch": 0.38020426873451496, + "grad_norm": 4.112548828125, + "learning_rate": 4.1028529874701575e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8556538373231888, + "num_tokens": 147474356.0, + "step": 122650 + }, + { + "entropy": 1.833746100962162, + "epoch": 0.3802352678595647, + "grad_norm": 6.661306858062744, + "learning_rate": 4.102685737752892e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.855821980535984, + "num_tokens": 147487546.0, + "step": 122660 + }, + { + "entropy": 1.8778820380568504, + "epoch": 0.38026626698461435, + "grad_norm": 7.601857662200928, + "learning_rate": 4.102518508487384e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8578482121229172, + "num_tokens": 147500179.0, + "step": 122670 + }, + { + "entropy": 1.8702596746385098, + "epoch": 0.3802972661096641, + "grad_norm": 8.159707069396973, + "learning_rate": 4.10235129966947e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8527193248271943, + "num_tokens": 147512342.0, + "step": 122680 + }, + { + "entropy": 1.9300275981426238, + "epoch": 0.38032826523471375, + "grad_norm": 9.76501750946045, + "learning_rate": 4.10218411129498e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.842798325419426, + "num_tokens": 147523781.0, + "step": 122690 + }, + { + "entropy": 1.8886654764413833, + "epoch": 0.38035926435976347, + "grad_norm": 8.383800506591797, + "learning_rate": 4.10201694335975e-06, + "loss": 0.5191, + "mean_token_accuracy": 0.8469223186373711, + "num_tokens": 147536454.0, + "step": 122700 + }, + { + "entropy": 1.9181490674614907, + "epoch": 0.38039026348481314, + "grad_norm": 7.91489315032959, + "learning_rate": 4.1018497958596145e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8422677874565124, + "num_tokens": 147548041.0, + "step": 122710 + }, + { + "entropy": 1.7913452178239821, + "epoch": 0.3804212626098628, + "grad_norm": 8.690677642822266, + "learning_rate": 4.1016826687904125e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8631450653076171, + "num_tokens": 147561955.0, + "step": 122720 + }, + { + "entropy": 1.8757226839661598, + "epoch": 0.38045226173491253, + "grad_norm": 8.690422058105469, + "learning_rate": 4.10151556214798e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8484616339206695, + "num_tokens": 147574345.0, + "step": 122730 + }, + { + "entropy": 1.9869503617286681, + "epoch": 0.3804832608599622, + "grad_norm": 9.068852424621582, + "learning_rate": 4.101348475928157e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8462048456072807, + "num_tokens": 147584992.0, + "step": 122740 + }, + { + "entropy": 1.8192487761378289, + "epoch": 0.3805142599850119, + "grad_norm": 8.221050262451172, + "learning_rate": 4.101181410126785e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8553620144724846, + "num_tokens": 147598434.0, + "step": 122750 + }, + { + "entropy": 1.9016779504716397, + "epoch": 0.3805452591100616, + "grad_norm": 8.561354637145996, + "learning_rate": 4.101014364739705e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8509173199534417, + "num_tokens": 147610587.0, + "step": 122760 + }, + { + "entropy": 1.9240503638982773, + "epoch": 0.3805762582351113, + "grad_norm": 8.848665237426758, + "learning_rate": 4.100847339762759e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8474395364522934, + "num_tokens": 147622419.0, + "step": 122770 + }, + { + "entropy": 1.9462111860513687, + "epoch": 0.380607257360161, + "grad_norm": 8.207417488098145, + "learning_rate": 4.100680335191792e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8494761645793915, + "num_tokens": 147632828.0, + "step": 122780 + }, + { + "entropy": 1.8405832841992378, + "epoch": 0.3806382564852107, + "grad_norm": 4.011960506439209, + "learning_rate": 4.1005133510226495e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8583981603384018, + "num_tokens": 147645476.0, + "step": 122790 + }, + { + "entropy": 1.9420069836080074, + "epoch": 0.3806692556102604, + "grad_norm": 6.821649074554443, + "learning_rate": 4.100346387251177e-06, + "loss": 0.5369, + "mean_token_accuracy": 0.8337976217269898, + "num_tokens": 147657157.0, + "step": 122800 + }, + { + "entropy": 1.9372777387499809, + "epoch": 0.3807002547353101, + "grad_norm": 8.612418174743652, + "learning_rate": 4.100179443873223e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8544177770614624, + "num_tokens": 147668847.0, + "step": 122810 + }, + { + "entropy": 1.840282154083252, + "epoch": 0.3807312538603598, + "grad_norm": 4.8995041847229, + "learning_rate": 4.100012520884635e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.8471588909626007, + "num_tokens": 147682236.0, + "step": 122820 + }, + { + "entropy": 1.9169215068221093, + "epoch": 0.3807622529854095, + "grad_norm": 8.391762733459473, + "learning_rate": 4.0998456182812634e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8483142048120499, + "num_tokens": 147693794.0, + "step": 122830 + }, + { + "entropy": 1.8828682228922844, + "epoch": 0.38079325211045917, + "grad_norm": 3.6466331481933594, + "learning_rate": 4.099678736058961e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8551005318760871, + "num_tokens": 147706128.0, + "step": 122840 + }, + { + "entropy": 1.894105489552021, + "epoch": 0.3808242512355089, + "grad_norm": 9.506867408752441, + "learning_rate": 4.0995118742135785e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8543181657791138, + "num_tokens": 147717697.0, + "step": 122850 + }, + { + "entropy": 1.8872636377811431, + "epoch": 0.38085525036055856, + "grad_norm": 9.389772415161133, + "learning_rate": 4.09934503274097e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8546540632843971, + "num_tokens": 147730381.0, + "step": 122860 + }, + { + "entropy": 1.8735971391201018, + "epoch": 0.3808862494856083, + "grad_norm": 10.220447540283203, + "learning_rate": 4.09917821163699e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8552726373076439, + "num_tokens": 147742566.0, + "step": 122870 + }, + { + "entropy": 1.8962013125419617, + "epoch": 0.38091724861065795, + "grad_norm": 7.584221363067627, + "learning_rate": 4.099011410897494e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8471159264445305, + "num_tokens": 147755055.0, + "step": 122880 + }, + { + "entropy": 1.9144986018538475, + "epoch": 0.3809482477357077, + "grad_norm": 8.150506019592285, + "learning_rate": 4.098844630518339e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8621457502245903, + "num_tokens": 147766218.0, + "step": 122890 + }, + { + "entropy": 1.8816324099898338, + "epoch": 0.38097924686075735, + "grad_norm": 8.89142894744873, + "learning_rate": 4.0986778704953845e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8532285436987876, + "num_tokens": 147778543.0, + "step": 122900 + }, + { + "entropy": 1.9272763848304748, + "epoch": 0.3810102459858071, + "grad_norm": 8.55495548248291, + "learning_rate": 4.098511130824489e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.845673693716526, + "num_tokens": 147789726.0, + "step": 122910 + }, + { + "entropy": 1.7462919235229493, + "epoch": 0.38104124511085674, + "grad_norm": 9.728035926818848, + "learning_rate": 4.0983444115015134e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8587867051362992, + "num_tokens": 147804271.0, + "step": 122920 + }, + { + "entropy": 1.8114602223038674, + "epoch": 0.38107224423590647, + "grad_norm": 5.929993152618408, + "learning_rate": 4.098177712522317e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.860235296189785, + "num_tokens": 147817072.0, + "step": 122930 + }, + { + "entropy": 1.846523255109787, + "epoch": 0.38110324336095613, + "grad_norm": 8.399463653564453, + "learning_rate": 4.098011033882767e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8544706001877784, + "num_tokens": 147830099.0, + "step": 122940 + }, + { + "entropy": 1.9178942322731019, + "epoch": 0.38113424248600586, + "grad_norm": 4.426633358001709, + "learning_rate": 4.0978443755787265e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8501872152090073, + "num_tokens": 147841437.0, + "step": 122950 + }, + { + "entropy": 1.881843727827072, + "epoch": 0.3811652416110555, + "grad_norm": 4.205018043518066, + "learning_rate": 4.097677737606057e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8623296022415161, + "num_tokens": 147852742.0, + "step": 122960 + }, + { + "entropy": 1.8286479495465755, + "epoch": 0.3811962407361052, + "grad_norm": 10.341126441955566, + "learning_rate": 4.09751111996063e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8560968965291977, + "num_tokens": 147865179.0, + "step": 122970 + }, + { + "entropy": 1.8264545932412148, + "epoch": 0.3812272398611549, + "grad_norm": 8.994978904724121, + "learning_rate": 4.09734452263831e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8656197428703308, + "num_tokens": 147877567.0, + "step": 122980 + }, + { + "entropy": 1.9247392192482948, + "epoch": 0.3812582389862046, + "grad_norm": 7.663559913635254, + "learning_rate": 4.097177945634967e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8439255699515342, + "num_tokens": 147889528.0, + "step": 122990 + }, + { + "entropy": 1.8423993363976479, + "epoch": 0.3812892381112543, + "grad_norm": 3.762521266937256, + "learning_rate": 4.0970113889464705e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8593509882688523, + "num_tokens": 147902458.0, + "step": 123000 + }, + { + "entropy": 1.9567664802074431, + "epoch": 0.381320237236304, + "grad_norm": 8.625659942626953, + "learning_rate": 4.096844852568692e-06, + "loss": 0.5569, + "mean_token_accuracy": 0.8298911958932876, + "num_tokens": 147913729.0, + "step": 123010 + }, + { + "entropy": 1.8710352510213852, + "epoch": 0.3813512363613537, + "grad_norm": 8.000286102294922, + "learning_rate": 4.096678336497503e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8588433533906936, + "num_tokens": 147925548.0, + "step": 123020 + }, + { + "entropy": 1.9936707019805908, + "epoch": 0.3813822354864034, + "grad_norm": 9.606999397277832, + "learning_rate": 4.09651184072878e-06, + "loss": 0.5462, + "mean_token_accuracy": 0.8335342779755592, + "num_tokens": 147936901.0, + "step": 123030 + }, + { + "entropy": 1.9054792627692223, + "epoch": 0.3814132346114531, + "grad_norm": 9.011368751525879, + "learning_rate": 4.096345365258394e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8416555240750313, + "num_tokens": 147948698.0, + "step": 123040 + }, + { + "entropy": 1.9143636167049407, + "epoch": 0.38144423373650277, + "grad_norm": 9.330082893371582, + "learning_rate": 4.096178910082223e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8574577882885933, + "num_tokens": 147960109.0, + "step": 123050 + }, + { + "entropy": 1.8986399203538895, + "epoch": 0.3814752328615525, + "grad_norm": 8.78416919708252, + "learning_rate": 4.096012475196143e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8498771920800209, + "num_tokens": 147972193.0, + "step": 123060 + }, + { + "entropy": 1.8185602709650994, + "epoch": 0.38150623198660216, + "grad_norm": 8.202901840209961, + "learning_rate": 4.095846060596033e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8550837740302086, + "num_tokens": 147985079.0, + "step": 123070 + }, + { + "entropy": 1.9409471869468689, + "epoch": 0.3815372311116519, + "grad_norm": 7.777408123016357, + "learning_rate": 4.095679666277773e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8446749314665795, + "num_tokens": 147996703.0, + "step": 123080 + }, + { + "entropy": 1.9127967476844787, + "epoch": 0.38156823023670156, + "grad_norm": 8.144055366516113, + "learning_rate": 4.095513292237241e-06, + "loss": 0.5223, + "mean_token_accuracy": 0.8405533611774445, + "num_tokens": 148007728.0, + "step": 123090 + }, + { + "entropy": 1.8132119834423066, + "epoch": 0.3815992293617513, + "grad_norm": 4.817179203033447, + "learning_rate": 4.095346938470322e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8394502356648446, + "num_tokens": 148021176.0, + "step": 123100 + }, + { + "entropy": 1.8567951753735543, + "epoch": 0.38163022848680095, + "grad_norm": 8.94315242767334, + "learning_rate": 4.095180604972897e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8610012695193291, + "num_tokens": 148033775.0, + "step": 123110 + }, + { + "entropy": 1.7960670098662377, + "epoch": 0.3816612276118507, + "grad_norm": 8.07936954498291, + "learning_rate": 4.095014291740849e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8699286490678787, + "num_tokens": 148046444.0, + "step": 123120 + }, + { + "entropy": 1.880557769536972, + "epoch": 0.38169222673690034, + "grad_norm": 8.235962867736816, + "learning_rate": 4.094847998770066e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.859109228849411, + "num_tokens": 148058765.0, + "step": 123130 + }, + { + "entropy": 1.8710905313491821, + "epoch": 0.38172322586195007, + "grad_norm": 8.953729629516602, + "learning_rate": 4.094681726056433e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8571123152971267, + "num_tokens": 148071128.0, + "step": 123140 + }, + { + "entropy": 1.8630672127008439, + "epoch": 0.38175422498699974, + "grad_norm": 8.592804908752441, + "learning_rate": 4.094515473595838e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8458248943090438, + "num_tokens": 148082981.0, + "step": 123150 + }, + { + "entropy": 1.900773896276951, + "epoch": 0.38178522411204946, + "grad_norm": 7.337060928344727, + "learning_rate": 4.0943492413841685e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8392305105924607, + "num_tokens": 148095109.0, + "step": 123160 + }, + { + "entropy": 1.827962300926447, + "epoch": 0.38181622323709913, + "grad_norm": 7.826770305633545, + "learning_rate": 4.094183029417316e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8655768454074859, + "num_tokens": 148108413.0, + "step": 123170 + }, + { + "entropy": 1.8747796788811684, + "epoch": 0.38184722236214885, + "grad_norm": 8.268677711486816, + "learning_rate": 4.0940168376911705e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8500054642558098, + "num_tokens": 148120152.0, + "step": 123180 + }, + { + "entropy": 1.8802484720945358, + "epoch": 0.3818782214871985, + "grad_norm": 9.56521987915039, + "learning_rate": 4.093850666201625e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8550689592957497, + "num_tokens": 148131543.0, + "step": 123190 + }, + { + "entropy": 1.9417632609605788, + "epoch": 0.38190922061224825, + "grad_norm": 8.511123657226562, + "learning_rate": 4.093684514944573e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.8440429598093033, + "num_tokens": 148142246.0, + "step": 123200 + }, + { + "entropy": 1.862280185520649, + "epoch": 0.3819402197372979, + "grad_norm": 9.24006175994873, + "learning_rate": 4.093518383915908e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8556727141141891, + "num_tokens": 148154604.0, + "step": 123210 + }, + { + "entropy": 1.90358504652977, + "epoch": 0.3819712188623476, + "grad_norm": 7.789550304412842, + "learning_rate": 4.093352273111527e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8538018524646759, + "num_tokens": 148166038.0, + "step": 123220 + }, + { + "entropy": 1.8469286099076272, + "epoch": 0.3820022179873973, + "grad_norm": 8.740273475646973, + "learning_rate": 4.093186182527327e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.858376681804657, + "num_tokens": 148178188.0, + "step": 123230 + }, + { + "entropy": 1.8784661173820496, + "epoch": 0.382033217112447, + "grad_norm": 8.926239013671875, + "learning_rate": 4.093020112159205e-06, + "loss": 0.616, + "mean_token_accuracy": 0.8364622831344605, + "num_tokens": 148190734.0, + "step": 123240 + }, + { + "entropy": 1.9453864216804504, + "epoch": 0.3820642162374967, + "grad_norm": 7.907076358795166, + "learning_rate": 4.092854062003061e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8546721920371055, + "num_tokens": 148201633.0, + "step": 123250 + }, + { + "entropy": 1.8376804277300836, + "epoch": 0.38209521536254637, + "grad_norm": 9.020318031311035, + "learning_rate": 4.092688032054795e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8542071163654328, + "num_tokens": 148214535.0, + "step": 123260 + }, + { + "entropy": 1.8767757460474968, + "epoch": 0.3821262144875961, + "grad_norm": 8.229818344116211, + "learning_rate": 4.092522022310309e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8497373566031456, + "num_tokens": 148226463.0, + "step": 123270 + }, + { + "entropy": 1.8976902469992638, + "epoch": 0.38215721361264576, + "grad_norm": 8.769594192504883, + "learning_rate": 4.092356032765506e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8375331163406372, + "num_tokens": 148238620.0, + "step": 123280 + }, + { + "entropy": 1.9701337844133378, + "epoch": 0.3821882127376955, + "grad_norm": 9.727083206176758, + "learning_rate": 4.092190063416288e-06, + "loss": 0.5527, + "mean_token_accuracy": 0.8314969301223755, + "num_tokens": 148249082.0, + "step": 123290 + }, + { + "entropy": 1.8411318197846414, + "epoch": 0.38221921186274516, + "grad_norm": 7.282608985900879, + "learning_rate": 4.0920241142585636e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8594763651490211, + "num_tokens": 148261567.0, + "step": 123300 + }, + { + "entropy": 1.9043711677193642, + "epoch": 0.3822502109877949, + "grad_norm": 7.864800453186035, + "learning_rate": 4.091858185288235e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8414738133549691, + "num_tokens": 148273132.0, + "step": 123310 + }, + { + "entropy": 1.809215198457241, + "epoch": 0.38228121011284455, + "grad_norm": 8.225022315979004, + "learning_rate": 4.091692276501213e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8564652130007744, + "num_tokens": 148286238.0, + "step": 123320 + }, + { + "entropy": 1.7878570154309272, + "epoch": 0.3823122092378943, + "grad_norm": 4.167350769042969, + "learning_rate": 4.0915263878934044e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8619178295135498, + "num_tokens": 148298922.0, + "step": 123330 + }, + { + "entropy": 1.850788153707981, + "epoch": 0.38234320836294394, + "grad_norm": 7.613214492797852, + "learning_rate": 4.091360519460719e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8597221150994301, + "num_tokens": 148311479.0, + "step": 123340 + }, + { + "entropy": 1.8307570219039917, + "epoch": 0.38237420748799367, + "grad_norm": 4.395393371582031, + "learning_rate": 4.0911946711990686e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8481859222054482, + "num_tokens": 148324806.0, + "step": 123350 + }, + { + "entropy": 1.9396772772073745, + "epoch": 0.38240520661304334, + "grad_norm": 3.719348430633545, + "learning_rate": 4.091028843104365e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8467726603150367, + "num_tokens": 148336424.0, + "step": 123360 + }, + { + "entropy": 1.8743967324495316, + "epoch": 0.38243620573809306, + "grad_norm": 3.7305939197540283, + "learning_rate": 4.090863035172519e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8555470928549767, + "num_tokens": 148348774.0, + "step": 123370 + }, + { + "entropy": 1.9252464964985847, + "epoch": 0.38246720486314273, + "grad_norm": 8.94243335723877, + "learning_rate": 4.090697247399448e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8391307726502418, + "num_tokens": 148360798.0, + "step": 123380 + }, + { + "entropy": 1.9863820880651475, + "epoch": 0.38249820398819245, + "grad_norm": 9.898303031921387, + "learning_rate": 4.090531479781067e-06, + "loss": 0.5399, + "mean_token_accuracy": 0.8426146388053894, + "num_tokens": 148371596.0, + "step": 123390 + }, + { + "entropy": 1.8762628570199014, + "epoch": 0.3825292031132421, + "grad_norm": 7.574060440063477, + "learning_rate": 4.0903657323132925e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8638320744037629, + "num_tokens": 148383122.0, + "step": 123400 + }, + { + "entropy": 1.9100775212049483, + "epoch": 0.38256020223829185, + "grad_norm": 4.106917858123779, + "learning_rate": 4.0902000049920414e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8347994074225425, + "num_tokens": 148394952.0, + "step": 123410 + }, + { + "entropy": 1.9066940754652024, + "epoch": 0.3825912013633415, + "grad_norm": 7.716053009033203, + "learning_rate": 4.090034297813234e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.849234452843666, + "num_tokens": 148407262.0, + "step": 123420 + }, + { + "entropy": 1.7800323203206063, + "epoch": 0.38262220048839124, + "grad_norm": 4.275322914123535, + "learning_rate": 4.089868610772788e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8655282810330391, + "num_tokens": 148421221.0, + "step": 123430 + }, + { + "entropy": 1.823851725459099, + "epoch": 0.3826531996134409, + "grad_norm": 8.498555183410645, + "learning_rate": 4.089702943866628e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8620473012328148, + "num_tokens": 148434376.0, + "step": 123440 + }, + { + "entropy": 1.9421150475740432, + "epoch": 0.38268419873849063, + "grad_norm": 8.584688186645508, + "learning_rate": 4.0895372970906745e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8448976293206215, + "num_tokens": 148445096.0, + "step": 123450 + }, + { + "entropy": 1.8550694674253463, + "epoch": 0.3827151978635403, + "grad_norm": 8.82845401763916, + "learning_rate": 4.089371670440852e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8511300772428513, + "num_tokens": 148456927.0, + "step": 123460 + }, + { + "entropy": 1.92039655148983, + "epoch": 0.38274619698858997, + "grad_norm": 8.373238563537598, + "learning_rate": 4.089206063913085e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8491540059447289, + "num_tokens": 148468499.0, + "step": 123470 + }, + { + "entropy": 1.8469054311513902, + "epoch": 0.3827771961136397, + "grad_norm": 8.064623832702637, + "learning_rate": 4.089040477503299e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8639935791492462, + "num_tokens": 148480307.0, + "step": 123480 + }, + { + "entropy": 1.961165651679039, + "epoch": 0.38280819523868936, + "grad_norm": 7.094374656677246, + "learning_rate": 4.088874911207421e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.8461368843913079, + "num_tokens": 148490982.0, + "step": 123490 + }, + { + "entropy": 1.8972523733973503, + "epoch": 0.3828391943637391, + "grad_norm": 9.41102123260498, + "learning_rate": 4.08870936502138e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8459867879748344, + "num_tokens": 148503085.0, + "step": 123500 + }, + { + "entropy": 1.8196463674306869, + "epoch": 0.38287019348878876, + "grad_norm": 8.395617485046387, + "learning_rate": 4.088543838941105e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8633744567632675, + "num_tokens": 148515132.0, + "step": 123510 + }, + { + "entropy": 1.933710703253746, + "epoch": 0.3829011926138385, + "grad_norm": 9.440218925476074, + "learning_rate": 4.088378332962527e-06, + "loss": 0.5686, + "mean_token_accuracy": 0.8320772811770439, + "num_tokens": 148526393.0, + "step": 123520 + }, + { + "entropy": 1.8306830495595932, + "epoch": 0.38293219173888815, + "grad_norm": 3.874952554702759, + "learning_rate": 4.088212847081577e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8493449553847313, + "num_tokens": 148539616.0, + "step": 123530 + }, + { + "entropy": 1.8871184498071671, + "epoch": 0.3829631908639379, + "grad_norm": 4.710858345031738, + "learning_rate": 4.0880473812941876e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.848838959634304, + "num_tokens": 148551814.0, + "step": 123540 + }, + { + "entropy": 1.8882265403866767, + "epoch": 0.38299418998898754, + "grad_norm": 7.315601825714111, + "learning_rate": 4.087881935596294e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8602377876639367, + "num_tokens": 148563466.0, + "step": 123550 + }, + { + "entropy": 1.9142208576202393, + "epoch": 0.38302518911403727, + "grad_norm": 8.4950590133667, + "learning_rate": 4.08771650998383e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8584097474813461, + "num_tokens": 148575212.0, + "step": 123560 + }, + { + "entropy": 1.9166872769594192, + "epoch": 0.38305618823908694, + "grad_norm": 8.157064437866211, + "learning_rate": 4.087551104452733e-06, + "loss": 0.5462, + "mean_token_accuracy": 0.8363913476467133, + "num_tokens": 148587345.0, + "step": 123570 + }, + { + "entropy": 1.852193793654442, + "epoch": 0.38308718736413666, + "grad_norm": 5.7400031089782715, + "learning_rate": 4.08738571899894e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8614915952086448, + "num_tokens": 148599766.0, + "step": 123580 + }, + { + "entropy": 1.6889735147356988, + "epoch": 0.38311818648918633, + "grad_norm": 9.48442554473877, + "learning_rate": 4.08722035361839e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8619127184152603, + "num_tokens": 148613775.0, + "step": 123590 + }, + { + "entropy": 1.8661672458052636, + "epoch": 0.38314918561423605, + "grad_norm": 3.9152071475982666, + "learning_rate": 4.087055008307023e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8527839332818985, + "num_tokens": 148625883.0, + "step": 123600 + }, + { + "entropy": 1.9174211084842683, + "epoch": 0.3831801847392857, + "grad_norm": 4.289289951324463, + "learning_rate": 4.086889683060778e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8489474713802337, + "num_tokens": 148637944.0, + "step": 123610 + }, + { + "entropy": 1.9397675469517708, + "epoch": 0.38321118386433545, + "grad_norm": 8.902761459350586, + "learning_rate": 4.086724377875599e-06, + "loss": 0.5397, + "mean_token_accuracy": 0.8369091704487801, + "num_tokens": 148649564.0, + "step": 123620 + }, + { + "entropy": 1.8931380987167359, + "epoch": 0.3832421829893851, + "grad_norm": 7.973553657531738, + "learning_rate": 4.08655909274743e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8456487700343132, + "num_tokens": 148661427.0, + "step": 123630 + }, + { + "entropy": 1.779283882677555, + "epoch": 0.38327318211443484, + "grad_norm": 9.01836109161377, + "learning_rate": 4.086393827672212e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8560737937688827, + "num_tokens": 148675109.0, + "step": 123640 + }, + { + "entropy": 1.899433010816574, + "epoch": 0.3833041812394845, + "grad_norm": 9.137697219848633, + "learning_rate": 4.086228582645893e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8533209547400474, + "num_tokens": 148686499.0, + "step": 123650 + }, + { + "entropy": 1.9183125644922256, + "epoch": 0.38333518036453423, + "grad_norm": 8.239258766174316, + "learning_rate": 4.08606335766442e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8569030150771141, + "num_tokens": 148697749.0, + "step": 123660 + }, + { + "entropy": 1.7693776428699493, + "epoch": 0.3833661794895839, + "grad_norm": 3.299410343170166, + "learning_rate": 4.08589815272374e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8623643219470978, + "num_tokens": 148712318.0, + "step": 123670 + }, + { + "entropy": 1.9084981709718705, + "epoch": 0.3833971786146336, + "grad_norm": 7.353041172027588, + "learning_rate": 4.085732967819801e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.847068789601326, + "num_tokens": 148724494.0, + "step": 123680 + }, + { + "entropy": 1.9519029945135116, + "epoch": 0.3834281777396833, + "grad_norm": 8.309041976928711, + "learning_rate": 4.085567802948554e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8656853973865509, + "num_tokens": 148735242.0, + "step": 123690 + }, + { + "entropy": 1.8672381609678268, + "epoch": 0.383459176864733, + "grad_norm": 8.3343505859375, + "learning_rate": 4.085402658105951e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8520313948392868, + "num_tokens": 148747370.0, + "step": 123700 + }, + { + "entropy": 1.9503202199935914, + "epoch": 0.3834901759897827, + "grad_norm": 7.8455729484558105, + "learning_rate": 4.085237533287944e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8526298850774765, + "num_tokens": 148757963.0, + "step": 123710 + }, + { + "entropy": 1.861949661374092, + "epoch": 0.38352117511483236, + "grad_norm": 3.735707998275757, + "learning_rate": 4.085072428490485e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8578638583421707, + "num_tokens": 148770099.0, + "step": 123720 + }, + { + "entropy": 1.8470420002937318, + "epoch": 0.3835521742398821, + "grad_norm": 10.177236557006836, + "learning_rate": 4.0849073437095295e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8429055437445641, + "num_tokens": 148782367.0, + "step": 123730 + }, + { + "entropy": 1.9221158519387245, + "epoch": 0.38358317336493175, + "grad_norm": 4.895500183105469, + "learning_rate": 4.0847422789410344e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8465067788958549, + "num_tokens": 148794003.0, + "step": 123740 + }, + { + "entropy": 1.970751866698265, + "epoch": 0.3836141724899815, + "grad_norm": 7.53320837020874, + "learning_rate": 4.084577234180957e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8521478682756424, + "num_tokens": 148804935.0, + "step": 123750 + }, + { + "entropy": 1.7943205565214158, + "epoch": 0.38364517161503114, + "grad_norm": 6.433713436126709, + "learning_rate": 4.084412209425253e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8569139391183853, + "num_tokens": 148818096.0, + "step": 123760 + }, + { + "entropy": 1.7551213905215264, + "epoch": 0.38367617074008087, + "grad_norm": 7.526129245758057, + "learning_rate": 4.084247204669883e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8625258266925812, + "num_tokens": 148831912.0, + "step": 123770 + }, + { + "entropy": 1.862623292207718, + "epoch": 0.38370716986513054, + "grad_norm": 8.402950286865234, + "learning_rate": 4.084082219910807e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8564006343483925, + "num_tokens": 148844690.0, + "step": 123780 + }, + { + "entropy": 1.90057223290205, + "epoch": 0.38373816899018026, + "grad_norm": 3.392810344696045, + "learning_rate": 4.083917255143988e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8512577295303345, + "num_tokens": 148856761.0, + "step": 123790 + }, + { + "entropy": 1.9783919692039489, + "epoch": 0.38376916811522993, + "grad_norm": 9.549758911132812, + "learning_rate": 4.083752310365388e-06, + "loss": 0.5152, + "mean_token_accuracy": 0.8406032085418701, + "num_tokens": 148867423.0, + "step": 123800 + }, + { + "entropy": 1.8232928544282914, + "epoch": 0.38380016724027965, + "grad_norm": 4.337226867675781, + "learning_rate": 4.083587385570969e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.852597390115261, + "num_tokens": 148879925.0, + "step": 123810 + }, + { + "entropy": 1.8824557334184646, + "epoch": 0.3838311663653293, + "grad_norm": 7.782412528991699, + "learning_rate": 4.083422480756698e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.856006121635437, + "num_tokens": 148891322.0, + "step": 123820 + }, + { + "entropy": 1.903056225925684, + "epoch": 0.38386216549037905, + "grad_norm": 8.749157905578613, + "learning_rate": 4.083257595918541e-06, + "loss": 0.454, + "mean_token_accuracy": 0.845898973941803, + "num_tokens": 148903673.0, + "step": 123830 + }, + { + "entropy": 1.9123461306095124, + "epoch": 0.3838931646154287, + "grad_norm": 8.949573516845703, + "learning_rate": 4.083092731052464e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8477382779121398, + "num_tokens": 148915982.0, + "step": 123840 + }, + { + "entropy": 1.867120423913002, + "epoch": 0.38392416374047844, + "grad_norm": 8.769627571105957, + "learning_rate": 4.082927886154436e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8450190261006355, + "num_tokens": 148928151.0, + "step": 123850 + }, + { + "entropy": 1.9295039504766465, + "epoch": 0.3839551628655281, + "grad_norm": 10.065016746520996, + "learning_rate": 4.0827630612204275e-06, + "loss": 0.5505, + "mean_token_accuracy": 0.8388926431536674, + "num_tokens": 148939030.0, + "step": 123860 + }, + { + "entropy": 1.8576544880867005, + "epoch": 0.38398616199057783, + "grad_norm": 11.196900367736816, + "learning_rate": 4.082598256246408e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8566042557358742, + "num_tokens": 148950841.0, + "step": 123870 + }, + { + "entropy": 1.8984556332230569, + "epoch": 0.3840171611156275, + "grad_norm": 9.907042503356934, + "learning_rate": 4.082433471228349e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8443589746952057, + "num_tokens": 148962371.0, + "step": 123880 + }, + { + "entropy": 1.7866959184408189, + "epoch": 0.3840481602406772, + "grad_norm": 7.650696754455566, + "learning_rate": 4.082268706162224e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8526209697127343, + "num_tokens": 148975931.0, + "step": 123890 + }, + { + "entropy": 1.8315860643982886, + "epoch": 0.3840791593657269, + "grad_norm": 2.8190009593963623, + "learning_rate": 4.082103961044008e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8488441839814186, + "num_tokens": 148989418.0, + "step": 123900 + }, + { + "entropy": 1.803328700363636, + "epoch": 0.3841101584907766, + "grad_norm": 8.294083595275879, + "learning_rate": 4.081939235869675e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8606008633971214, + "num_tokens": 149003217.0, + "step": 123910 + }, + { + "entropy": 1.8937855035066604, + "epoch": 0.3841411576158263, + "grad_norm": 8.08438777923584, + "learning_rate": 4.081774530635201e-06, + "loss": 0.511, + "mean_token_accuracy": 0.8416438445448875, + "num_tokens": 149014790.0, + "step": 123920 + }, + { + "entropy": 1.9089516907930375, + "epoch": 0.384172156740876, + "grad_norm": 8.53679370880127, + "learning_rate": 4.081609845336565e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.848609185218811, + "num_tokens": 149025831.0, + "step": 123930 + }, + { + "entropy": 1.7949181117117405, + "epoch": 0.3842031558659257, + "grad_norm": 4.385751247406006, + "learning_rate": 4.0814451799697436e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8539316862821579, + "num_tokens": 149038872.0, + "step": 123940 + }, + { + "entropy": 1.8139029011130332, + "epoch": 0.3842341549909754, + "grad_norm": 3.748425006866455, + "learning_rate": 4.081280534530718e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8566805422306061, + "num_tokens": 149051871.0, + "step": 123950 + }, + { + "entropy": 1.7684552431106568, + "epoch": 0.3842651541160251, + "grad_norm": 10.310443878173828, + "learning_rate": 4.081115909015469e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8510794118046761, + "num_tokens": 149065114.0, + "step": 123960 + }, + { + "entropy": 1.8360266655683517, + "epoch": 0.38429615324107474, + "grad_norm": 9.245149612426758, + "learning_rate": 4.080951303419979e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.844825841486454, + "num_tokens": 149077418.0, + "step": 123970 + }, + { + "entropy": 1.7973667308688164, + "epoch": 0.38432715236612447, + "grad_norm": 8.100232124328613, + "learning_rate": 4.0807867177402305e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8598975196480751, + "num_tokens": 149090213.0, + "step": 123980 + }, + { + "entropy": 1.8818108469247818, + "epoch": 0.38435815149117414, + "grad_norm": 8.730313301086426, + "learning_rate": 4.080622151972207e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8592151805758477, + "num_tokens": 149102306.0, + "step": 123990 + }, + { + "entropy": 1.9396295577287674, + "epoch": 0.38438915061622386, + "grad_norm": 8.592621803283691, + "learning_rate": 4.080457606111895e-06, + "loss": 0.5319, + "mean_token_accuracy": 0.8406242683529854, + "num_tokens": 149114284.0, + "step": 124000 + }, + { + "entropy": 1.8134768947958946, + "epoch": 0.38442014974127353, + "grad_norm": 8.235865592956543, + "learning_rate": 4.080293080155281e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8475548833608627, + "num_tokens": 149127288.0, + "step": 124010 + }, + { + "entropy": 1.7447491750121116, + "epoch": 0.38445114886632326, + "grad_norm": 8.99509334564209, + "learning_rate": 4.080128574098353e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8710707902908326, + "num_tokens": 149140350.0, + "step": 124020 + }, + { + "entropy": 1.9281997457146645, + "epoch": 0.3844821479913729, + "grad_norm": 9.224446296691895, + "learning_rate": 4.0799640879370986e-06, + "loss": 0.5329, + "mean_token_accuracy": 0.8362716525793076, + "num_tokens": 149151526.0, + "step": 124030 + }, + { + "entropy": 1.907413001358509, + "epoch": 0.38451314711642265, + "grad_norm": 8.881566047668457, + "learning_rate": 4.079799621667508e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.841308769583702, + "num_tokens": 149162744.0, + "step": 124040 + }, + { + "entropy": 1.764004696905613, + "epoch": 0.3845441462414723, + "grad_norm": 8.439988136291504, + "learning_rate": 4.079635175285575e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8685743331909179, + "num_tokens": 149175813.0, + "step": 124050 + }, + { + "entropy": 1.8820173025131226, + "epoch": 0.38457514536652204, + "grad_norm": 7.626965522766113, + "learning_rate": 4.079470748787288e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8496981009840965, + "num_tokens": 149188028.0, + "step": 124060 + }, + { + "entropy": 1.9048197478055955, + "epoch": 0.3846061444915717, + "grad_norm": 4.152190685272217, + "learning_rate": 4.079306342168641e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8482609540224075, + "num_tokens": 149199701.0, + "step": 124070 + }, + { + "entropy": 1.9153966382145882, + "epoch": 0.38463714361662144, + "grad_norm": 7.222431659698486, + "learning_rate": 4.079141955425631e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8490887343883514, + "num_tokens": 149211482.0, + "step": 124080 + }, + { + "entropy": 1.7790709897875785, + "epoch": 0.3846681427416711, + "grad_norm": 3.140486001968384, + "learning_rate": 4.0789775885542525e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8636253029108047, + "num_tokens": 149224774.0, + "step": 124090 + }, + { + "entropy": 1.9251772895455361, + "epoch": 0.38469914186672083, + "grad_norm": 4.184020519256592, + "learning_rate": 4.0788132415505e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8495764017105103, + "num_tokens": 149236068.0, + "step": 124100 + }, + { + "entropy": 1.8736867174506187, + "epoch": 0.3847301409917705, + "grad_norm": 9.371009826660156, + "learning_rate": 4.078648914410375e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8543329164385796, + "num_tokens": 149248502.0, + "step": 124110 + }, + { + "entropy": 1.8708575084805488, + "epoch": 0.3847611401168202, + "grad_norm": 3.769465684890747, + "learning_rate": 4.078484607129874e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8484385311603546, + "num_tokens": 149260240.0, + "step": 124120 + }, + { + "entropy": 1.9300208121538163, + "epoch": 0.3847921392418699, + "grad_norm": 7.285602569580078, + "learning_rate": 4.078320319704997e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8484133720397949, + "num_tokens": 149271397.0, + "step": 124130 + }, + { + "entropy": 1.8357270821928977, + "epoch": 0.3848231383669196, + "grad_norm": 3.4529945850372314, + "learning_rate": 4.078156052131747e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8534119576215744, + "num_tokens": 149283833.0, + "step": 124140 + }, + { + "entropy": 1.9487372577190398, + "epoch": 0.3848541374919693, + "grad_norm": 7.477880001068115, + "learning_rate": 4.0779918044061244e-06, + "loss": 0.5247, + "mean_token_accuracy": 0.8423938781023026, + "num_tokens": 149294571.0, + "step": 124150 + }, + { + "entropy": 1.8556806325912476, + "epoch": 0.384885136617019, + "grad_norm": 4.033416748046875, + "learning_rate": 4.077827576524136e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.84943907558918, + "num_tokens": 149307020.0, + "step": 124160 + }, + { + "entropy": 1.877730706334114, + "epoch": 0.3849161357420687, + "grad_norm": 8.740376472473145, + "learning_rate": 4.077663368481781e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8621541142463685, + "num_tokens": 149318207.0, + "step": 124170 + }, + { + "entropy": 1.9129869103431703, + "epoch": 0.3849471348671184, + "grad_norm": 8.78154468536377, + "learning_rate": 4.07749918027507e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8474374935030937, + "num_tokens": 149329638.0, + "step": 124180 + }, + { + "entropy": 1.8814658731222154, + "epoch": 0.38497813399216807, + "grad_norm": 8.555291175842285, + "learning_rate": 4.077335011900008e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8533140555024147, + "num_tokens": 149341341.0, + "step": 124190 + }, + { + "entropy": 1.8951398521661758, + "epoch": 0.38500913311721774, + "grad_norm": 7.886125564575195, + "learning_rate": 4.077170863352603e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8445818021893501, + "num_tokens": 149353637.0, + "step": 124200 + }, + { + "entropy": 1.8607033982872963, + "epoch": 0.38504013224226746, + "grad_norm": 6.102626323699951, + "learning_rate": 4.0770067346288645e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8472836226224899, + "num_tokens": 149365535.0, + "step": 124210 + }, + { + "entropy": 1.9797284454107285, + "epoch": 0.38507113136731713, + "grad_norm": 8.020270347595215, + "learning_rate": 4.076842625724803e-06, + "loss": 0.5767, + "mean_token_accuracy": 0.8357729539275169, + "num_tokens": 149376980.0, + "step": 124220 + }, + { + "entropy": 1.8790086820721625, + "epoch": 0.38510213049236686, + "grad_norm": 8.278203010559082, + "learning_rate": 4.076678536636429e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8577861353754997, + "num_tokens": 149388865.0, + "step": 124230 + }, + { + "entropy": 1.8905736938118936, + "epoch": 0.3851331296174165, + "grad_norm": 9.991707801818848, + "learning_rate": 4.076514467359756e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.857266665995121, + "num_tokens": 149400675.0, + "step": 124240 + }, + { + "entropy": 1.9448306143283844, + "epoch": 0.38516412874246625, + "grad_norm": 8.666476249694824, + "learning_rate": 4.076350417890796e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8469108313322067, + "num_tokens": 149411540.0, + "step": 124250 + }, + { + "entropy": 1.8579439014196395, + "epoch": 0.3851951278675159, + "grad_norm": 4.379420280456543, + "learning_rate": 4.076186388225566e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8480482071638107, + "num_tokens": 149424172.0, + "step": 124260 + }, + { + "entropy": 1.7416525185108185, + "epoch": 0.38522612699256564, + "grad_norm": 4.337442874908447, + "learning_rate": 4.076022378360081e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.87484562844038, + "num_tokens": 149438737.0, + "step": 124270 + }, + { + "entropy": 1.8429612591862679, + "epoch": 0.3852571261176153, + "grad_norm": 3.8294622898101807, + "learning_rate": 4.075858388290356e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.855830529332161, + "num_tokens": 149451375.0, + "step": 124280 + }, + { + "entropy": 1.8703719988465308, + "epoch": 0.38528812524266504, + "grad_norm": 8.686786651611328, + "learning_rate": 4.075694418012412e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8558905765414238, + "num_tokens": 149463577.0, + "step": 124290 + }, + { + "entropy": 1.9531716659665108, + "epoch": 0.3853191243677147, + "grad_norm": 10.40597915649414, + "learning_rate": 4.075530467522267e-06, + "loss": 0.5363, + "mean_token_accuracy": 0.8417243003845215, + "num_tokens": 149474986.0, + "step": 124300 + }, + { + "entropy": 1.9056412920355796, + "epoch": 0.38535012349276443, + "grad_norm": 9.468035697937012, + "learning_rate": 4.0753665368159415e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8571117267012596, + "num_tokens": 149486586.0, + "step": 124310 + }, + { + "entropy": 1.8688831850886345, + "epoch": 0.3853811226178141, + "grad_norm": 7.645292282104492, + "learning_rate": 4.0752026258894575e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8533623024821282, + "num_tokens": 149499014.0, + "step": 124320 + }, + { + "entropy": 1.8921454787254333, + "epoch": 0.3854121217428638, + "grad_norm": 3.7551565170288086, + "learning_rate": 4.0750387347388356e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8515683576464653, + "num_tokens": 149511579.0, + "step": 124330 + }, + { + "entropy": 1.933222909271717, + "epoch": 0.3854431208679135, + "grad_norm": 9.027485847473145, + "learning_rate": 4.074874863360102e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8418224409222603, + "num_tokens": 149522694.0, + "step": 124340 + }, + { + "entropy": 1.88301939368248, + "epoch": 0.3854741199929632, + "grad_norm": 7.786672115325928, + "learning_rate": 4.074711011749281e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8608944162726402, + "num_tokens": 149534481.0, + "step": 124350 + }, + { + "entropy": 1.870416359603405, + "epoch": 0.3855051191180129, + "grad_norm": 7.477562427520752, + "learning_rate": 4.0745471799023966e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8581738561391831, + "num_tokens": 149546190.0, + "step": 124360 + }, + { + "entropy": 1.8137023776769639, + "epoch": 0.3855361182430626, + "grad_norm": 8.037935256958008, + "learning_rate": 4.074383367815478e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.855655774474144, + "num_tokens": 149559214.0, + "step": 124370 + }, + { + "entropy": 1.890501284599304, + "epoch": 0.3855671173681123, + "grad_norm": 4.326991558074951, + "learning_rate": 4.074219575484553e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8478201061487198, + "num_tokens": 149570897.0, + "step": 124380 + }, + { + "entropy": 1.8549866631627083, + "epoch": 0.385598116493162, + "grad_norm": 9.21172046661377, + "learning_rate": 4.07405580290565e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.8404435843229294, + "num_tokens": 149583130.0, + "step": 124390 + }, + { + "entropy": 1.7634894296526908, + "epoch": 0.38562911561821167, + "grad_norm": 2.975172996520996, + "learning_rate": 4.073892050074802e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8556309074163437, + "num_tokens": 149597170.0, + "step": 124400 + }, + { + "entropy": 1.9400777205824853, + "epoch": 0.3856601147432614, + "grad_norm": 4.058889389038086, + "learning_rate": 4.073728316988036e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8479943916201591, + "num_tokens": 149608439.0, + "step": 124410 + }, + { + "entropy": 1.9313759356737137, + "epoch": 0.38569111386831106, + "grad_norm": 9.538460731506348, + "learning_rate": 4.073564603641389e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.8419632911682129, + "num_tokens": 149619530.0, + "step": 124420 + }, + { + "entropy": 1.8018861994147302, + "epoch": 0.3857221129933608, + "grad_norm": 3.8183743953704834, + "learning_rate": 4.073400910030892e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8584095433354377, + "num_tokens": 149632347.0, + "step": 124430 + }, + { + "entropy": 1.8204200729727744, + "epoch": 0.38575311211841046, + "grad_norm": 4.118322849273682, + "learning_rate": 4.073237236152582e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8663435325026512, + "num_tokens": 149645135.0, + "step": 124440 + }, + { + "entropy": 1.8887554615736009, + "epoch": 0.3857841112434601, + "grad_norm": 2.7827911376953125, + "learning_rate": 4.073073582002494e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8540782734751702, + "num_tokens": 149657595.0, + "step": 124450 + }, + { + "entropy": 1.8850025668740273, + "epoch": 0.38581511036850985, + "grad_norm": 8.43443489074707, + "learning_rate": 4.0729099475766645e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8449831783771515, + "num_tokens": 149669584.0, + "step": 124460 + }, + { + "entropy": 1.873554064333439, + "epoch": 0.3858461094935595, + "grad_norm": 7.510732173919678, + "learning_rate": 4.072746332871133e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8536177009344101, + "num_tokens": 149681728.0, + "step": 124470 + }, + { + "entropy": 1.9078126922249794, + "epoch": 0.38587710861860924, + "grad_norm": 7.718918800354004, + "learning_rate": 4.072582737881938e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8453854739665985, + "num_tokens": 149693031.0, + "step": 124480 + }, + { + "entropy": 1.8811730653047563, + "epoch": 0.3859081077436589, + "grad_norm": 7.272776126861572, + "learning_rate": 4.0724191626051195e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8460710808634758, + "num_tokens": 149704960.0, + "step": 124490 + }, + { + "entropy": 1.930656510591507, + "epoch": 0.38593910686870864, + "grad_norm": 7.962609767913818, + "learning_rate": 4.07225560703672e-06, + "loss": 0.5241, + "mean_token_accuracy": 0.8429863288998604, + "num_tokens": 149716392.0, + "step": 124500 + }, + { + "entropy": 1.9329011127352715, + "epoch": 0.3859701059937583, + "grad_norm": 9.818818092346191, + "learning_rate": 4.072092071172782e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.8381221756339073, + "num_tokens": 149727625.0, + "step": 124510 + }, + { + "entropy": 1.841926720738411, + "epoch": 0.38600110511880803, + "grad_norm": 9.0508394241333, + "learning_rate": 4.071928555009349e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8424027681350708, + "num_tokens": 149740643.0, + "step": 124520 + }, + { + "entropy": 1.8008276581764222, + "epoch": 0.3860321042438577, + "grad_norm": 7.965822219848633, + "learning_rate": 4.071765058542466e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8529940769076347, + "num_tokens": 149753969.0, + "step": 124530 + }, + { + "entropy": 1.9138699561357497, + "epoch": 0.3860631033689074, + "grad_norm": 4.05026388168335, + "learning_rate": 4.07160158176818e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8561643600463867, + "num_tokens": 149765733.0, + "step": 124540 + }, + { + "entropy": 1.9274681612849236, + "epoch": 0.3860941024939571, + "grad_norm": 8.39354133605957, + "learning_rate": 4.071438124682538e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8445206269621849, + "num_tokens": 149777474.0, + "step": 124550 + }, + { + "entropy": 1.8434103056788445, + "epoch": 0.3861251016190068, + "grad_norm": 3.958385944366455, + "learning_rate": 4.071274687281586e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8522407054901123, + "num_tokens": 149789629.0, + "step": 124560 + }, + { + "entropy": 1.9501070573925972, + "epoch": 0.3861561007440565, + "grad_norm": 3.8399457931518555, + "learning_rate": 4.071111269561375e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8507745549082756, + "num_tokens": 149800781.0, + "step": 124570 + }, + { + "entropy": 1.8660836547613144, + "epoch": 0.3861870998691062, + "grad_norm": 3.3907036781311035, + "learning_rate": 4.0709478715179555e-06, + "loss": 0.441, + "mean_token_accuracy": 0.852207650244236, + "num_tokens": 149813258.0, + "step": 124580 + }, + { + "entropy": 1.9248910203576088, + "epoch": 0.3862180989941559, + "grad_norm": 3.793597459793091, + "learning_rate": 4.070784493147379e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8523493394255638, + "num_tokens": 149824551.0, + "step": 124590 + }, + { + "entropy": 1.913597546517849, + "epoch": 0.3862490981192056, + "grad_norm": 8.392363548278809, + "learning_rate": 4.0706211344457e-06, + "loss": 0.506, + "mean_token_accuracy": 0.846868097782135, + "num_tokens": 149836161.0, + "step": 124600 + }, + { + "entropy": 1.8576731622219085, + "epoch": 0.38628009724425527, + "grad_norm": 8.112271308898926, + "learning_rate": 4.070457795408968e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8608527705073357, + "num_tokens": 149848097.0, + "step": 124610 + }, + { + "entropy": 1.8224573642015458, + "epoch": 0.386311096369305, + "grad_norm": 3.8690743446350098, + "learning_rate": 4.0702944760332415e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8590767070651054, + "num_tokens": 149860778.0, + "step": 124620 + }, + { + "entropy": 1.8621004924178124, + "epoch": 0.38634209549435466, + "grad_norm": 8.412699699401855, + "learning_rate": 4.070131176314576e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8528893545269967, + "num_tokens": 149872659.0, + "step": 124630 + }, + { + "entropy": 1.8653160884976387, + "epoch": 0.3863730946194044, + "grad_norm": 4.710209846496582, + "learning_rate": 4.069967896249026e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8521018907427788, + "num_tokens": 149885265.0, + "step": 124640 + }, + { + "entropy": 1.8452029392123221, + "epoch": 0.38640409374445406, + "grad_norm": 7.326101779937744, + "learning_rate": 4.069804635832654e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8424705550074577, + "num_tokens": 149897692.0, + "step": 124650 + }, + { + "entropy": 1.8934701785445214, + "epoch": 0.3864350928695038, + "grad_norm": 9.25935173034668, + "learning_rate": 4.069641395061516e-06, + "loss": 0.515, + "mean_token_accuracy": 0.8364736765623093, + "num_tokens": 149910077.0, + "step": 124660 + }, + { + "entropy": 1.900476099550724, + "epoch": 0.38646609199455345, + "grad_norm": 11.954851150512695, + "learning_rate": 4.069478173931674e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8435856163501739, + "num_tokens": 149921585.0, + "step": 124670 + }, + { + "entropy": 1.9049529254436492, + "epoch": 0.3864970911196032, + "grad_norm": 8.252809524536133, + "learning_rate": 4.069314972439188e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8509178534150124, + "num_tokens": 149933471.0, + "step": 124680 + }, + { + "entropy": 1.914153940975666, + "epoch": 0.38652809024465284, + "grad_norm": 8.16724681854248, + "learning_rate": 4.0691517905801225e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8479727208614349, + "num_tokens": 149944962.0, + "step": 124690 + }, + { + "entropy": 1.9179544657468797, + "epoch": 0.3865590893697025, + "grad_norm": 8.549003601074219, + "learning_rate": 4.0689886283505405e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8387503355741501, + "num_tokens": 149956453.0, + "step": 124700 + }, + { + "entropy": 1.944225938618183, + "epoch": 0.38659008849475224, + "grad_norm": 8.267438888549805, + "learning_rate": 4.068825485746507e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8414188399910927, + "num_tokens": 149968058.0, + "step": 124710 + }, + { + "entropy": 1.8530795663595199, + "epoch": 0.3866210876198019, + "grad_norm": 3.6572959423065186, + "learning_rate": 4.068662362764087e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8553397417068481, + "num_tokens": 149980786.0, + "step": 124720 + }, + { + "entropy": 1.8606258913874627, + "epoch": 0.38665208674485163, + "grad_norm": 9.493535995483398, + "learning_rate": 4.06849925939935e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8513687327504158, + "num_tokens": 149994145.0, + "step": 124730 + }, + { + "entropy": 1.9374653339385985, + "epoch": 0.3866830858699013, + "grad_norm": 8.496316909790039, + "learning_rate": 4.0683361756483615e-06, + "loss": 0.514, + "mean_token_accuracy": 0.8495217755436897, + "num_tokens": 150005113.0, + "step": 124740 + }, + { + "entropy": 1.8425881132483481, + "epoch": 0.386714084994951, + "grad_norm": 8.17953872680664, + "learning_rate": 4.068173111507192e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8537844985723495, + "num_tokens": 150018495.0, + "step": 124750 + }, + { + "entropy": 1.8482965901494026, + "epoch": 0.3867450841200007, + "grad_norm": 5.024632453918457, + "learning_rate": 4.068010066971912e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8522414952516556, + "num_tokens": 150031464.0, + "step": 124760 + }, + { + "entropy": 1.934024366736412, + "epoch": 0.3867760832450504, + "grad_norm": 8.86674976348877, + "learning_rate": 4.067847042038591e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.84689302444458, + "num_tokens": 150043092.0, + "step": 124770 + }, + { + "entropy": 1.871715322136879, + "epoch": 0.3868070823701001, + "grad_norm": 3.58791184425354, + "learning_rate": 4.067684036703305e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8503245860338211, + "num_tokens": 150055771.0, + "step": 124780 + }, + { + "entropy": 1.833419594168663, + "epoch": 0.3868380814951498, + "grad_norm": 7.293220043182373, + "learning_rate": 4.067521050962126e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8605907499790192, + "num_tokens": 150068131.0, + "step": 124790 + }, + { + "entropy": 1.8409696131944657, + "epoch": 0.3868690806201995, + "grad_norm": 7.163329124450684, + "learning_rate": 4.067358084811128e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.853056974709034, + "num_tokens": 150080793.0, + "step": 124800 + }, + { + "entropy": 1.8847892254590988, + "epoch": 0.3869000797452492, + "grad_norm": 8.353142738342285, + "learning_rate": 4.067195138246388e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8540623351931572, + "num_tokens": 150093526.0, + "step": 124810 + }, + { + "entropy": 2.0080907315015795, + "epoch": 0.3869310788702989, + "grad_norm": 8.072431564331055, + "learning_rate": 4.067032211263983e-06, + "loss": 0.5497, + "mean_token_accuracy": 0.8375670999288559, + "num_tokens": 150104134.0, + "step": 124820 + }, + { + "entropy": 1.8393555164337159, + "epoch": 0.3869620779953486, + "grad_norm": 2.448547840118408, + "learning_rate": 4.06686930385999e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8578300714492798, + "num_tokens": 150116750.0, + "step": 124830 + }, + { + "entropy": 1.8307070322334766, + "epoch": 0.38699307712039827, + "grad_norm": 3.879143476486206, + "learning_rate": 4.06670641603049e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8474024429917335, + "num_tokens": 150130258.0, + "step": 124840 + }, + { + "entropy": 1.9666341543197632, + "epoch": 0.387024076245448, + "grad_norm": 7.913427352905273, + "learning_rate": 4.066543547771561e-06, + "loss": 0.5647, + "mean_token_accuracy": 0.8228250458836556, + "num_tokens": 150141397.0, + "step": 124850 + }, + { + "entropy": 1.9169279381632804, + "epoch": 0.38705507537049766, + "grad_norm": 7.764703750610352, + "learning_rate": 4.066380699079287e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8474949941039085, + "num_tokens": 150153315.0, + "step": 124860 + }, + { + "entropy": 1.9108419820666314, + "epoch": 0.3870860744955474, + "grad_norm": 8.689712524414062, + "learning_rate": 4.066217869949748e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8523913621902466, + "num_tokens": 150165774.0, + "step": 124870 + }, + { + "entropy": 1.9573454082012176, + "epoch": 0.38711707362059705, + "grad_norm": 7.767167091369629, + "learning_rate": 4.0660550603790286e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8440973252058029, + "num_tokens": 150177549.0, + "step": 124880 + }, + { + "entropy": 1.8977736786007882, + "epoch": 0.3871480727456468, + "grad_norm": 4.0380425453186035, + "learning_rate": 4.065892270363214e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8582114532589913, + "num_tokens": 150189097.0, + "step": 124890 + }, + { + "entropy": 1.832365168631077, + "epoch": 0.38717907187069645, + "grad_norm": 4.158474922180176, + "learning_rate": 4.06572949989839e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8578600034117698, + "num_tokens": 150202332.0, + "step": 124900 + }, + { + "entropy": 1.9247950717806817, + "epoch": 0.38721007099574617, + "grad_norm": 8.511116027832031, + "learning_rate": 4.065566748980642e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8546933025121689, + "num_tokens": 150213722.0, + "step": 124910 + }, + { + "entropy": 1.8911121636629105, + "epoch": 0.38724107012079584, + "grad_norm": 7.204141616821289, + "learning_rate": 4.065404017606059e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8537615105509758, + "num_tokens": 150225589.0, + "step": 124920 + }, + { + "entropy": 1.8968706488609315, + "epoch": 0.38727206924584556, + "grad_norm": 7.295899868011475, + "learning_rate": 4.06524130577073e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8530194133520126, + "num_tokens": 150236807.0, + "step": 124930 + }, + { + "entropy": 1.8373442202806474, + "epoch": 0.38730306837089523, + "grad_norm": 8.396409034729004, + "learning_rate": 4.065078613470747e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8561060458421708, + "num_tokens": 150249132.0, + "step": 124940 + }, + { + "entropy": 1.834753280878067, + "epoch": 0.3873340674959449, + "grad_norm": 8.531126022338867, + "learning_rate": 4.0649159407021976e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8525745943188667, + "num_tokens": 150261614.0, + "step": 124950 + }, + { + "entropy": 1.9869435787200929, + "epoch": 0.3873650666209946, + "grad_norm": 8.008123397827148, + "learning_rate": 4.064753287461177e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8471731215715408, + "num_tokens": 150272837.0, + "step": 124960 + }, + { + "entropy": 1.9212356433272362, + "epoch": 0.3873960657460443, + "grad_norm": 9.139792442321777, + "learning_rate": 4.064590653743777e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8533764079213142, + "num_tokens": 150284178.0, + "step": 124970 + }, + { + "entropy": 1.9715091735124588, + "epoch": 0.387427064871094, + "grad_norm": 7.504796504974365, + "learning_rate": 4.0644280395460925e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.8431293860077858, + "num_tokens": 150295307.0, + "step": 124980 + }, + { + "entropy": 1.9665077716112136, + "epoch": 0.3874580639961437, + "grad_norm": 7.719574928283691, + "learning_rate": 4.064265444864221e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.8392296627163887, + "num_tokens": 150306726.0, + "step": 124990 + }, + { + "entropy": 1.8853733599185944, + "epoch": 0.3874890631211934, + "grad_norm": 7.892232894897461, + "learning_rate": 4.064102869694256e-06, + "loss": 0.494, + "mean_token_accuracy": 0.84963990598917, + "num_tokens": 150318729.0, + "step": 125000 + }, + { + "entropy": 1.8874075815081597, + "epoch": 0.3875200622462431, + "grad_norm": 8.320926666259766, + "learning_rate": 4.063940314032298e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8477603197097778, + "num_tokens": 150329931.0, + "step": 125010 + }, + { + "entropy": 1.9008696138858796, + "epoch": 0.3875510613712928, + "grad_norm": 8.944568634033203, + "learning_rate": 4.063777777874444e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8521299719810486, + "num_tokens": 150341393.0, + "step": 125020 + }, + { + "entropy": 1.9297545284032822, + "epoch": 0.3875820604963425, + "grad_norm": 8.812226295471191, + "learning_rate": 4.063615261216795e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8559518858790398, + "num_tokens": 150352453.0, + "step": 125030 + }, + { + "entropy": 1.9456378057599069, + "epoch": 0.3876130596213922, + "grad_norm": 7.481411933898926, + "learning_rate": 4.063452764055453e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8542866796255112, + "num_tokens": 150363663.0, + "step": 125040 + }, + { + "entropy": 1.843149345368147, + "epoch": 0.38764405874644187, + "grad_norm": 7.64381742477417, + "learning_rate": 4.063290286386518e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8573990285396575, + "num_tokens": 150376498.0, + "step": 125050 + }, + { + "entropy": 1.926984567940235, + "epoch": 0.3876750578714916, + "grad_norm": 7.688129901885986, + "learning_rate": 4.063127828206096e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8490166947245598, + "num_tokens": 150387561.0, + "step": 125060 + }, + { + "entropy": 1.8717382565140723, + "epoch": 0.38770605699654126, + "grad_norm": 3.5536370277404785, + "learning_rate": 4.062965389510289e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8601777657866478, + "num_tokens": 150399917.0, + "step": 125070 + }, + { + "entropy": 1.8823929965496062, + "epoch": 0.387737056121591, + "grad_norm": 4.017459869384766, + "learning_rate": 4.062802970295203e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8708006635308265, + "num_tokens": 150412228.0, + "step": 125080 + }, + { + "entropy": 1.9448561638593673, + "epoch": 0.38776805524664065, + "grad_norm": 9.202611923217773, + "learning_rate": 4.062640570556946e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.842719966173172, + "num_tokens": 150423234.0, + "step": 125090 + }, + { + "entropy": 1.9790111839771272, + "epoch": 0.3877990543716904, + "grad_norm": 8.097685813903809, + "learning_rate": 4.062478190291623e-06, + "loss": 0.5268, + "mean_token_accuracy": 0.8447407379746437, + "num_tokens": 150434200.0, + "step": 125100 + }, + { + "entropy": 1.9440268695354461, + "epoch": 0.38783005349674005, + "grad_norm": 6.974407196044922, + "learning_rate": 4.062315829495345e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.8498797222971917, + "num_tokens": 150446535.0, + "step": 125110 + }, + { + "entropy": 1.7851212821900844, + "epoch": 0.38786105262178977, + "grad_norm": 3.838473320007324, + "learning_rate": 4.062153488164221e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8568575367331505, + "num_tokens": 150459866.0, + "step": 125120 + }, + { + "entropy": 1.8512066915631293, + "epoch": 0.38789205174683944, + "grad_norm": 8.369192123413086, + "learning_rate": 4.0619911662943615e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8748730286955834, + "num_tokens": 150471449.0, + "step": 125130 + }, + { + "entropy": 1.9556255728006362, + "epoch": 0.38792305087188916, + "grad_norm": 8.41415023803711, + "learning_rate": 4.06182886388188e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8445400312542916, + "num_tokens": 150482247.0, + "step": 125140 + }, + { + "entropy": 1.9603990465402603, + "epoch": 0.38795404999693883, + "grad_norm": 11.196761131286621, + "learning_rate": 4.061666580922887e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.842097707092762, + "num_tokens": 150492674.0, + "step": 125150 + }, + { + "entropy": 1.9363124072551727, + "epoch": 0.38798504912198856, + "grad_norm": 7.762144565582275, + "learning_rate": 4.0615043174135e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8462110877037048, + "num_tokens": 150503435.0, + "step": 125160 + }, + { + "entropy": 1.8757888361811639, + "epoch": 0.3880160482470382, + "grad_norm": 7.726643085479736, + "learning_rate": 4.061342073349831e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8505197063088417, + "num_tokens": 150515945.0, + "step": 125170 + }, + { + "entropy": 1.903496977686882, + "epoch": 0.38804704737208795, + "grad_norm": 8.79697036743164, + "learning_rate": 4.061179848727998e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8483191832900048, + "num_tokens": 150527375.0, + "step": 125180 + }, + { + "entropy": 1.8405410438776015, + "epoch": 0.3880780464971376, + "grad_norm": 7.9003586769104, + "learning_rate": 4.061017643544118e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.854458573460579, + "num_tokens": 150540114.0, + "step": 125190 + }, + { + "entropy": 1.94074095338583, + "epoch": 0.3881090456221873, + "grad_norm": 7.372513294219971, + "learning_rate": 4.06085545779431e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8478065714240074, + "num_tokens": 150551950.0, + "step": 125200 + }, + { + "entropy": 1.9185340121388434, + "epoch": 0.388140044747237, + "grad_norm": 8.795543670654297, + "learning_rate": 4.060693291474694e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8613968342542648, + "num_tokens": 150563662.0, + "step": 125210 + }, + { + "entropy": 1.780056294798851, + "epoch": 0.3881710438722867, + "grad_norm": 6.95949125289917, + "learning_rate": 4.06053114458139e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8654720067977906, + "num_tokens": 150576950.0, + "step": 125220 + }, + { + "entropy": 1.856193946301937, + "epoch": 0.3882020429973364, + "grad_norm": 6.373475074768066, + "learning_rate": 4.060369017110518e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.854848501086235, + "num_tokens": 150589326.0, + "step": 125230 + }, + { + "entropy": 1.8003525391221047, + "epoch": 0.3882330421223861, + "grad_norm": 10.690299034118652, + "learning_rate": 4.060206909058204e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8460852265357971, + "num_tokens": 150602928.0, + "step": 125240 + }, + { + "entropy": 1.7654723808169366, + "epoch": 0.3882640412474358, + "grad_norm": 3.999009132385254, + "learning_rate": 4.060044820420571e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8616338670253754, + "num_tokens": 150616871.0, + "step": 125250 + }, + { + "entropy": 1.8826753467321395, + "epoch": 0.38829504037248547, + "grad_norm": 8.977141380310059, + "learning_rate": 4.059882751193742e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8482603415846824, + "num_tokens": 150628654.0, + "step": 125260 + }, + { + "entropy": 1.904711978137493, + "epoch": 0.3883260394975352, + "grad_norm": 7.154592990875244, + "learning_rate": 4.059720701373846e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8563105076551437, + "num_tokens": 150639652.0, + "step": 125270 + }, + { + "entropy": 1.9130603343248367, + "epoch": 0.38835703862258486, + "grad_norm": 7.394834518432617, + "learning_rate": 4.059558670957009e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8381910189986229, + "num_tokens": 150651907.0, + "step": 125280 + }, + { + "entropy": 1.8639254540205001, + "epoch": 0.3883880377476346, + "grad_norm": 7.72260856628418, + "learning_rate": 4.059396659939359e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8464089751243591, + "num_tokens": 150664885.0, + "step": 125290 + }, + { + "entropy": 1.8498515799641608, + "epoch": 0.38841903687268425, + "grad_norm": 4.0547871589660645, + "learning_rate": 4.059234668317025e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8645502269268036, + "num_tokens": 150677100.0, + "step": 125300 + }, + { + "entropy": 1.844531959295273, + "epoch": 0.388450035997734, + "grad_norm": 8.105052947998047, + "learning_rate": 4.059072696086137e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8442863315343857, + "num_tokens": 150689825.0, + "step": 125310 + }, + { + "entropy": 1.894164504110813, + "epoch": 0.38848103512278365, + "grad_norm": 7.457192897796631, + "learning_rate": 4.058910743242828e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8596911758184433, + "num_tokens": 150701538.0, + "step": 125320 + }, + { + "entropy": 1.7797346115112305, + "epoch": 0.38851203424783337, + "grad_norm": 8.076257705688477, + "learning_rate": 4.05874880978323e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8683399274945259, + "num_tokens": 150713771.0, + "step": 125330 + }, + { + "entropy": 1.8412994965910912, + "epoch": 0.38854303337288304, + "grad_norm": 8.09342098236084, + "learning_rate": 4.058586895703477e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8563569858670235, + "num_tokens": 150726360.0, + "step": 125340 + }, + { + "entropy": 1.8202768236398696, + "epoch": 0.38857403249793276, + "grad_norm": 8.33877182006836, + "learning_rate": 4.058425000999703e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8674609377980232, + "num_tokens": 150738959.0, + "step": 125350 + }, + { + "entropy": 1.8868903383612632, + "epoch": 0.38860503162298243, + "grad_norm": 6.489454746246338, + "learning_rate": 4.058263125668045e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8557512402534485, + "num_tokens": 150750974.0, + "step": 125360 + }, + { + "entropy": 1.8816938310861588, + "epoch": 0.38863603074803216, + "grad_norm": 3.8871240615844727, + "learning_rate": 4.058101269704639e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.8465664908289909, + "num_tokens": 150763017.0, + "step": 125370 + }, + { + "entropy": 1.967480507493019, + "epoch": 0.3886670298730818, + "grad_norm": 8.47203254699707, + "learning_rate": 4.0579394331056216e-06, + "loss": 0.5202, + "mean_token_accuracy": 0.8347558289766311, + "num_tokens": 150773877.0, + "step": 125380 + }, + { + "entropy": 1.846261352300644, + "epoch": 0.38869802899813155, + "grad_norm": 8.301435470581055, + "learning_rate": 4.057777615867134e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8551396802067757, + "num_tokens": 150786304.0, + "step": 125390 + }, + { + "entropy": 1.8747814357280732, + "epoch": 0.3887290281231812, + "grad_norm": 4.218428134918213, + "learning_rate": 4.057615817985316e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8523815110325813, + "num_tokens": 150797931.0, + "step": 125400 + }, + { + "entropy": 1.90915547311306, + "epoch": 0.38876002724823094, + "grad_norm": 4.607741355895996, + "learning_rate": 4.057454039456308e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8459027215838433, + "num_tokens": 150809552.0, + "step": 125410 + }, + { + "entropy": 1.9121482491493225, + "epoch": 0.3887910263732806, + "grad_norm": 3.7354090213775635, + "learning_rate": 4.057292280276254e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8472837626934051, + "num_tokens": 150820587.0, + "step": 125420 + }, + { + "entropy": 1.8476616084575652, + "epoch": 0.38882202549833034, + "grad_norm": 8.451462745666504, + "learning_rate": 4.057130540441295e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8486483857035637, + "num_tokens": 150832835.0, + "step": 125430 + }, + { + "entropy": 1.8385225757956505, + "epoch": 0.38885302462338, + "grad_norm": 10.56385612487793, + "learning_rate": 4.0569688199475765e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8505441263318062, + "num_tokens": 150845471.0, + "step": 125440 + }, + { + "entropy": 1.8741918548941612, + "epoch": 0.3888840237484297, + "grad_norm": 7.1380085945129395, + "learning_rate": 4.056807118791245e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8532653734087944, + "num_tokens": 150857775.0, + "step": 125450 + }, + { + "entropy": 1.8643349304795265, + "epoch": 0.3889150228734794, + "grad_norm": 7.431187152862549, + "learning_rate": 4.056645436968446e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8581862285733223, + "num_tokens": 150869444.0, + "step": 125460 + }, + { + "entropy": 1.892384371161461, + "epoch": 0.38894602199852907, + "grad_norm": 5.618492603302002, + "learning_rate": 4.056483774475327e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8496564209461213, + "num_tokens": 150881306.0, + "step": 125470 + }, + { + "entropy": 1.9383496299386025, + "epoch": 0.3889770211235788, + "grad_norm": 9.300127029418945, + "learning_rate": 4.0563221313080375e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8460114181041718, + "num_tokens": 150892540.0, + "step": 125480 + }, + { + "entropy": 1.969851815700531, + "epoch": 0.38900802024862846, + "grad_norm": 10.012271881103516, + "learning_rate": 4.056160507462727e-06, + "loss": 0.5231, + "mean_token_accuracy": 0.8429014101624489, + "num_tokens": 150902915.0, + "step": 125490 + }, + { + "entropy": 1.801128013432026, + "epoch": 0.3890390193736782, + "grad_norm": 7.085505962371826, + "learning_rate": 4.055998902935546e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8595685541629792, + "num_tokens": 150915697.0, + "step": 125500 + }, + { + "entropy": 1.899875347316265, + "epoch": 0.38907001849872785, + "grad_norm": 7.814671993255615, + "learning_rate": 4.055837317722647e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8444872334599495, + "num_tokens": 150927597.0, + "step": 125510 + }, + { + "entropy": 1.9567907482385636, + "epoch": 0.3891010176237776, + "grad_norm": 8.426041603088379, + "learning_rate": 4.055675751820183e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.8452389568090439, + "num_tokens": 150938164.0, + "step": 125520 + }, + { + "entropy": 1.9653800070285796, + "epoch": 0.38913201674882725, + "grad_norm": 8.481525421142578, + "learning_rate": 4.055514205224307e-06, + "loss": 0.544, + "mean_token_accuracy": 0.8337672173976898, + "num_tokens": 150948930.0, + "step": 125530 + }, + { + "entropy": 1.8959671720862388, + "epoch": 0.38916301587387697, + "grad_norm": 3.948054075241089, + "learning_rate": 4.055352677931176e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8531327813863754, + "num_tokens": 150960961.0, + "step": 125540 + }, + { + "entropy": 1.934750607609749, + "epoch": 0.38919401499892664, + "grad_norm": 10.166702270507812, + "learning_rate": 4.055191169936945e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.8438746586441994, + "num_tokens": 150971612.0, + "step": 125550 + }, + { + "entropy": 1.9563965529203415, + "epoch": 0.38922501412397636, + "grad_norm": 9.370492935180664, + "learning_rate": 4.055029681237772e-06, + "loss": 0.5295, + "mean_token_accuracy": 0.8403390854597091, + "num_tokens": 150982858.0, + "step": 125560 + }, + { + "entropy": 1.8438422948122024, + "epoch": 0.38925601324902603, + "grad_norm": 7.086329460144043, + "learning_rate": 4.054868211829815e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8686971604824066, + "num_tokens": 150994866.0, + "step": 125570 + }, + { + "entropy": 1.858051958680153, + "epoch": 0.38928701237407576, + "grad_norm": 8.350471496582031, + "learning_rate": 4.054706761709233e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8600917294621467, + "num_tokens": 151007774.0, + "step": 125580 + }, + { + "entropy": 1.8484682857990264, + "epoch": 0.3893180114991254, + "grad_norm": 8.876520156860352, + "learning_rate": 4.054545330872188e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.853037391602993, + "num_tokens": 151020015.0, + "step": 125590 + }, + { + "entropy": 1.8667544916272163, + "epoch": 0.38934901062417515, + "grad_norm": 8.567416191101074, + "learning_rate": 4.054383919314841e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8439093992114067, + "num_tokens": 151033160.0, + "step": 125600 + }, + { + "entropy": 1.825586286187172, + "epoch": 0.3893800097492248, + "grad_norm": 6.282723426818848, + "learning_rate": 4.054222527033354e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8462097451090813, + "num_tokens": 151046530.0, + "step": 125610 + }, + { + "entropy": 1.9695552736520767, + "epoch": 0.38941100887427454, + "grad_norm": 11.352642059326172, + "learning_rate": 4.054061154023891e-06, + "loss": 0.5256, + "mean_token_accuracy": 0.8349577113986015, + "num_tokens": 151057681.0, + "step": 125620 + }, + { + "entropy": 1.8230530142784118, + "epoch": 0.3894420079993242, + "grad_norm": 3.0121681690216064, + "learning_rate": 4.053899800282617e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8639827251434327, + "num_tokens": 151070490.0, + "step": 125630 + }, + { + "entropy": 1.9242575734853744, + "epoch": 0.38947300712437394, + "grad_norm": 7.926018714904785, + "learning_rate": 4.053738465805698e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8579060733318329, + "num_tokens": 151083191.0, + "step": 125640 + }, + { + "entropy": 1.8758929014205932, + "epoch": 0.3895040062494236, + "grad_norm": 4.752248287200928, + "learning_rate": 4.0535771505893e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8485000059008598, + "num_tokens": 151096082.0, + "step": 125650 + }, + { + "entropy": 1.9501382157206535, + "epoch": 0.38953500537447333, + "grad_norm": 4.228458881378174, + "learning_rate": 4.053415854629593e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8403441533446312, + "num_tokens": 151107333.0, + "step": 125660 + }, + { + "entropy": 1.985108458995819, + "epoch": 0.389566004499523, + "grad_norm": 9.141436576843262, + "learning_rate": 4.053254577922745e-06, + "loss": 0.5549, + "mean_token_accuracy": 0.8309772089123726, + "num_tokens": 151118366.0, + "step": 125670 + }, + { + "entropy": 1.9115714818239211, + "epoch": 0.3895970036245727, + "grad_norm": 3.911010980606079, + "learning_rate": 4.053093320464925e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8501477718353272, + "num_tokens": 151130118.0, + "step": 125680 + }, + { + "entropy": 1.976603901386261, + "epoch": 0.3896280027496224, + "grad_norm": 8.365656852722168, + "learning_rate": 4.0529320822523064e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8426365479826927, + "num_tokens": 151140975.0, + "step": 125690 + }, + { + "entropy": 1.900614893436432, + "epoch": 0.38965900187467206, + "grad_norm": 8.18988037109375, + "learning_rate": 4.05277086328106e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8563957184553146, + "num_tokens": 151152889.0, + "step": 125700 + }, + { + "entropy": 1.9433655887842178, + "epoch": 0.3896900009997218, + "grad_norm": 7.366755962371826, + "learning_rate": 4.05260966354736e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.855003735423088, + "num_tokens": 151164778.0, + "step": 125710 + }, + { + "entropy": 1.882521539926529, + "epoch": 0.38972100012477146, + "grad_norm": 7.148138523101807, + "learning_rate": 4.05244848304738e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8558038398623466, + "num_tokens": 151176942.0, + "step": 125720 + }, + { + "entropy": 1.8969984114170075, + "epoch": 0.3897519992498212, + "grad_norm": 6.991118907928467, + "learning_rate": 4.052287321777295e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8517007067799568, + "num_tokens": 151188748.0, + "step": 125730 + }, + { + "entropy": 1.8719248160719872, + "epoch": 0.38978299837487085, + "grad_norm": 8.159533500671387, + "learning_rate": 4.052126179733283e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8400168597698212, + "num_tokens": 151201409.0, + "step": 125740 + }, + { + "entropy": 1.8552237376570702, + "epoch": 0.3898139974999206, + "grad_norm": 4.326776504516602, + "learning_rate": 4.051965056911522e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8551932379603386, + "num_tokens": 151213645.0, + "step": 125750 + }, + { + "entropy": 1.88699069917202, + "epoch": 0.38984499662497024, + "grad_norm": 7.64647102355957, + "learning_rate": 4.051803953308188e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8565610900521279, + "num_tokens": 151225668.0, + "step": 125760 + }, + { + "entropy": 1.8817365534603596, + "epoch": 0.38987599575001997, + "grad_norm": 9.160236358642578, + "learning_rate": 4.051642868919464e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8513475954532623, + "num_tokens": 151238004.0, + "step": 125770 + }, + { + "entropy": 1.8975436851382255, + "epoch": 0.38990699487506963, + "grad_norm": 6.79841423034668, + "learning_rate": 4.0514818037415285e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8526737794280053, + "num_tokens": 151250091.0, + "step": 125780 + }, + { + "entropy": 1.9469924926757813, + "epoch": 0.38993799400011936, + "grad_norm": 7.432028770446777, + "learning_rate": 4.051320757770564e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8524128124117851, + "num_tokens": 151261967.0, + "step": 125790 + }, + { + "entropy": 1.9410534769296646, + "epoch": 0.38996899312516903, + "grad_norm": 3.4470081329345703, + "learning_rate": 4.051159731002754e-06, + "loss": 0.483, + "mean_token_accuracy": 0.844953115284443, + "num_tokens": 151273163.0, + "step": 125800 + }, + { + "entropy": 1.9425782278180121, + "epoch": 0.38999999225021875, + "grad_norm": 7.442099094390869, + "learning_rate": 4.050998723434281e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8562641754746437, + "num_tokens": 151284496.0, + "step": 125810 + }, + { + "entropy": 1.8365588143467904, + "epoch": 0.3900309913752684, + "grad_norm": 8.772604942321777, + "learning_rate": 4.050837735061332e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8577765494585037, + "num_tokens": 151297315.0, + "step": 125820 + }, + { + "entropy": 1.8861859440803528, + "epoch": 0.39006199050031815, + "grad_norm": 7.54157829284668, + "learning_rate": 4.050676765880091e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8518304273486137, + "num_tokens": 151309596.0, + "step": 125830 + }, + { + "entropy": 1.8863596022129059, + "epoch": 0.3900929896253678, + "grad_norm": 9.19207763671875, + "learning_rate": 4.0505158158867485e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8433287471532822, + "num_tokens": 151320769.0, + "step": 125840 + }, + { + "entropy": 1.9262786269187928, + "epoch": 0.39012398875041754, + "grad_norm": 5.372649192810059, + "learning_rate": 4.050354885077489e-06, + "loss": 0.5296, + "mean_token_accuracy": 0.8311650440096855, + "num_tokens": 151332882.0, + "step": 125850 + }, + { + "entropy": 1.913481391966343, + "epoch": 0.3901549878754672, + "grad_norm": 8.025369644165039, + "learning_rate": 4.050193973448504e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8461648374795914, + "num_tokens": 151344485.0, + "step": 125860 + }, + { + "entropy": 1.9113824039697647, + "epoch": 0.39018598700051693, + "grad_norm": 2.482351779937744, + "learning_rate": 4.050033080995983e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8611611127853394, + "num_tokens": 151356413.0, + "step": 125870 + }, + { + "entropy": 1.9198512472212315, + "epoch": 0.3902169861255666, + "grad_norm": 9.45190143585205, + "learning_rate": 4.049872207716118e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8463770866394043, + "num_tokens": 151369155.0, + "step": 125880 + }, + { + "entropy": 1.9116858139634132, + "epoch": 0.3902479852506163, + "grad_norm": 7.803056716918945, + "learning_rate": 4.0497113536051e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8526840418577194, + "num_tokens": 151381458.0, + "step": 125890 + }, + { + "entropy": 1.8976224765181542, + "epoch": 0.390278984375666, + "grad_norm": 9.358246803283691, + "learning_rate": 4.049550518659125e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8442497715353966, + "num_tokens": 151393539.0, + "step": 125900 + }, + { + "entropy": 1.8757586926221848, + "epoch": 0.3903099835007157, + "grad_norm": 7.887486457824707, + "learning_rate": 4.049389702874385e-06, + "loss": 0.5401, + "mean_token_accuracy": 0.8386203497648239, + "num_tokens": 151406150.0, + "step": 125910 + }, + { + "entropy": 1.851169802248478, + "epoch": 0.3903409826257654, + "grad_norm": 9.607558250427246, + "learning_rate": 4.049228906247078e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8545200616121292, + "num_tokens": 151419052.0, + "step": 125920 + }, + { + "entropy": 1.8620378583669663, + "epoch": 0.39037198175081506, + "grad_norm": 4.9543986320495605, + "learning_rate": 4.049068128773399e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8555585101246834, + "num_tokens": 151431366.0, + "step": 125930 + }, + { + "entropy": 1.9239520370960235, + "epoch": 0.3904029808758648, + "grad_norm": 7.769740104675293, + "learning_rate": 4.048907370449547e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8550285518169403, + "num_tokens": 151442721.0, + "step": 125940 + }, + { + "entropy": 1.9248721554875374, + "epoch": 0.39043398000091445, + "grad_norm": 7.393733978271484, + "learning_rate": 4.048746631271719e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8553394272923469, + "num_tokens": 151454600.0, + "step": 125950 + }, + { + "entropy": 1.8826577708125114, + "epoch": 0.3904649791259642, + "grad_norm": 8.594537734985352, + "learning_rate": 4.048585911236118e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8587727382779121, + "num_tokens": 151466942.0, + "step": 125960 + }, + { + "entropy": 1.8130075439810753, + "epoch": 0.39049597825101384, + "grad_norm": 9.427589416503906, + "learning_rate": 4.048425210338942e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8479527696967125, + "num_tokens": 151480312.0, + "step": 125970 + }, + { + "entropy": 1.9348285168409347, + "epoch": 0.39052697737606357, + "grad_norm": 8.240906715393066, + "learning_rate": 4.048264528576393e-06, + "loss": 0.5186, + "mean_token_accuracy": 0.8450803935527802, + "num_tokens": 151491045.0, + "step": 125980 + }, + { + "entropy": 1.8613590627908707, + "epoch": 0.39055797650111324, + "grad_norm": 4.4133734703063965, + "learning_rate": 4.048103865944676e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8640340849757194, + "num_tokens": 151502174.0, + "step": 125990 + }, + { + "entropy": 1.905706176161766, + "epoch": 0.39058897562616296, + "grad_norm": 7.541008949279785, + "learning_rate": 4.047943222439993e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8492152616381645, + "num_tokens": 151513749.0, + "step": 126000 + }, + { + "entropy": 1.88614112585783, + "epoch": 0.39061997475121263, + "grad_norm": 8.197542190551758, + "learning_rate": 4.047782598058551e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8495625197887421, + "num_tokens": 151525450.0, + "step": 126010 + }, + { + "entropy": 1.8722525179386138, + "epoch": 0.39065097387626235, + "grad_norm": 7.363889217376709, + "learning_rate": 4.047621992796555e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8617817178368569, + "num_tokens": 151538034.0, + "step": 126020 + }, + { + "entropy": 1.9224463373422622, + "epoch": 0.390681973001312, + "grad_norm": 9.672123908996582, + "learning_rate": 4.047461406650214e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8521727249026299, + "num_tokens": 151549821.0, + "step": 126030 + }, + { + "entropy": 1.9509896606206893, + "epoch": 0.39071297212636175, + "grad_norm": 7.809655666351318, + "learning_rate": 4.047300839615734e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.8400021120905876, + "num_tokens": 151561202.0, + "step": 126040 + }, + { + "entropy": 1.8582612618803978, + "epoch": 0.3907439712514114, + "grad_norm": 4.203096866607666, + "learning_rate": 4.047140291689325e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.864464844763279, + "num_tokens": 151574760.0, + "step": 126050 + }, + { + "entropy": 1.9256421521306037, + "epoch": 0.39077497037646114, + "grad_norm": 7.765346050262451, + "learning_rate": 4.0469797628671996e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8578554153442383, + "num_tokens": 151586713.0, + "step": 126060 + }, + { + "entropy": 1.9531617119908333, + "epoch": 0.3908059695015108, + "grad_norm": 7.535767078399658, + "learning_rate": 4.046819253145565e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8484017476439476, + "num_tokens": 151598618.0, + "step": 126070 + }, + { + "entropy": 1.8639555156230927, + "epoch": 0.39083696862656053, + "grad_norm": 8.342696189880371, + "learning_rate": 4.046658762520638e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8571187525987625, + "num_tokens": 151610588.0, + "step": 126080 + }, + { + "entropy": 1.8678423702716827, + "epoch": 0.3908679677516102, + "grad_norm": 3.639899730682373, + "learning_rate": 4.046498290988629e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8643327265977859, + "num_tokens": 151622625.0, + "step": 126090 + }, + { + "entropy": 1.8767968013882637, + "epoch": 0.3908989668766599, + "grad_norm": 7.925841808319092, + "learning_rate": 4.046337838545756e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8563542291522026, + "num_tokens": 151636178.0, + "step": 126100 + }, + { + "entropy": 1.8535062327980996, + "epoch": 0.3909299660017096, + "grad_norm": 6.453927516937256, + "learning_rate": 4.046177405188231e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8552742913365364, + "num_tokens": 151649219.0, + "step": 126110 + }, + { + "entropy": 1.9538506001234055, + "epoch": 0.3909609651267593, + "grad_norm": 8.822364807128906, + "learning_rate": 4.046016990912272e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8550143092870712, + "num_tokens": 151660943.0, + "step": 126120 + }, + { + "entropy": 1.8665287435054778, + "epoch": 0.390991964251809, + "grad_norm": 8.066899299621582, + "learning_rate": 4.045856595714099e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8569189369678497, + "num_tokens": 151673423.0, + "step": 126130 + }, + { + "entropy": 1.9262668624520303, + "epoch": 0.3910229633768587, + "grad_norm": 7.15204381942749, + "learning_rate": 4.045696219589927e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.847204127907753, + "num_tokens": 151684997.0, + "step": 126140 + }, + { + "entropy": 1.9528811901807785, + "epoch": 0.3910539625019084, + "grad_norm": 8.068526268005371, + "learning_rate": 4.045535862535979e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.8466339007019996, + "num_tokens": 151696214.0, + "step": 126150 + }, + { + "entropy": 1.8696447387337685, + "epoch": 0.3910849616269581, + "grad_norm": 8.606704711914062, + "learning_rate": 4.045375524548474e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.852445213496685, + "num_tokens": 151708632.0, + "step": 126160 + }, + { + "entropy": 1.899414698779583, + "epoch": 0.3911159607520078, + "grad_norm": 7.6843342781066895, + "learning_rate": 4.0452152056236355e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8635346978902817, + "num_tokens": 151720657.0, + "step": 126170 + }, + { + "entropy": 1.8455950021743774, + "epoch": 0.39114695987705744, + "grad_norm": 8.676742553710938, + "learning_rate": 4.045054905757685e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8539397269487381, + "num_tokens": 151732951.0, + "step": 126180 + }, + { + "entropy": 1.9040052101016045, + "epoch": 0.39117795900210717, + "grad_norm": 3.939152956008911, + "learning_rate": 4.044894624946848e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.8455827251076699, + "num_tokens": 151744428.0, + "step": 126190 + }, + { + "entropy": 1.840182974934578, + "epoch": 0.39120895812715684, + "grad_norm": 7.910426616668701, + "learning_rate": 4.044734363187349e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8497889280319214, + "num_tokens": 151757200.0, + "step": 126200 + }, + { + "entropy": 1.9612287282943726, + "epoch": 0.39123995725220656, + "grad_norm": 8.55207347869873, + "learning_rate": 4.044574120475414e-06, + "loss": 0.5217, + "mean_token_accuracy": 0.8453647062182427, + "num_tokens": 151768054.0, + "step": 126210 + }, + { + "entropy": 1.925032651424408, + "epoch": 0.39127095637725623, + "grad_norm": 10.712833404541016, + "learning_rate": 4.044413896807269e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.845679797232151, + "num_tokens": 151778718.0, + "step": 126220 + }, + { + "entropy": 1.9098077476024629, + "epoch": 0.39130195550230595, + "grad_norm": 10.31223201751709, + "learning_rate": 4.044253692179145e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8485254973173142, + "num_tokens": 151789936.0, + "step": 126230 + }, + { + "entropy": 1.926163762807846, + "epoch": 0.3913329546273556, + "grad_norm": 7.671618938446045, + "learning_rate": 4.044093506587268e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8456138476729393, + "num_tokens": 151800690.0, + "step": 126240 + }, + { + "entropy": 1.9659644454717635, + "epoch": 0.39136395375240535, + "grad_norm": 10.632147789001465, + "learning_rate": 4.043933340027872e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.8462962001562119, + "num_tokens": 151811555.0, + "step": 126250 + }, + { + "entropy": 1.9211021691560746, + "epoch": 0.391394952877455, + "grad_norm": 8.475778579711914, + "learning_rate": 4.043773192497186e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8569527640938759, + "num_tokens": 151823099.0, + "step": 126260 + }, + { + "entropy": 1.8699581772089005, + "epoch": 0.39142595200250474, + "grad_norm": 7.7866530418396, + "learning_rate": 4.043613063991444e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8597669988870621, + "num_tokens": 151834750.0, + "step": 126270 + }, + { + "entropy": 1.9547498047351837, + "epoch": 0.3914569511275544, + "grad_norm": 8.300743103027344, + "learning_rate": 4.043452954506876e-06, + "loss": 0.5292, + "mean_token_accuracy": 0.8422427833080292, + "num_tokens": 151846263.0, + "step": 126280 + }, + { + "entropy": 1.8979168817400933, + "epoch": 0.39148795025260413, + "grad_norm": 9.36401653289795, + "learning_rate": 4.04329286403972e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8518133416771889, + "num_tokens": 151857975.0, + "step": 126290 + }, + { + "entropy": 1.8685284316539765, + "epoch": 0.3915189493776538, + "grad_norm": 7.125948429107666, + "learning_rate": 4.043132792586211e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8564907148480415, + "num_tokens": 151869148.0, + "step": 126300 + }, + { + "entropy": 1.932071290910244, + "epoch": 0.3915499485027035, + "grad_norm": 8.5045804977417, + "learning_rate": 4.042972740142585e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.85165533721447, + "num_tokens": 151880669.0, + "step": 126310 + }, + { + "entropy": 1.898838460445404, + "epoch": 0.3915809476277532, + "grad_norm": 8.528064727783203, + "learning_rate": 4.04281270670508e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8440765485167503, + "num_tokens": 151893003.0, + "step": 126320 + }, + { + "entropy": 1.8209847897291183, + "epoch": 0.3916119467528029, + "grad_norm": 9.340559959411621, + "learning_rate": 4.042652692269934e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.859174670279026, + "num_tokens": 151905874.0, + "step": 126330 + }, + { + "entropy": 1.95557002723217, + "epoch": 0.3916429458778526, + "grad_norm": 8.08029842376709, + "learning_rate": 4.042492696833388e-06, + "loss": 0.5362, + "mean_token_accuracy": 0.8356934756040573, + "num_tokens": 151917270.0, + "step": 126340 + }, + { + "entropy": 1.8023017689585685, + "epoch": 0.3916739450029023, + "grad_norm": 8.126782417297363, + "learning_rate": 4.042332720391681e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8570797294378281, + "num_tokens": 151931119.0, + "step": 126350 + }, + { + "entropy": 1.881139089167118, + "epoch": 0.391704944127952, + "grad_norm": 8.571240425109863, + "learning_rate": 4.042172762941057e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8508111834526062, + "num_tokens": 151942409.0, + "step": 126360 + }, + { + "entropy": 1.8666364967823028, + "epoch": 0.3917359432530017, + "grad_norm": 8.6209077835083, + "learning_rate": 4.042012824477757e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8415684968233108, + "num_tokens": 151954323.0, + "step": 126370 + }, + { + "entropy": 1.8725038409233092, + "epoch": 0.3917669423780514, + "grad_norm": 7.019345760345459, + "learning_rate": 4.041852904998025e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.850992114841938, + "num_tokens": 151967314.0, + "step": 126380 + }, + { + "entropy": 1.8260181456804276, + "epoch": 0.3917979415031011, + "grad_norm": 8.401409149169922, + "learning_rate": 4.0416930044981064e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.84242385327816, + "num_tokens": 151980943.0, + "step": 126390 + }, + { + "entropy": 1.9128731414675713, + "epoch": 0.39182894062815077, + "grad_norm": 3.6636550426483154, + "learning_rate": 4.041533122974248e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8557316899299622, + "num_tokens": 151992446.0, + "step": 126400 + }, + { + "entropy": 1.8801569387316703, + "epoch": 0.3918599397532005, + "grad_norm": 4.1143107414245605, + "learning_rate": 4.041373260422696e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8517079591751099, + "num_tokens": 152004499.0, + "step": 126410 + }, + { + "entropy": 1.9254588589072228, + "epoch": 0.39189093887825016, + "grad_norm": 7.596971035003662, + "learning_rate": 4.041213416839698e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8518746435642243, + "num_tokens": 152016295.0, + "step": 126420 + }, + { + "entropy": 1.9142033576965332, + "epoch": 0.39192193800329983, + "grad_norm": 7.351873397827148, + "learning_rate": 4.0410535922215046e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8495833858847618, + "num_tokens": 152027558.0, + "step": 126430 + }, + { + "entropy": 1.892107328772545, + "epoch": 0.39195293712834955, + "grad_norm": 9.62601089477539, + "learning_rate": 4.040893786564364e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8505282923579216, + "num_tokens": 152039402.0, + "step": 126440 + }, + { + "entropy": 1.8616481512784957, + "epoch": 0.3919839362533992, + "grad_norm": 7.576359748840332, + "learning_rate": 4.040733999864529e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8497416257858277, + "num_tokens": 152051794.0, + "step": 126450 + }, + { + "entropy": 1.7416065499186515, + "epoch": 0.39201493537844895, + "grad_norm": 4.21736478805542, + "learning_rate": 4.0405742321182505e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.857705582678318, + "num_tokens": 152065713.0, + "step": 126460 + }, + { + "entropy": 1.8679188892245293, + "epoch": 0.3920459345034986, + "grad_norm": 7.051849365234375, + "learning_rate": 4.040414483321783e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8559218123555183, + "num_tokens": 152078414.0, + "step": 126470 + }, + { + "entropy": 1.9041266560554504, + "epoch": 0.39207693362854834, + "grad_norm": 8.104182243347168, + "learning_rate": 4.0402547534713795e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8375504925847054, + "num_tokens": 152090227.0, + "step": 126480 + }, + { + "entropy": 1.9575203120708466, + "epoch": 0.392107932753598, + "grad_norm": 7.981137752532959, + "learning_rate": 4.0400950425632965e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8419002950191498, + "num_tokens": 152101040.0, + "step": 126490 + }, + { + "entropy": 1.907607588171959, + "epoch": 0.39213893187864773, + "grad_norm": 8.032891273498535, + "learning_rate": 4.03993535059379e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8457751244306564, + "num_tokens": 152113002.0, + "step": 126500 + }, + { + "entropy": 1.8676989868283271, + "epoch": 0.3921699310036974, + "grad_norm": 8.83267879486084, + "learning_rate": 4.039775677559117e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.839282539486885, + "num_tokens": 152125801.0, + "step": 126510 + }, + { + "entropy": 1.943473118543625, + "epoch": 0.3922009301287471, + "grad_norm": 9.639033317565918, + "learning_rate": 4.0396160234555365e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.8399049505591393, + "num_tokens": 152137319.0, + "step": 126520 + }, + { + "entropy": 1.885578916966915, + "epoch": 0.3922319292537968, + "grad_norm": 4.101627349853516, + "learning_rate": 4.039456388279307e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8486464694142342, + "num_tokens": 152148855.0, + "step": 126530 + }, + { + "entropy": 1.8974051237106324, + "epoch": 0.3922629283788465, + "grad_norm": 4.128299713134766, + "learning_rate": 4.03929677202669e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8376821115612983, + "num_tokens": 152161206.0, + "step": 126540 + }, + { + "entropy": 1.8535172313451767, + "epoch": 0.3922939275038962, + "grad_norm": 8.3539400100708, + "learning_rate": 4.0391371746939465e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8649252921342849, + "num_tokens": 152172699.0, + "step": 126550 + }, + { + "entropy": 1.954978096485138, + "epoch": 0.3923249266289459, + "grad_norm": 3.532893180847168, + "learning_rate": 4.03897759627734e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8465725928544998, + "num_tokens": 152184753.0, + "step": 126560 + }, + { + "entropy": 1.9255331978201866, + "epoch": 0.3923559257539956, + "grad_norm": 10.25304126739502, + "learning_rate": 4.038818036773132e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8339841410517692, + "num_tokens": 152196175.0, + "step": 126570 + }, + { + "entropy": 1.893292284011841, + "epoch": 0.3923869248790453, + "grad_norm": 4.092190742492676, + "learning_rate": 4.038658496177589e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8404751896858216, + "num_tokens": 152208309.0, + "step": 126580 + }, + { + "entropy": 1.892499789595604, + "epoch": 0.392417924004095, + "grad_norm": 7.2047438621521, + "learning_rate": 4.0384989744869764e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8644706353545188, + "num_tokens": 152220743.0, + "step": 126590 + }, + { + "entropy": 1.8051872000098228, + "epoch": 0.3924489231291447, + "grad_norm": 8.140718460083008, + "learning_rate": 4.03833947169756e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.857630018889904, + "num_tokens": 152234128.0, + "step": 126600 + }, + { + "entropy": 1.9284190341830254, + "epoch": 0.39247992225419437, + "grad_norm": 8.732551574707031, + "learning_rate": 4.038179987805609e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8481160834431648, + "num_tokens": 152245832.0, + "step": 126610 + }, + { + "entropy": 1.8666399344801903, + "epoch": 0.3925109213792441, + "grad_norm": 5.154988765716553, + "learning_rate": 4.038020522807391e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8664086773991585, + "num_tokens": 152258281.0, + "step": 126620 + }, + { + "entropy": 1.9185226619243623, + "epoch": 0.39254192050429376, + "grad_norm": 8.254905700683594, + "learning_rate": 4.037861076699175e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8595322445034981, + "num_tokens": 152270283.0, + "step": 126630 + }, + { + "entropy": 1.881806494295597, + "epoch": 0.3925729196293435, + "grad_norm": 7.021206378936768, + "learning_rate": 4.037701649477234e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8548443630337715, + "num_tokens": 152282446.0, + "step": 126640 + }, + { + "entropy": 1.9258292108774184, + "epoch": 0.39260391875439316, + "grad_norm": 8.01991081237793, + "learning_rate": 4.037542241137839e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8480772167444229, + "num_tokens": 152294011.0, + "step": 126650 + }, + { + "entropy": 1.976582020521164, + "epoch": 0.3926349178794429, + "grad_norm": 8.293559074401855, + "learning_rate": 4.0373828516772615e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8434167116880417, + "num_tokens": 152304686.0, + "step": 126660 + }, + { + "entropy": 1.8964557886123656, + "epoch": 0.39266591700449255, + "grad_norm": 8.829238891601562, + "learning_rate": 4.037223481091777e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8579582333564758, + "num_tokens": 152316928.0, + "step": 126670 + }, + { + "entropy": 1.889539574831724, + "epoch": 0.3926969161295422, + "grad_norm": 9.728771209716797, + "learning_rate": 4.037064129377661e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8489784002304077, + "num_tokens": 152329516.0, + "step": 126680 + }, + { + "entropy": 1.9084494978189468, + "epoch": 0.39272791525459194, + "grad_norm": 8.081507682800293, + "learning_rate": 4.036904796531187e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8519722208380699, + "num_tokens": 152340705.0, + "step": 126690 + }, + { + "entropy": 1.9536496102809906, + "epoch": 0.3927589143796416, + "grad_norm": 7.832414150238037, + "learning_rate": 4.036745482548634e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8512647017836571, + "num_tokens": 152351867.0, + "step": 126700 + }, + { + "entropy": 1.8545057475566864, + "epoch": 0.39278991350469133, + "grad_norm": 3.9451944828033447, + "learning_rate": 4.03658618742628e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8511400848627091, + "num_tokens": 152364222.0, + "step": 126710 + }, + { + "entropy": 1.8836131229996682, + "epoch": 0.392820912629741, + "grad_norm": 7.37644624710083, + "learning_rate": 4.036426911160403e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8465021848678589, + "num_tokens": 152376071.0, + "step": 126720 + }, + { + "entropy": 1.908482810854912, + "epoch": 0.39285191175479073, + "grad_norm": 8.756072998046875, + "learning_rate": 4.036267653747283e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8456471160054206, + "num_tokens": 152387712.0, + "step": 126730 + }, + { + "entropy": 1.8821537524461747, + "epoch": 0.3928829108798404, + "grad_norm": 6.746029853820801, + "learning_rate": 4.036108415183202e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.853573402762413, + "num_tokens": 152399464.0, + "step": 126740 + }, + { + "entropy": 1.7690861240029334, + "epoch": 0.3929139100048901, + "grad_norm": 8.50271224975586, + "learning_rate": 4.035949195464444e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8643131896853447, + "num_tokens": 152412703.0, + "step": 126750 + }, + { + "entropy": 1.9057681828737258, + "epoch": 0.3929449091299398, + "grad_norm": 7.735318183898926, + "learning_rate": 4.035789994587288e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8501883447170258, + "num_tokens": 152423734.0, + "step": 126760 + }, + { + "entropy": 1.980392262339592, + "epoch": 0.3929759082549895, + "grad_norm": 9.195259094238281, + "learning_rate": 4.0356308125480205e-06, + "loss": 0.57, + "mean_token_accuracy": 0.830004945397377, + "num_tokens": 152435210.0, + "step": 126770 + }, + { + "entropy": 1.9139831706881523, + "epoch": 0.3930069073800392, + "grad_norm": 3.758761167526245, + "learning_rate": 4.035471649342926e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8497570350766182, + "num_tokens": 152447602.0, + "step": 126780 + }, + { + "entropy": 1.9336750611662865, + "epoch": 0.3930379065050889, + "grad_norm": 7.929681777954102, + "learning_rate": 4.035312504968292e-06, + "loss": 0.461, + "mean_token_accuracy": 0.852272717654705, + "num_tokens": 152458841.0, + "step": 126790 + }, + { + "entropy": 1.922458928823471, + "epoch": 0.3930689056301386, + "grad_norm": 7.431619644165039, + "learning_rate": 4.035153379420406e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8492286145687103, + "num_tokens": 152469765.0, + "step": 126800 + }, + { + "entropy": 1.9320431604981423, + "epoch": 0.3930999047551883, + "grad_norm": 7.810932636260986, + "learning_rate": 4.034994272695553e-06, + "loss": 0.5431, + "mean_token_accuracy": 0.8349621132016182, + "num_tokens": 152481239.0, + "step": 126810 + }, + { + "entropy": 1.911438837647438, + "epoch": 0.39313090388023797, + "grad_norm": 9.275655746459961, + "learning_rate": 4.0348351847900256e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8501843497157097, + "num_tokens": 152492347.0, + "step": 126820 + }, + { + "entropy": 1.8184763744473458, + "epoch": 0.3931619030052877, + "grad_norm": 8.230948448181152, + "learning_rate": 4.034676115700114e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8377979636192322, + "num_tokens": 152505653.0, + "step": 126830 + }, + { + "entropy": 1.9131136581301689, + "epoch": 0.39319290213033736, + "grad_norm": 7.071003437042236, + "learning_rate": 4.0345170654221075e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8539568200707436, + "num_tokens": 152517329.0, + "step": 126840 + }, + { + "entropy": 1.9276820585131644, + "epoch": 0.3932239012553871, + "grad_norm": 7.994453430175781, + "learning_rate": 4.0343580339523e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8405784666538239, + "num_tokens": 152528760.0, + "step": 126850 + }, + { + "entropy": 1.8666841223835946, + "epoch": 0.39325490038043676, + "grad_norm": 9.046262741088867, + "learning_rate": 4.034199021286984e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8560965120792389, + "num_tokens": 152540919.0, + "step": 126860 + }, + { + "entropy": 1.8318147659301758, + "epoch": 0.3932858995054865, + "grad_norm": 9.050399780273438, + "learning_rate": 4.0340400274224554e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8550884276628494, + "num_tokens": 152553211.0, + "step": 126870 + }, + { + "entropy": 1.8964627265930176, + "epoch": 0.39331689863053615, + "grad_norm": 7.297853946685791, + "learning_rate": 4.033881052355008e-06, + "loss": 0.467, + "mean_token_accuracy": 0.840998500585556, + "num_tokens": 152565550.0, + "step": 126880 + }, + { + "entropy": 1.8291507720947267, + "epoch": 0.3933478977555859, + "grad_norm": 4.372984886169434, + "learning_rate": 4.03372209608094e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8626144364476204, + "num_tokens": 152578617.0, + "step": 126890 + }, + { + "entropy": 1.9425703272223473, + "epoch": 0.39337889688063554, + "grad_norm": 9.826508522033691, + "learning_rate": 4.033563158596547e-06, + "loss": 0.5442, + "mean_token_accuracy": 0.8346266880631447, + "num_tokens": 152589941.0, + "step": 126900 + }, + { + "entropy": 1.8560305163264275, + "epoch": 0.39340989600568527, + "grad_norm": 9.724489212036133, + "learning_rate": 4.0334042398981285e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8553323075175285, + "num_tokens": 152602105.0, + "step": 126910 + }, + { + "entropy": 1.921321277320385, + "epoch": 0.39344089513073494, + "grad_norm": 8.991716384887695, + "learning_rate": 4.033245339981984e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8386275783181191, + "num_tokens": 152613975.0, + "step": 126920 + }, + { + "entropy": 1.9884714633226395, + "epoch": 0.3934718942557846, + "grad_norm": 7.993074893951416, + "learning_rate": 4.033086458844415e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8479173853993416, + "num_tokens": 152624593.0, + "step": 126930 + }, + { + "entropy": 1.8521944627165794, + "epoch": 0.39350289338083433, + "grad_norm": 8.29841136932373, + "learning_rate": 4.032927596481721e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8486550331115723, + "num_tokens": 152636893.0, + "step": 126940 + }, + { + "entropy": 1.853082077205181, + "epoch": 0.393533892505884, + "grad_norm": 8.913239479064941, + "learning_rate": 4.032768752890206e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8620326921343804, + "num_tokens": 152648688.0, + "step": 126950 + }, + { + "entropy": 1.9352153725922108, + "epoch": 0.3935648916309337, + "grad_norm": 7.439796447753906, + "learning_rate": 4.032609928066174e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8431309580802917, + "num_tokens": 152660197.0, + "step": 126960 + }, + { + "entropy": 1.9533848583698272, + "epoch": 0.3935958907559834, + "grad_norm": 12.858529090881348, + "learning_rate": 4.032451122005929e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8557668402791023, + "num_tokens": 152671469.0, + "step": 126970 + }, + { + "entropy": 1.872800037264824, + "epoch": 0.3936268898810331, + "grad_norm": 7.507105350494385, + "learning_rate": 4.032292334705776e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8509954452514649, + "num_tokens": 152684255.0, + "step": 126980 + }, + { + "entropy": 1.8584757506847382, + "epoch": 0.3936578890060828, + "grad_norm": 3.439596176147461, + "learning_rate": 4.032133566162022e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.853032597899437, + "num_tokens": 152696156.0, + "step": 126990 + }, + { + "entropy": 1.841444942355156, + "epoch": 0.3936888881311325, + "grad_norm": 8.641845703125, + "learning_rate": 4.031974816370976e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8564640492200851, + "num_tokens": 152708857.0, + "step": 127000 + }, + { + "entropy": 1.9723876774311067, + "epoch": 0.3937198872561822, + "grad_norm": 8.145781517028809, + "learning_rate": 4.031816085328946e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8606492772698402, + "num_tokens": 152719903.0, + "step": 127010 + }, + { + "entropy": 1.8345705628395081, + "epoch": 0.3937508863812319, + "grad_norm": 7.235647201538086, + "learning_rate": 4.03165737303224e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8646158143877983, + "num_tokens": 152732756.0, + "step": 127020 + }, + { + "entropy": 1.9376291260123253, + "epoch": 0.39378188550628157, + "grad_norm": 3.9151713848114014, + "learning_rate": 4.031498679477171e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8439146399497985, + "num_tokens": 152744727.0, + "step": 127030 + }, + { + "entropy": 1.903841333091259, + "epoch": 0.3938128846313313, + "grad_norm": 8.165589332580566, + "learning_rate": 4.03134000466005e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.840572564303875, + "num_tokens": 152756881.0, + "step": 127040 + }, + { + "entropy": 1.8646742686629296, + "epoch": 0.39384388375638096, + "grad_norm": 5.022664546966553, + "learning_rate": 4.0311813485771896e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8504794493317605, + "num_tokens": 152769508.0, + "step": 127050 + }, + { + "entropy": 1.9327761620283126, + "epoch": 0.3938748828814307, + "grad_norm": 7.481938362121582, + "learning_rate": 4.031022711224904e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8473081424832344, + "num_tokens": 152781394.0, + "step": 127060 + }, + { + "entropy": 1.9920929014682769, + "epoch": 0.39390588200648036, + "grad_norm": 7.522092819213867, + "learning_rate": 4.030864092599508e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.8395040348172188, + "num_tokens": 152792361.0, + "step": 127070 + }, + { + "entropy": 1.8420628361403941, + "epoch": 0.3939368811315301, + "grad_norm": 7.69534158706665, + "learning_rate": 4.030705492697317e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8510744541883468, + "num_tokens": 152805416.0, + "step": 127080 + }, + { + "entropy": 1.9465318858623504, + "epoch": 0.39396788025657975, + "grad_norm": 7.537118911743164, + "learning_rate": 4.030546911514647e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.851013197004795, + "num_tokens": 152817242.0, + "step": 127090 + }, + { + "entropy": 1.861098751425743, + "epoch": 0.3939988793816295, + "grad_norm": 8.616077423095703, + "learning_rate": 4.030388349047818e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8491953507065773, + "num_tokens": 152830104.0, + "step": 127100 + }, + { + "entropy": 1.9238109394907952, + "epoch": 0.39402987850667914, + "grad_norm": 8.807307243347168, + "learning_rate": 4.0302298052931475e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8525387272238731, + "num_tokens": 152841452.0, + "step": 127110 + }, + { + "entropy": 1.9179314807057382, + "epoch": 0.39406087763172887, + "grad_norm": 10.16199779510498, + "learning_rate": 4.030071280246956e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8477940663695336, + "num_tokens": 152852810.0, + "step": 127120 + }, + { + "entropy": 1.913467188179493, + "epoch": 0.39409187675677854, + "grad_norm": 3.260228395462036, + "learning_rate": 4.029912773905563e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.848815667629242, + "num_tokens": 152864554.0, + "step": 127130 + }, + { + "entropy": 1.8686377540230752, + "epoch": 0.39412287588182826, + "grad_norm": 9.483077049255371, + "learning_rate": 4.029754286265293e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8370977133512497, + "num_tokens": 152877426.0, + "step": 127140 + }, + { + "entropy": 1.929710279405117, + "epoch": 0.39415387500687793, + "grad_norm": 2.918980360031128, + "learning_rate": 4.029595817322465e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8524357438087463, + "num_tokens": 152889339.0, + "step": 127150 + }, + { + "entropy": 1.879020507633686, + "epoch": 0.39418487413192765, + "grad_norm": 7.318241596221924, + "learning_rate": 4.029437367073407e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.868370558321476, + "num_tokens": 152902178.0, + "step": 127160 + }, + { + "entropy": 1.8702649146318435, + "epoch": 0.3942158732569773, + "grad_norm": 8.086983680725098, + "learning_rate": 4.029278935514442e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.847602641582489, + "num_tokens": 152914178.0, + "step": 127170 + }, + { + "entropy": 1.8651431202888489, + "epoch": 0.394246872382027, + "grad_norm": 8.561652183532715, + "learning_rate": 4.029120522641896e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8519322633743286, + "num_tokens": 152926886.0, + "step": 127180 + }, + { + "entropy": 1.8370242148637772, + "epoch": 0.3942778715070767, + "grad_norm": 4.489937782287598, + "learning_rate": 4.028962128452095e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8447151646018028, + "num_tokens": 152940001.0, + "step": 127190 + }, + { + "entropy": 1.806211344152689, + "epoch": 0.3943088706321264, + "grad_norm": 7.989223957061768, + "learning_rate": 4.02880375294137e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8558181077241898, + "num_tokens": 152953714.0, + "step": 127200 + }, + { + "entropy": 1.8846919029951095, + "epoch": 0.3943398697571761, + "grad_norm": 7.853222846984863, + "learning_rate": 4.0286453961060475e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8626481354236603, + "num_tokens": 152965285.0, + "step": 127210 + }, + { + "entropy": 1.8866841107606889, + "epoch": 0.3943708688822258, + "grad_norm": 3.7979695796966553, + "learning_rate": 4.0284870579424576e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8536775171756744, + "num_tokens": 152978547.0, + "step": 127220 + }, + { + "entropy": 1.9279194965958595, + "epoch": 0.3944018680072755, + "grad_norm": 8.925294876098633, + "learning_rate": 4.028328738446932e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8454255104064942, + "num_tokens": 152990780.0, + "step": 127230 + }, + { + "entropy": 1.8884064748883247, + "epoch": 0.39443286713232517, + "grad_norm": 4.231663227081299, + "learning_rate": 4.028170437615802e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8594095915555954, + "num_tokens": 153003179.0, + "step": 127240 + }, + { + "entropy": 1.9022858142852783, + "epoch": 0.3944638662573749, + "grad_norm": 4.5120038986206055, + "learning_rate": 4.028012155445402e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8501111850142479, + "num_tokens": 153015801.0, + "step": 127250 + }, + { + "entropy": 1.9620060488581657, + "epoch": 0.39449486538242456, + "grad_norm": 7.684929370880127, + "learning_rate": 4.027853891932065e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.854415363073349, + "num_tokens": 153027019.0, + "step": 127260 + }, + { + "entropy": 1.9046915173530579, + "epoch": 0.3945258645074743, + "grad_norm": 4.640239238739014, + "learning_rate": 4.027695647072125e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8494089663028717, + "num_tokens": 153039194.0, + "step": 127270 + }, + { + "entropy": 1.9410059854388237, + "epoch": 0.39455686363252396, + "grad_norm": 9.024331092834473, + "learning_rate": 4.027537420861921e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.846622771024704, + "num_tokens": 153050529.0, + "step": 127280 + }, + { + "entropy": 1.84144167304039, + "epoch": 0.3945878627575737, + "grad_norm": 10.240126609802246, + "learning_rate": 4.027379213297787e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8550687476992607, + "num_tokens": 153063542.0, + "step": 127290 + }, + { + "entropy": 1.8818937510251998, + "epoch": 0.39461886188262335, + "grad_norm": 4.128033638000488, + "learning_rate": 4.027221024376063e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8581587508320808, + "num_tokens": 153076371.0, + "step": 127300 + }, + { + "entropy": 1.9620690792798996, + "epoch": 0.3946498610076731, + "grad_norm": 8.45438289642334, + "learning_rate": 4.027062854093087e-06, + "loss": 0.5293, + "mean_token_accuracy": 0.8367072865366936, + "num_tokens": 153086758.0, + "step": 127310 + }, + { + "entropy": 1.879226452112198, + "epoch": 0.39468086013272274, + "grad_norm": 8.631587982177734, + "learning_rate": 4.0269047024452e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8429103165864944, + "num_tokens": 153099124.0, + "step": 127320 + }, + { + "entropy": 1.911029715836048, + "epoch": 0.39471185925777247, + "grad_norm": 6.511907577514648, + "learning_rate": 4.0267465694287424e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8632428720593452, + "num_tokens": 153110564.0, + "step": 127330 + }, + { + "entropy": 1.8814439699053764, + "epoch": 0.39474285838282214, + "grad_norm": 8.013952255249023, + "learning_rate": 4.026588455040057e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8586022913455963, + "num_tokens": 153123090.0, + "step": 127340 + }, + { + "entropy": 1.9142462641000748, + "epoch": 0.39477385750787186, + "grad_norm": 6.910181045532227, + "learning_rate": 4.0264303592754846e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8546132728457451, + "num_tokens": 153134532.0, + "step": 127350 + }, + { + "entropy": 1.8884952366352081, + "epoch": 0.39480485663292153, + "grad_norm": 11.307275772094727, + "learning_rate": 4.026272282131373e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8564356371760369, + "num_tokens": 153146234.0, + "step": 127360 + }, + { + "entropy": 1.845706556737423, + "epoch": 0.39483585575797125, + "grad_norm": 6.627220630645752, + "learning_rate": 4.026114223604065e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8640777125954628, + "num_tokens": 153159242.0, + "step": 127370 + }, + { + "entropy": 1.8887208893895149, + "epoch": 0.3948668548830209, + "grad_norm": 10.363401412963867, + "learning_rate": 4.025956183689907e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8451639533042907, + "num_tokens": 153172510.0, + "step": 127380 + }, + { + "entropy": 1.9080938518047332, + "epoch": 0.39489785400807065, + "grad_norm": 7.9390482902526855, + "learning_rate": 4.025798162385246e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8457752048969269, + "num_tokens": 153184696.0, + "step": 127390 + }, + { + "entropy": 1.9023298189043998, + "epoch": 0.3949288531331203, + "grad_norm": 8.671669006347656, + "learning_rate": 4.02564015968643e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.851709508895874, + "num_tokens": 153196489.0, + "step": 127400 + }, + { + "entropy": 1.8737532109022141, + "epoch": 0.39495985225817, + "grad_norm": 6.4061198234558105, + "learning_rate": 4.025482175589809e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8673104450106621, + "num_tokens": 153207793.0, + "step": 127410 + }, + { + "entropy": 1.8804204553365707, + "epoch": 0.3949908513832197, + "grad_norm": 7.620257377624512, + "learning_rate": 4.025324210091733e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8681916758418083, + "num_tokens": 153219491.0, + "step": 127420 + }, + { + "entropy": 1.803827053308487, + "epoch": 0.3950218505082694, + "grad_norm": 4.123118877410889, + "learning_rate": 4.0251662631885505e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8462093636393547, + "num_tokens": 153232842.0, + "step": 127430 + }, + { + "entropy": 1.870395976305008, + "epoch": 0.3950528496333191, + "grad_norm": 8.609538078308105, + "learning_rate": 4.025008334876618e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8614588662981987, + "num_tokens": 153244929.0, + "step": 127440 + }, + { + "entropy": 1.83759603202343, + "epoch": 0.39508384875836877, + "grad_norm": 8.134893417358398, + "learning_rate": 4.024850425152286e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8604315787553787, + "num_tokens": 153258122.0, + "step": 127450 + }, + { + "entropy": 1.919068345427513, + "epoch": 0.3951148478834185, + "grad_norm": 7.001434326171875, + "learning_rate": 4.024692534011908e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8476987138390542, + "num_tokens": 153269204.0, + "step": 127460 + }, + { + "entropy": 1.839623036980629, + "epoch": 0.39514584700846817, + "grad_norm": 8.048707962036133, + "learning_rate": 4.024534661451841e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8495053514838219, + "num_tokens": 153282189.0, + "step": 127470 + }, + { + "entropy": 1.9202652007341385, + "epoch": 0.3951768461335179, + "grad_norm": 8.451288223266602, + "learning_rate": 4.02437680746844e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.857101121544838, + "num_tokens": 153293971.0, + "step": 127480 + }, + { + "entropy": 1.8971919193863869, + "epoch": 0.39520784525856756, + "grad_norm": 7.40570592880249, + "learning_rate": 4.024218972058062e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8695945486426353, + "num_tokens": 153305994.0, + "step": 127490 + }, + { + "entropy": 1.8754101365804672, + "epoch": 0.3952388443836173, + "grad_norm": 7.888429641723633, + "learning_rate": 4.0240611552170665e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8595315098762513, + "num_tokens": 153319098.0, + "step": 127500 + }, + { + "entropy": 1.86890929043293, + "epoch": 0.39526984350866695, + "grad_norm": 8.941306114196777, + "learning_rate": 4.023903356941811e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8576035022735595, + "num_tokens": 153331242.0, + "step": 127510 + }, + { + "entropy": 1.8997353851795196, + "epoch": 0.3953008426337167, + "grad_norm": 7.998995780944824, + "learning_rate": 4.023745577228658e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8570963263511657, + "num_tokens": 153344304.0, + "step": 127520 + }, + { + "entropy": 1.820690706372261, + "epoch": 0.39533184175876634, + "grad_norm": 7.785648822784424, + "learning_rate": 4.023587816073965e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8614529862999916, + "num_tokens": 153358390.0, + "step": 127530 + }, + { + "entropy": 1.8702727839350701, + "epoch": 0.39536284088381607, + "grad_norm": 4.13304328918457, + "learning_rate": 4.023430073474097e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8575648859143257, + "num_tokens": 153370126.0, + "step": 127540 + }, + { + "entropy": 1.81645250543952, + "epoch": 0.39539384000886574, + "grad_norm": 2.846233606338501, + "learning_rate": 4.023272349425415e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8668262645602226, + "num_tokens": 153383650.0, + "step": 127550 + }, + { + "entropy": 1.899166676402092, + "epoch": 0.39542483913391546, + "grad_norm": 8.356348037719727, + "learning_rate": 4.023114643924286e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8431504383683205, + "num_tokens": 153395416.0, + "step": 127560 + }, + { + "entropy": 1.8529985576868058, + "epoch": 0.39545583825896513, + "grad_norm": 2.3607380390167236, + "learning_rate": 4.022956956967072e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8648950770497322, + "num_tokens": 153407938.0, + "step": 127570 + }, + { + "entropy": 1.8317057311534881, + "epoch": 0.39548683738401486, + "grad_norm": 8.973584175109863, + "learning_rate": 4.022799288550142e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8503635957837105, + "num_tokens": 153420425.0, + "step": 127580 + }, + { + "entropy": 1.967195200920105, + "epoch": 0.3955178365090645, + "grad_norm": 8.250533103942871, + "learning_rate": 4.022641638669861e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8539433136582375, + "num_tokens": 153431948.0, + "step": 127590 + }, + { + "entropy": 1.8443527117371559, + "epoch": 0.39554883563411425, + "grad_norm": 8.231595039367676, + "learning_rate": 4.022484007322598e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8479686647653579, + "num_tokens": 153445161.0, + "step": 127600 + }, + { + "entropy": 1.9092737153172492, + "epoch": 0.3955798347591639, + "grad_norm": 8.519765853881836, + "learning_rate": 4.022326394504722e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8497352659702301, + "num_tokens": 153456782.0, + "step": 127610 + }, + { + "entropy": 1.9170891061425208, + "epoch": 0.39561083388421364, + "grad_norm": 7.752858638763428, + "learning_rate": 4.0221688002126016e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8510538384318351, + "num_tokens": 153468381.0, + "step": 127620 + }, + { + "entropy": 1.8833612218499183, + "epoch": 0.3956418330092633, + "grad_norm": 4.76165246963501, + "learning_rate": 4.022011224442611e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8437580853700638, + "num_tokens": 153481424.0, + "step": 127630 + }, + { + "entropy": 1.9320109218358994, + "epoch": 0.39567283213431304, + "grad_norm": 4.280218601226807, + "learning_rate": 4.02185366719112e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.848100745677948, + "num_tokens": 153493443.0, + "step": 127640 + }, + { + "entropy": 1.960513624548912, + "epoch": 0.3957038312593627, + "grad_norm": 8.801411628723145, + "learning_rate": 4.021696128454502e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8425059333443642, + "num_tokens": 153504656.0, + "step": 127650 + }, + { + "entropy": 1.9096018448472023, + "epoch": 0.3957348303844124, + "grad_norm": 3.8925299644470215, + "learning_rate": 4.0215386082291315e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8456997722387314, + "num_tokens": 153516555.0, + "step": 127660 + }, + { + "entropy": 1.9139437288045884, + "epoch": 0.3957658295094621, + "grad_norm": 7.8598127365112305, + "learning_rate": 4.021381106511384e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.8359185487031937, + "num_tokens": 153528681.0, + "step": 127670 + }, + { + "entropy": 1.925663386285305, + "epoch": 0.39579682863451177, + "grad_norm": 7.7925944328308105, + "learning_rate": 4.021223623297635e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8525381490588189, + "num_tokens": 153540256.0, + "step": 127680 + }, + { + "entropy": 1.9570807069540024, + "epoch": 0.3958278277595615, + "grad_norm": 7.370650291442871, + "learning_rate": 4.021066158584261e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.848370935022831, + "num_tokens": 153551273.0, + "step": 127690 + }, + { + "entropy": 1.8661099091172217, + "epoch": 0.39585882688461116, + "grad_norm": 4.928553581237793, + "learning_rate": 4.020908712367641e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8484266191720963, + "num_tokens": 153563180.0, + "step": 127700 + }, + { + "entropy": 1.9060293585062027, + "epoch": 0.3958898260096609, + "grad_norm": 9.447659492492676, + "learning_rate": 4.020751284644153e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8432995095849037, + "num_tokens": 153575032.0, + "step": 127710 + }, + { + "entropy": 1.953291055560112, + "epoch": 0.39592082513471055, + "grad_norm": 7.042132377624512, + "learning_rate": 4.020593875410178e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8566155821084976, + "num_tokens": 153586028.0, + "step": 127720 + }, + { + "entropy": 1.9117076322436333, + "epoch": 0.3959518242597603, + "grad_norm": 8.872218132019043, + "learning_rate": 4.020436484662098e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8476521626114846, + "num_tokens": 153599040.0, + "step": 127730 + }, + { + "entropy": 1.8983178213238716, + "epoch": 0.39598282338480995, + "grad_norm": 8.394247055053711, + "learning_rate": 4.020279112396293e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8589667662978172, + "num_tokens": 153610777.0, + "step": 127740 + }, + { + "entropy": 1.9231909066438675, + "epoch": 0.39601382250985967, + "grad_norm": 10.013471603393555, + "learning_rate": 4.0201217586091465e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8497250765562058, + "num_tokens": 153621814.0, + "step": 127750 + }, + { + "entropy": 1.900232744216919, + "epoch": 0.39604482163490934, + "grad_norm": 9.529150009155273, + "learning_rate": 4.019964423297043e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.842905105650425, + "num_tokens": 153634211.0, + "step": 127760 + }, + { + "entropy": 1.8112396225333214, + "epoch": 0.39607582075995906, + "grad_norm": 9.09139347076416, + "learning_rate": 4.019807106456367e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8626585349440574, + "num_tokens": 153647738.0, + "step": 127770 + }, + { + "entropy": 1.8331888422369957, + "epoch": 0.39610681988500873, + "grad_norm": 6.125622272491455, + "learning_rate": 4.0196498080835054e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8652116924524307, + "num_tokens": 153661042.0, + "step": 127780 + }, + { + "entropy": 1.9142784118652343, + "epoch": 0.39613781901005846, + "grad_norm": 4.094144821166992, + "learning_rate": 4.019492528174845e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8501773819327354, + "num_tokens": 153673112.0, + "step": 127790 + }, + { + "entropy": 1.8980029091238975, + "epoch": 0.3961688181351081, + "grad_norm": 3.3827157020568848, + "learning_rate": 4.0193352667267725e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8512454509735108, + "num_tokens": 153685077.0, + "step": 127800 + }, + { + "entropy": 1.716671919822693, + "epoch": 0.39619981726015785, + "grad_norm": 3.326303482055664, + "learning_rate": 4.019178023735678e-06, + "loss": 0.2917, + "mean_token_accuracy": 0.8814573958516121, + "num_tokens": 153699159.0, + "step": 127810 + }, + { + "entropy": 1.7483276754617691, + "epoch": 0.3962308163852075, + "grad_norm": 8.083634376525879, + "learning_rate": 4.019020799197951e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8582610175013542, + "num_tokens": 153713542.0, + "step": 127820 + }, + { + "entropy": 1.915240579843521, + "epoch": 0.39626181551025724, + "grad_norm": 10.262462615966797, + "learning_rate": 4.018863593109982e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8524910137057304, + "num_tokens": 153725218.0, + "step": 127830 + }, + { + "entropy": 1.8652495056390763, + "epoch": 0.3962928146353069, + "grad_norm": 9.742287635803223, + "learning_rate": 4.018706405468165e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8553525045514107, + "num_tokens": 153737029.0, + "step": 127840 + }, + { + "entropy": 1.9075027361512185, + "epoch": 0.39632381376035664, + "grad_norm": 9.754807472229004, + "learning_rate": 4.018549236268891e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8644733369350434, + "num_tokens": 153748555.0, + "step": 127850 + }, + { + "entropy": 1.8555178031325341, + "epoch": 0.3963548128854063, + "grad_norm": 10.997365951538086, + "learning_rate": 4.018392085508554e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8505488753318786, + "num_tokens": 153761293.0, + "step": 127860 + }, + { + "entropy": 1.8854364529252052, + "epoch": 0.39638581201045603, + "grad_norm": 10.64345932006836, + "learning_rate": 4.018234953183549e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8452537760138512, + "num_tokens": 153773673.0, + "step": 127870 + }, + { + "entropy": 1.9308582305908204, + "epoch": 0.3964168111355057, + "grad_norm": 8.477107048034668, + "learning_rate": 4.018077839290272e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8420649126172066, + "num_tokens": 153785000.0, + "step": 127880 + }, + { + "entropy": 1.8462091460824013, + "epoch": 0.3964478102605554, + "grad_norm": 4.125033855438232, + "learning_rate": 4.01792074382512e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8481978312134743, + "num_tokens": 153797926.0, + "step": 127890 + }, + { + "entropy": 1.9186622604727746, + "epoch": 0.3964788093856051, + "grad_norm": 8.226593971252441, + "learning_rate": 4.0177636667844914e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8365380227565765, + "num_tokens": 153809721.0, + "step": 127900 + }, + { + "entropy": 1.8989548355340957, + "epoch": 0.39650980851065476, + "grad_norm": 7.229928970336914, + "learning_rate": 4.017606608164784e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8661728858947754, + "num_tokens": 153821479.0, + "step": 127910 + }, + { + "entropy": 1.875709056854248, + "epoch": 0.3965408076357045, + "grad_norm": 5.093418598175049, + "learning_rate": 4.017449567962398e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8482154250144959, + "num_tokens": 153833724.0, + "step": 127920 + }, + { + "entropy": 1.9880733832716941, + "epoch": 0.39657180676075415, + "grad_norm": 8.915475845336914, + "learning_rate": 4.0172925461737336e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8514209687709808, + "num_tokens": 153844594.0, + "step": 127930 + }, + { + "entropy": 1.8570735067129136, + "epoch": 0.3966028058858039, + "grad_norm": 4.229099273681641, + "learning_rate": 4.017135542795195e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8634174600243568, + "num_tokens": 153856932.0, + "step": 127940 + }, + { + "entropy": 1.9012201085686684, + "epoch": 0.39663380501085355, + "grad_norm": 11.16518783569336, + "learning_rate": 4.016978557823181e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8594008833169937, + "num_tokens": 153868678.0, + "step": 127950 + }, + { + "entropy": 1.7977351397275925, + "epoch": 0.39666480413590327, + "grad_norm": 7.758661270141602, + "learning_rate": 4.016821591254099e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8647834539413453, + "num_tokens": 153882380.0, + "step": 127960 + }, + { + "entropy": 1.86228808760643, + "epoch": 0.39669580326095294, + "grad_norm": 4.080726623535156, + "learning_rate": 4.016664643084352e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8644084423780442, + "num_tokens": 153894569.0, + "step": 127970 + }, + { + "entropy": 1.9319358631968497, + "epoch": 0.39672680238600266, + "grad_norm": 7.336208820343018, + "learning_rate": 4.016507713310346e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8546565786004067, + "num_tokens": 153905410.0, + "step": 127980 + }, + { + "entropy": 1.8738821625709534, + "epoch": 0.39675780151105233, + "grad_norm": 8.000831604003906, + "learning_rate": 4.016350801928489e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8442562118172645, + "num_tokens": 153918966.0, + "step": 127990 + }, + { + "entropy": 1.8228215545415878, + "epoch": 0.39678880063610206, + "grad_norm": 6.075873374938965, + "learning_rate": 4.0161939089351855e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.852639339864254, + "num_tokens": 153931674.0, + "step": 128000 + }, + { + "entropy": 1.8811346724629403, + "epoch": 0.3968197997611517, + "grad_norm": 8.297804832458496, + "learning_rate": 4.016037034326846e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8590738877654076, + "num_tokens": 153943910.0, + "step": 128010 + }, + { + "entropy": 1.9253056347370148, + "epoch": 0.39685079888620145, + "grad_norm": 9.22485637664795, + "learning_rate": 4.015880178099881e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8511346384882927, + "num_tokens": 153955547.0, + "step": 128020 + }, + { + "entropy": 1.814525055885315, + "epoch": 0.3968817980112511, + "grad_norm": 7.030633449554443, + "learning_rate": 4.0157233402507e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8703080207109452, + "num_tokens": 153968920.0, + "step": 128030 + }, + { + "entropy": 1.9247617200016975, + "epoch": 0.39691279713630084, + "grad_norm": 9.265406608581543, + "learning_rate": 4.015566520775715e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.849250017106533, + "num_tokens": 153980346.0, + "step": 128040 + }, + { + "entropy": 1.9104125529527665, + "epoch": 0.3969437962613505, + "grad_norm": 3.5930426120758057, + "learning_rate": 4.015409719671339e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8543394804000854, + "num_tokens": 153992553.0, + "step": 128050 + }, + { + "entropy": 1.866149678826332, + "epoch": 0.39697479538640024, + "grad_norm": 6.8628926277160645, + "learning_rate": 4.015252936933985e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8579761996865273, + "num_tokens": 154004883.0, + "step": 128060 + }, + { + "entropy": 1.9460046872496606, + "epoch": 0.3970057945114499, + "grad_norm": 9.495827674865723, + "learning_rate": 4.015096172560067e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8454651072621345, + "num_tokens": 154016816.0, + "step": 128070 + }, + { + "entropy": 1.8353903412818908, + "epoch": 0.39703679363649963, + "grad_norm": 8.298847198486328, + "learning_rate": 4.014939426546002e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8565979763865471, + "num_tokens": 154029332.0, + "step": 128080 + }, + { + "entropy": 1.8864996194839478, + "epoch": 0.3970677927615493, + "grad_norm": 8.607029914855957, + "learning_rate": 4.014782698888205e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.842142577469349, + "num_tokens": 154040881.0, + "step": 128090 + }, + { + "entropy": 1.8832897543907166, + "epoch": 0.397098791886599, + "grad_norm": 3.863781452178955, + "learning_rate": 4.014625989583094e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8512258976697922, + "num_tokens": 154052885.0, + "step": 128100 + }, + { + "entropy": 1.8848641395568848, + "epoch": 0.3971297910116487, + "grad_norm": 7.123293876647949, + "learning_rate": 4.014469298627088e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.856060591340065, + "num_tokens": 154064134.0, + "step": 128110 + }, + { + "entropy": 1.8236127287149428, + "epoch": 0.3971607901366984, + "grad_norm": 5.063018321990967, + "learning_rate": 4.0143126260166075e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8463837578892708, + "num_tokens": 154077307.0, + "step": 128120 + }, + { + "entropy": 1.9382410168647766, + "epoch": 0.3971917892617481, + "grad_norm": 8.761957168579102, + "learning_rate": 4.014155971748069e-06, + "loss": 0.5266, + "mean_token_accuracy": 0.8408651709556579, + "num_tokens": 154088700.0, + "step": 128130 + }, + { + "entropy": 1.888756561279297, + "epoch": 0.3972227883867978, + "grad_norm": 9.47224235534668, + "learning_rate": 4.013999335817898e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8587385430932045, + "num_tokens": 154100614.0, + "step": 128140 + }, + { + "entropy": 1.9622124269604684, + "epoch": 0.3972537875118475, + "grad_norm": 8.621251106262207, + "learning_rate": 4.013842718222516e-06, + "loss": 0.5208, + "mean_token_accuracy": 0.8376434296369553, + "num_tokens": 154112125.0, + "step": 128150 + }, + { + "entropy": 1.9215806141495704, + "epoch": 0.39728478663689715, + "grad_norm": 8.473328590393066, + "learning_rate": 4.013686118958345e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.852269695699215, + "num_tokens": 154123987.0, + "step": 128160 + }, + { + "entropy": 1.8634486734867095, + "epoch": 0.39731578576194687, + "grad_norm": 8.564555168151855, + "learning_rate": 4.013529538021809e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8536643877625465, + "num_tokens": 154137001.0, + "step": 128170 + }, + { + "entropy": 1.9504847824573517, + "epoch": 0.39734678488699654, + "grad_norm": 8.242449760437012, + "learning_rate": 4.013372975409336e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8405697852373123, + "num_tokens": 154148533.0, + "step": 128180 + }, + { + "entropy": 1.8996037617325783, + "epoch": 0.39737778401204626, + "grad_norm": 10.100019454956055, + "learning_rate": 4.01321643111735e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8533552616834641, + "num_tokens": 154160553.0, + "step": 128190 + }, + { + "entropy": 1.8098711207509042, + "epoch": 0.39740878313709593, + "grad_norm": 9.241110801696777, + "learning_rate": 4.013059905142279e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.871603924036026, + "num_tokens": 154174245.0, + "step": 128200 + }, + { + "entropy": 1.9533662751317025, + "epoch": 0.39743978226214566, + "grad_norm": 9.006794929504395, + "learning_rate": 4.012903397480549e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8515314489603043, + "num_tokens": 154185508.0, + "step": 128210 + }, + { + "entropy": 1.847785583138466, + "epoch": 0.3974707813871953, + "grad_norm": 4.733590126037598, + "learning_rate": 4.012746908128595e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8513537049293518, + "num_tokens": 154198114.0, + "step": 128220 + }, + { + "entropy": 1.908326794207096, + "epoch": 0.39750178051224505, + "grad_norm": 7.348505973815918, + "learning_rate": 4.012590437082841e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8644715324044228, + "num_tokens": 154210418.0, + "step": 128230 + }, + { + "entropy": 1.9537509649991989, + "epoch": 0.3975327796372947, + "grad_norm": 7.655646800994873, + "learning_rate": 4.0124339843397216e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8567704558372498, + "num_tokens": 154221854.0, + "step": 128240 + }, + { + "entropy": 1.8390052139759063, + "epoch": 0.39756377876234444, + "grad_norm": 8.591985702514648, + "learning_rate": 4.012277549895668e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.859342285990715, + "num_tokens": 154234898.0, + "step": 128250 + }, + { + "entropy": 1.8981177046895028, + "epoch": 0.3975947778873941, + "grad_norm": 3.5473618507385254, + "learning_rate": 4.012121133747113e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8572326749563217, + "num_tokens": 154247303.0, + "step": 128260 + }, + { + "entropy": 1.9318851605057716, + "epoch": 0.39762577701244384, + "grad_norm": 6.904049873352051, + "learning_rate": 4.011964735890492e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8485365882515907, + "num_tokens": 154259127.0, + "step": 128270 + }, + { + "entropy": 1.856573697924614, + "epoch": 0.3976567761374935, + "grad_norm": 7.510590553283691, + "learning_rate": 4.011808356322238e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8440538689494133, + "num_tokens": 154271893.0, + "step": 128280 + }, + { + "entropy": 1.9165368214249612, + "epoch": 0.39768777526254323, + "grad_norm": 3.662731885910034, + "learning_rate": 4.011651995038789e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8502262338995934, + "num_tokens": 154283424.0, + "step": 128290 + }, + { + "entropy": 1.9400472521781922, + "epoch": 0.3977187743875929, + "grad_norm": 8.449536323547363, + "learning_rate": 4.011495652036581e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.864606074988842, + "num_tokens": 154294507.0, + "step": 128300 + }, + { + "entropy": 1.9318364679813385, + "epoch": 0.3977497735126426, + "grad_norm": 7.544651031494141, + "learning_rate": 4.011339327312052e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.851364016532898, + "num_tokens": 154306187.0, + "step": 128310 + }, + { + "entropy": 1.859896543622017, + "epoch": 0.3977807726376923, + "grad_norm": 7.740284442901611, + "learning_rate": 4.0111830208616405e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8656293138861656, + "num_tokens": 154318477.0, + "step": 128320 + }, + { + "entropy": 1.9752064496278763, + "epoch": 0.397811771762742, + "grad_norm": 7.771737575531006, + "learning_rate": 4.011026732681787e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8587437257170677, + "num_tokens": 154329265.0, + "step": 128330 + }, + { + "entropy": 1.826411110162735, + "epoch": 0.3978427708877917, + "grad_norm": 9.697999000549316, + "learning_rate": 4.010870462768933e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8450366973876953, + "num_tokens": 154343157.0, + "step": 128340 + }, + { + "entropy": 1.8539129607379436, + "epoch": 0.3978737700128414, + "grad_norm": 9.17566967010498, + "learning_rate": 4.010714211119519e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8553266122937202, + "num_tokens": 154354274.0, + "step": 128350 + }, + { + "entropy": 1.8710831806063652, + "epoch": 0.3979047691378911, + "grad_norm": 6.939380168914795, + "learning_rate": 4.010557977729989e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8506567060947419, + "num_tokens": 154367463.0, + "step": 128360 + }, + { + "entropy": 1.9934969753026963, + "epoch": 0.3979357682629408, + "grad_norm": 9.315759658813477, + "learning_rate": 4.010401762596786e-06, + "loss": 0.5463, + "mean_token_accuracy": 0.8378830254077911, + "num_tokens": 154378868.0, + "step": 128370 + }, + { + "entropy": 1.9435987308621407, + "epoch": 0.39796676738799047, + "grad_norm": 7.836657524108887, + "learning_rate": 4.010245565716356e-06, + "loss": 0.443, + "mean_token_accuracy": 0.857276414334774, + "num_tokens": 154390849.0, + "step": 128380 + }, + { + "entropy": 1.8966143906116486, + "epoch": 0.3979977665130402, + "grad_norm": 8.332956314086914, + "learning_rate": 4.010089387085143e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8508517384529114, + "num_tokens": 154402741.0, + "step": 128390 + }, + { + "entropy": 1.9793959528207778, + "epoch": 0.39802876563808987, + "grad_norm": 8.801210403442383, + "learning_rate": 4.009933226699596e-06, + "loss": 0.5463, + "mean_token_accuracy": 0.8385202527046204, + "num_tokens": 154413301.0, + "step": 128400 + }, + { + "entropy": 1.9293900340795518, + "epoch": 0.39805976476313953, + "grad_norm": 10.354657173156738, + "learning_rate": 4.00977708455616e-06, + "loss": 0.5402, + "mean_token_accuracy": 0.8320423439145088, + "num_tokens": 154424899.0, + "step": 128410 + }, + { + "entropy": 1.7377336770296097, + "epoch": 0.39809076388818926, + "grad_norm": 7.612285137176514, + "learning_rate": 4.009620960651285e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8693819224834443, + "num_tokens": 154439450.0, + "step": 128420 + }, + { + "entropy": 1.9407954409718513, + "epoch": 0.3981217630132389, + "grad_norm": 7.916438579559326, + "learning_rate": 4.00946485498142e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8480086639523506, + "num_tokens": 154451790.0, + "step": 128430 + }, + { + "entropy": 1.9476406499743462, + "epoch": 0.39815276213828865, + "grad_norm": 4.425556659698486, + "learning_rate": 4.009308767543017e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8459201797842979, + "num_tokens": 154463897.0, + "step": 128440 + }, + { + "entropy": 2.0008150786161423, + "epoch": 0.3981837612633383, + "grad_norm": 9.704833030700684, + "learning_rate": 4.009152698332526e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8437413021922111, + "num_tokens": 154475527.0, + "step": 128450 + }, + { + "entropy": 1.7799909293651581, + "epoch": 0.39821476038838804, + "grad_norm": 3.9981141090393066, + "learning_rate": 4.0089966473464e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8721995532512665, + "num_tokens": 154489848.0, + "step": 128460 + }, + { + "entropy": 1.9085913822054863, + "epoch": 0.3982457595134377, + "grad_norm": 5.100532054901123, + "learning_rate": 4.008840614581093e-06, + "loss": 0.436, + "mean_token_accuracy": 0.856728957593441, + "num_tokens": 154502460.0, + "step": 128470 + }, + { + "entropy": 1.9310724124312402, + "epoch": 0.39827675863848744, + "grad_norm": 7.409860610961914, + "learning_rate": 4.008684600033059e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8475088179111481, + "num_tokens": 154515158.0, + "step": 128480 + }, + { + "entropy": 1.9532846316695214, + "epoch": 0.3983077577635371, + "grad_norm": 6.9244561195373535, + "learning_rate": 4.008528603698753e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8392842382192611, + "num_tokens": 154526053.0, + "step": 128490 + }, + { + "entropy": 1.864257997274399, + "epoch": 0.39833875688858683, + "grad_norm": 8.621221542358398, + "learning_rate": 4.008372625574633e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8733592018485069, + "num_tokens": 154537962.0, + "step": 128500 + }, + { + "entropy": 1.7741185277700424, + "epoch": 0.3983697560136365, + "grad_norm": 12.221007347106934, + "learning_rate": 4.008216665657155e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8721621558070183, + "num_tokens": 154551264.0, + "step": 128510 + }, + { + "entropy": 1.8366203412413598, + "epoch": 0.3984007551386862, + "grad_norm": 8.838532447814941, + "learning_rate": 4.008060723942776e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8588550105690956, + "num_tokens": 154563900.0, + "step": 128520 + }, + { + "entropy": 1.9761778950691222, + "epoch": 0.3984317542637359, + "grad_norm": 8.271575927734375, + "learning_rate": 4.007904800427958e-06, + "loss": 0.5278, + "mean_token_accuracy": 0.8369174391031265, + "num_tokens": 154574439.0, + "step": 128530 + }, + { + "entropy": 1.871373300254345, + "epoch": 0.3984627533887856, + "grad_norm": 3.379486322402954, + "learning_rate": 4.0077488951091595e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.852562639117241, + "num_tokens": 154586791.0, + "step": 128540 + }, + { + "entropy": 1.8863118886947632, + "epoch": 0.3984937525138353, + "grad_norm": 8.740182876586914, + "learning_rate": 4.007593007982842e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8477497771382332, + "num_tokens": 154599491.0, + "step": 128550 + }, + { + "entropy": 1.8409544050693512, + "epoch": 0.398524751638885, + "grad_norm": 4.045310020446777, + "learning_rate": 4.007437139045469e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8576883286237716, + "num_tokens": 154612812.0, + "step": 128560 + }, + { + "entropy": 1.8716674730181695, + "epoch": 0.3985557507639347, + "grad_norm": 4.682071208953857, + "learning_rate": 4.007281288293502e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8520221546292305, + "num_tokens": 154625583.0, + "step": 128570 + }, + { + "entropy": 1.9001308396458625, + "epoch": 0.3985867498889844, + "grad_norm": 7.292641639709473, + "learning_rate": 4.007125455723405e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8661713659763336, + "num_tokens": 154637640.0, + "step": 128580 + }, + { + "entropy": 1.9417346104979516, + "epoch": 0.3986177490140341, + "grad_norm": 8.141678810119629, + "learning_rate": 4.006969641331644e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8521144181489945, + "num_tokens": 154649418.0, + "step": 128590 + }, + { + "entropy": 1.9820822283625603, + "epoch": 0.3986487481390838, + "grad_norm": 10.370221138000488, + "learning_rate": 4.0068138451146845e-06, + "loss": 0.5383, + "mean_token_accuracy": 0.8394621968269348, + "num_tokens": 154660964.0, + "step": 128600 + }, + { + "entropy": 1.836016722023487, + "epoch": 0.39867974726413347, + "grad_norm": 3.998152017593384, + "learning_rate": 4.006658067068994e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8594549700617791, + "num_tokens": 154674134.0, + "step": 128610 + }, + { + "entropy": 1.9683793038129807, + "epoch": 0.3987107463891832, + "grad_norm": 8.68581771850586, + "learning_rate": 4.0065023071910395e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.866059435904026, + "num_tokens": 154685620.0, + "step": 128620 + }, + { + "entropy": 1.9310366541147232, + "epoch": 0.39874174551423286, + "grad_norm": 8.650096893310547, + "learning_rate": 4.006346565477289e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8481193721294403, + "num_tokens": 154697293.0, + "step": 128630 + }, + { + "entropy": 1.9691759124398232, + "epoch": 0.3987727446392826, + "grad_norm": 7.362710952758789, + "learning_rate": 4.006190841924217e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8400867089629174, + "num_tokens": 154708876.0, + "step": 128640 + }, + { + "entropy": 1.8157849997282027, + "epoch": 0.39880374376433225, + "grad_norm": 3.911681652069092, + "learning_rate": 4.006035136528288e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8522502645850182, + "num_tokens": 154721940.0, + "step": 128650 + }, + { + "entropy": 1.9943493276834487, + "epoch": 0.3988347428893819, + "grad_norm": 9.267487525939941, + "learning_rate": 4.005879449285979e-06, + "loss": 0.5359, + "mean_token_accuracy": 0.8361817836761475, + "num_tokens": 154732230.0, + "step": 128660 + }, + { + "entropy": 1.9468561723828315, + "epoch": 0.39886574201443165, + "grad_norm": 8.407017707824707, + "learning_rate": 4.00572378019376e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8474451348185539, + "num_tokens": 154743873.0, + "step": 128670 + }, + { + "entropy": 1.883972604572773, + "epoch": 0.3988967411394813, + "grad_norm": 10.778926849365234, + "learning_rate": 4.005568129248105e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8490250527858734, + "num_tokens": 154756322.0, + "step": 128680 + }, + { + "entropy": 1.9082299306988717, + "epoch": 0.39892774026453104, + "grad_norm": 8.559072494506836, + "learning_rate": 4.0054124964454904e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8479592993855476, + "num_tokens": 154768822.0, + "step": 128690 + }, + { + "entropy": 1.915241825580597, + "epoch": 0.3989587393895807, + "grad_norm": 8.535916328430176, + "learning_rate": 4.005256881782389e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.848818339407444, + "num_tokens": 154780787.0, + "step": 128700 + }, + { + "entropy": 1.85912893563509, + "epoch": 0.39898973851463043, + "grad_norm": 8.055673599243164, + "learning_rate": 4.005101285255279e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8605345159769058, + "num_tokens": 154793208.0, + "step": 128710 + }, + { + "entropy": 1.9649493724107743, + "epoch": 0.3990207376396801, + "grad_norm": 8.454475402832031, + "learning_rate": 4.004945706860638e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8544456735253334, + "num_tokens": 154804930.0, + "step": 128720 + }, + { + "entropy": 1.923764231801033, + "epoch": 0.3990517367647298, + "grad_norm": 7.50667142868042, + "learning_rate": 4.004790146594944e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8489472419023514, + "num_tokens": 154817425.0, + "step": 128730 + }, + { + "entropy": 1.8540160700678825, + "epoch": 0.3990827358897795, + "grad_norm": 4.772271633148193, + "learning_rate": 4.004634604454677e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8635622560977936, + "num_tokens": 154829558.0, + "step": 128740 + }, + { + "entropy": 1.8815680295228958, + "epoch": 0.3991137350148292, + "grad_norm": 8.889459609985352, + "learning_rate": 4.004479080436317e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8565109595656395, + "num_tokens": 154841865.0, + "step": 128750 + }, + { + "entropy": 1.8703197434544563, + "epoch": 0.3991447341398789, + "grad_norm": 3.177785634994507, + "learning_rate": 4.004323574536345e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.851426774263382, + "num_tokens": 154854216.0, + "step": 128760 + }, + { + "entropy": 1.8539531543850898, + "epoch": 0.3991757332649286, + "grad_norm": 7.972646236419678, + "learning_rate": 4.004168086751243e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8601129725575447, + "num_tokens": 154867350.0, + "step": 128770 + }, + { + "entropy": 1.9226906821131706, + "epoch": 0.3992067323899783, + "grad_norm": 3.4731192588806152, + "learning_rate": 4.0040126170774955e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8491318255662919, + "num_tokens": 154879654.0, + "step": 128780 + }, + { + "entropy": 1.8054320514202118, + "epoch": 0.399237731515028, + "grad_norm": 9.286853790283203, + "learning_rate": 4.003857165511587e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8595024034380913, + "num_tokens": 154893043.0, + "step": 128790 + }, + { + "entropy": 1.8495088413357734, + "epoch": 0.3992687306400777, + "grad_norm": 8.713004112243652, + "learning_rate": 4.003701732050002e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8574372127652168, + "num_tokens": 154905390.0, + "step": 128800 + }, + { + "entropy": 1.87299737483263, + "epoch": 0.3992997297651274, + "grad_norm": 9.513683319091797, + "learning_rate": 4.003546316689225e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8459521770477295, + "num_tokens": 154918334.0, + "step": 128810 + }, + { + "entropy": 1.9595311015844346, + "epoch": 0.39933072889017707, + "grad_norm": 9.396753311157227, + "learning_rate": 4.003390919425746e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8480179443955421, + "num_tokens": 154929275.0, + "step": 128820 + }, + { + "entropy": 1.8720010727643968, + "epoch": 0.3993617280152268, + "grad_norm": 8.757028579711914, + "learning_rate": 4.003235540256052e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8585854455828666, + "num_tokens": 154941320.0, + "step": 128830 + }, + { + "entropy": 1.9553624257445335, + "epoch": 0.39939272714027646, + "grad_norm": 10.621780395507812, + "learning_rate": 4.003080179176631e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8457315772771835, + "num_tokens": 154952903.0, + "step": 128840 + }, + { + "entropy": 1.7949346914887427, + "epoch": 0.3994237262653262, + "grad_norm": 2.807647466659546, + "learning_rate": 4.002924836183973e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8518297106027604, + "num_tokens": 154967282.0, + "step": 128850 + }, + { + "entropy": 1.8987135276198388, + "epoch": 0.39945472539037585, + "grad_norm": 8.776113510131836, + "learning_rate": 4.002769511274571e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8481128692626954, + "num_tokens": 154979746.0, + "step": 128860 + }, + { + "entropy": 1.9235922917723656, + "epoch": 0.3994857245154256, + "grad_norm": 9.092692375183105, + "learning_rate": 4.002614204444915e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8417194813489914, + "num_tokens": 154991799.0, + "step": 128870 + }, + { + "entropy": 1.8608113884925843, + "epoch": 0.39951672364047525, + "grad_norm": 7.770845413208008, + "learning_rate": 4.002458915691497e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8441184401512146, + "num_tokens": 155005153.0, + "step": 128880 + }, + { + "entropy": 1.9966261342167855, + "epoch": 0.39954772276552497, + "grad_norm": 7.349844932556152, + "learning_rate": 4.002303645010813e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8491346299648285, + "num_tokens": 155016749.0, + "step": 128890 + }, + { + "entropy": 1.8176558762788773, + "epoch": 0.39957872189057464, + "grad_norm": 7.31537389755249, + "learning_rate": 4.002148392399357e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8675624996423721, + "num_tokens": 155030413.0, + "step": 128900 + }, + { + "entropy": 1.7823689445853232, + "epoch": 0.3996097210156243, + "grad_norm": 8.566244125366211, + "learning_rate": 4.001993157853624e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8506886452436447, + "num_tokens": 155044038.0, + "step": 128910 + }, + { + "entropy": 1.8545478105545044, + "epoch": 0.39964072014067403, + "grad_norm": 7.566455364227295, + "learning_rate": 4.001837941370112e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8633383393287659, + "num_tokens": 155056896.0, + "step": 128920 + }, + { + "entropy": 1.94671870470047, + "epoch": 0.3996717192657237, + "grad_norm": 9.15937328338623, + "learning_rate": 4.0016827429453155e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8502067849040031, + "num_tokens": 155067894.0, + "step": 128930 + }, + { + "entropy": 1.8951860249042511, + "epoch": 0.3997027183907734, + "grad_norm": 9.120491027832031, + "learning_rate": 4.001527562575736e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8581095486879349, + "num_tokens": 155079974.0, + "step": 128940 + }, + { + "entropy": 1.973567920923233, + "epoch": 0.3997337175158231, + "grad_norm": 8.294892311096191, + "learning_rate": 4.001372400257873e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8433159068226814, + "num_tokens": 155091036.0, + "step": 128950 + }, + { + "entropy": 1.85403670668602, + "epoch": 0.3997647166408728, + "grad_norm": 4.179879188537598, + "learning_rate": 4.001217255988225e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8468100443482399, + "num_tokens": 155103812.0, + "step": 128960 + }, + { + "entropy": 1.8100439786911011, + "epoch": 0.3997957157659225, + "grad_norm": 3.8184187412261963, + "learning_rate": 4.001062129763296e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.870311813056469, + "num_tokens": 155117031.0, + "step": 128970 + }, + { + "entropy": 1.9028596684336663, + "epoch": 0.3998267148909722, + "grad_norm": 7.593201637268066, + "learning_rate": 4.000907021579585e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8530150771141052, + "num_tokens": 155128674.0, + "step": 128980 + }, + { + "entropy": 1.9456078946590423, + "epoch": 0.3998577140160219, + "grad_norm": 8.108650207519531, + "learning_rate": 4.000751931433597e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8464592576026917, + "num_tokens": 155139476.0, + "step": 128990 + }, + { + "entropy": 1.9551035940647126, + "epoch": 0.3998887131410716, + "grad_norm": 9.936694145202637, + "learning_rate": 4.000596859321837e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.8364872843027115, + "num_tokens": 155151556.0, + "step": 129000 + }, + { + "entropy": 1.7921033650636673, + "epoch": 0.3999197122661213, + "grad_norm": 6.708187103271484, + "learning_rate": 4.000441805240809e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8557077631354332, + "num_tokens": 155165232.0, + "step": 129010 + }, + { + "entropy": 1.9106485813856124, + "epoch": 0.399950711391171, + "grad_norm": 3.6080920696258545, + "learning_rate": 4.00028676918702e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8508982062339783, + "num_tokens": 155176886.0, + "step": 129020 + }, + { + "entropy": 1.9070033580064774, + "epoch": 0.39998171051622067, + "grad_norm": 7.818281650543213, + "learning_rate": 4.000131751156976e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8354123339056969, + "num_tokens": 155189493.0, + "step": 129030 + }, + { + "entropy": 1.9348053887486458, + "epoch": 0.4000127096412704, + "grad_norm": 9.461575508117676, + "learning_rate": 3.999976751147185e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8445191279053688, + "num_tokens": 155200851.0, + "step": 129040 + }, + { + "entropy": 1.958994448184967, + "epoch": 0.40004370876632006, + "grad_norm": 10.124030113220215, + "learning_rate": 3.999821769154158e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8466064438223839, + "num_tokens": 155212316.0, + "step": 129050 + }, + { + "entropy": 1.9629956305027008, + "epoch": 0.4000747078913698, + "grad_norm": 7.477722644805908, + "learning_rate": 3.999666805174402e-06, + "loss": 0.5206, + "mean_token_accuracy": 0.8446162462234497, + "num_tokens": 155223329.0, + "step": 129060 + }, + { + "entropy": 1.8508127138018609, + "epoch": 0.40010570701641945, + "grad_norm": 9.45833969116211, + "learning_rate": 3.999511859204431e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8511440798640251, + "num_tokens": 155235927.0, + "step": 129070 + }, + { + "entropy": 1.8051378756761551, + "epoch": 0.4001367061414692, + "grad_norm": 8.635483741760254, + "learning_rate": 3.999356931240755e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.846815425157547, + "num_tokens": 155248596.0, + "step": 129080 + }, + { + "entropy": 1.7926154106855392, + "epoch": 0.40016770526651885, + "grad_norm": 8.050457954406738, + "learning_rate": 3.999202021279885e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8725738286972046, + "num_tokens": 155261547.0, + "step": 129090 + }, + { + "entropy": 1.9420858532190324, + "epoch": 0.40019870439156857, + "grad_norm": 9.84321403503418, + "learning_rate": 3.999047129318338e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8433678567409515, + "num_tokens": 155272699.0, + "step": 129100 + }, + { + "entropy": 1.7918237030506134, + "epoch": 0.40022970351661824, + "grad_norm": 4.470568656921387, + "learning_rate": 3.998892255352627e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8606519907712936, + "num_tokens": 155286705.0, + "step": 129110 + }, + { + "entropy": 1.8863151326775551, + "epoch": 0.40026070264166796, + "grad_norm": 7.721844673156738, + "learning_rate": 3.998737399379268e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8649403393268585, + "num_tokens": 155298688.0, + "step": 129120 + }, + { + "entropy": 1.9350427404046058, + "epoch": 0.40029170176671763, + "grad_norm": 7.882344722747803, + "learning_rate": 3.998582561394776e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8500554978847503, + "num_tokens": 155310013.0, + "step": 129130 + }, + { + "entropy": 1.8537417277693748, + "epoch": 0.4003227008917673, + "grad_norm": 8.492751121520996, + "learning_rate": 3.998427741395671e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8571956619620323, + "num_tokens": 155322433.0, + "step": 129140 + }, + { + "entropy": 1.9245151728391647, + "epoch": 0.400353700016817, + "grad_norm": 2.7976245880126953, + "learning_rate": 3.99827293937847e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8548303097486496, + "num_tokens": 155334298.0, + "step": 129150 + }, + { + "entropy": 1.9198249489068986, + "epoch": 0.4003846991418667, + "grad_norm": 3.582043170928955, + "learning_rate": 3.998118155339692e-06, + "loss": 0.5238, + "mean_token_accuracy": 0.8364661797881127, + "num_tokens": 155345756.0, + "step": 129160 + }, + { + "entropy": 1.9318518668413163, + "epoch": 0.4004156982669164, + "grad_norm": 8.037443161010742, + "learning_rate": 3.997963389275859e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8507610768079757, + "num_tokens": 155356700.0, + "step": 129170 + }, + { + "entropy": 1.9149801477789878, + "epoch": 0.4004466973919661, + "grad_norm": 8.608445167541504, + "learning_rate": 3.99780864118349e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8519381731748581, + "num_tokens": 155367938.0, + "step": 129180 + }, + { + "entropy": 1.8103131994605064, + "epoch": 0.4004776965170158, + "grad_norm": 5.219323635101318, + "learning_rate": 3.9976539110591086e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8530828103423118, + "num_tokens": 155381084.0, + "step": 129190 + }, + { + "entropy": 1.7915343165397644, + "epoch": 0.4005086956420655, + "grad_norm": 8.658716201782227, + "learning_rate": 3.997499198899237e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8671585947275162, + "num_tokens": 155394355.0, + "step": 129200 + }, + { + "entropy": 1.791802540421486, + "epoch": 0.4005396947671152, + "grad_norm": 7.633795261383057, + "learning_rate": 3.997344504700401e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8690174669027328, + "num_tokens": 155407374.0, + "step": 129210 + }, + { + "entropy": 1.8313867419958114, + "epoch": 0.4005706938921649, + "grad_norm": 7.055886745452881, + "learning_rate": 3.997189828459124e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8657983660697937, + "num_tokens": 155420166.0, + "step": 129220 + }, + { + "entropy": 1.8343457579612732, + "epoch": 0.4006016930172146, + "grad_norm": 8.553956031799316, + "learning_rate": 3.997035170171932e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8479964166879654, + "num_tokens": 155432912.0, + "step": 129230 + }, + { + "entropy": 1.8380364283919335, + "epoch": 0.40063269214226427, + "grad_norm": 4.127384185791016, + "learning_rate": 3.996880529835352e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8608715251088143, + "num_tokens": 155445327.0, + "step": 129240 + }, + { + "entropy": 1.8258098438382149, + "epoch": 0.400663691267314, + "grad_norm": 3.8394930362701416, + "learning_rate": 3.996725907445914e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8447212398052215, + "num_tokens": 155458107.0, + "step": 129250 + }, + { + "entropy": 1.9279157117009162, + "epoch": 0.40069469039236366, + "grad_norm": 8.191624641418457, + "learning_rate": 3.996571303000143e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.8399505227804184, + "num_tokens": 155469474.0, + "step": 129260 + }, + { + "entropy": 1.8246611893177032, + "epoch": 0.4007256895174134, + "grad_norm": 4.23314905166626, + "learning_rate": 3.996416716494572e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8508252546191215, + "num_tokens": 155482390.0, + "step": 129270 + }, + { + "entropy": 1.796310657262802, + "epoch": 0.40075668864246305, + "grad_norm": 3.9624927043914795, + "learning_rate": 3.996262147925729e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.857945391535759, + "num_tokens": 155494865.0, + "step": 129280 + }, + { + "entropy": 1.8756929695606233, + "epoch": 0.4007876877675128, + "grad_norm": 10.10248851776123, + "learning_rate": 3.996107597290148e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8458318576216698, + "num_tokens": 155507263.0, + "step": 129290 + }, + { + "entropy": 1.853571632504463, + "epoch": 0.40081868689256245, + "grad_norm": 8.234441757202148, + "learning_rate": 3.99595306458436e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8422810658812523, + "num_tokens": 155519150.0, + "step": 129300 + }, + { + "entropy": 1.8298091277480126, + "epoch": 0.4008496860176122, + "grad_norm": 4.043398380279541, + "learning_rate": 3.995798549804898e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8495631635189056, + "num_tokens": 155531954.0, + "step": 129310 + }, + { + "entropy": 1.8689694941043853, + "epoch": 0.40088068514266184, + "grad_norm": 8.660704612731934, + "learning_rate": 3.995644052948298e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8553383216261864, + "num_tokens": 155543288.0, + "step": 129320 + }, + { + "entropy": 1.8053039491176606, + "epoch": 0.40091168426771157, + "grad_norm": 8.172130584716797, + "learning_rate": 3.995489574011096e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8528603136539459, + "num_tokens": 155555748.0, + "step": 129330 + }, + { + "entropy": 1.9306787729263306, + "epoch": 0.40094268339276123, + "grad_norm": 9.331960678100586, + "learning_rate": 3.995335112989825e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8444873213768005, + "num_tokens": 155566893.0, + "step": 129340 + }, + { + "entropy": 1.8586668640375137, + "epoch": 0.40097368251781096, + "grad_norm": 9.09153938293457, + "learning_rate": 3.995180669881025e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8564663842320442, + "num_tokens": 155578818.0, + "step": 129350 + }, + { + "entropy": 1.8796383678913116, + "epoch": 0.4010046816428606, + "grad_norm": 8.167010307312012, + "learning_rate": 3.995026244681234e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.842990031838417, + "num_tokens": 155590442.0, + "step": 129360 + }, + { + "entropy": 1.9271098881959916, + "epoch": 0.40103568076791035, + "grad_norm": 9.545135498046875, + "learning_rate": 3.994871837386989e-06, + "loss": 0.515, + "mean_token_accuracy": 0.8358207404613495, + "num_tokens": 155601889.0, + "step": 129370 + }, + { + "entropy": 1.9072769358754158, + "epoch": 0.40106667989296, + "grad_norm": 7.5232038497924805, + "learning_rate": 3.994717447994832e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8516602620482445, + "num_tokens": 155614028.0, + "step": 129380 + }, + { + "entropy": 1.82378898113966, + "epoch": 0.4010976790180097, + "grad_norm": 5.845879554748535, + "learning_rate": 3.994563076501303e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8528670743107796, + "num_tokens": 155626507.0, + "step": 129390 + }, + { + "entropy": 1.8010734841227531, + "epoch": 0.4011286781430594, + "grad_norm": 3.6198856830596924, + "learning_rate": 3.994408722902945e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8558384582400322, + "num_tokens": 155639677.0, + "step": 129400 + }, + { + "entropy": 1.8200062766671181, + "epoch": 0.4011596772681091, + "grad_norm": 3.8507888317108154, + "learning_rate": 3.9942543871963006e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8555910781025886, + "num_tokens": 155652366.0, + "step": 129410 + }, + { + "entropy": 1.866781549155712, + "epoch": 0.4011906763931588, + "grad_norm": 6.747231483459473, + "learning_rate": 3.994100069377912e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8502749800682068, + "num_tokens": 155664907.0, + "step": 129420 + }, + { + "entropy": 1.8888089403510093, + "epoch": 0.4012216755182085, + "grad_norm": 8.488142967224121, + "learning_rate": 3.993945769444325e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8422931343317032, + "num_tokens": 155676728.0, + "step": 129430 + }, + { + "entropy": 1.827822931110859, + "epoch": 0.4012526746432582, + "grad_norm": 4.2824883460998535, + "learning_rate": 3.9937914873920855e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.855113011598587, + "num_tokens": 155689256.0, + "step": 129440 + }, + { + "entropy": 1.82640281021595, + "epoch": 0.40128367376830787, + "grad_norm": 9.845888137817383, + "learning_rate": 3.99363722321774e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8682078808546067, + "num_tokens": 155701978.0, + "step": 129450 + }, + { + "entropy": 1.8602233231067657, + "epoch": 0.4013146728933576, + "grad_norm": 9.442498207092285, + "learning_rate": 3.9934829769178365e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8478624925017357, + "num_tokens": 155714775.0, + "step": 129460 + }, + { + "entropy": 1.9146529287099838, + "epoch": 0.40134567201840726, + "grad_norm": 7.782778263092041, + "learning_rate": 3.993328748488922e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8408509835600853, + "num_tokens": 155726524.0, + "step": 129470 + }, + { + "entropy": 1.8323335126042366, + "epoch": 0.401376671143457, + "grad_norm": 4.129303455352783, + "learning_rate": 3.993174537927546e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8619013637304306, + "num_tokens": 155739316.0, + "step": 129480 + }, + { + "entropy": 1.8000301405787469, + "epoch": 0.40140767026850666, + "grad_norm": 4.743396282196045, + "learning_rate": 3.993020345230262e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8503546267747879, + "num_tokens": 155752513.0, + "step": 129490 + }, + { + "entropy": 1.846390649676323, + "epoch": 0.4014386693935564, + "grad_norm": 3.5405542850494385, + "learning_rate": 3.9928661703936164e-06, + "loss": 0.5197, + "mean_token_accuracy": 0.8494727715849877, + "num_tokens": 155764952.0, + "step": 129500 + }, + { + "entropy": 1.8727680340409278, + "epoch": 0.40146966851860605, + "grad_norm": 7.60482931137085, + "learning_rate": 3.992712013414165e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8475085526704789, + "num_tokens": 155777679.0, + "step": 129510 + }, + { + "entropy": 1.9042856127023697, + "epoch": 0.4015006676436558, + "grad_norm": 7.735726833343506, + "learning_rate": 3.992557874288459e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8530709683895111, + "num_tokens": 155789778.0, + "step": 129520 + }, + { + "entropy": 1.8619258537888528, + "epoch": 0.40153166676870544, + "grad_norm": 8.040087699890137, + "learning_rate": 3.992403753013052e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8604286208748817, + "num_tokens": 155801838.0, + "step": 129530 + }, + { + "entropy": 1.9772648215293884, + "epoch": 0.40156266589375517, + "grad_norm": 10.059983253479004, + "learning_rate": 3.9922496495845015e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8421792671084404, + "num_tokens": 155812867.0, + "step": 129540 + }, + { + "entropy": 1.9216817393898964, + "epoch": 0.40159366501880484, + "grad_norm": 7.800021648406982, + "learning_rate": 3.992095563999361e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8464254021644593, + "num_tokens": 155823811.0, + "step": 129550 + }, + { + "entropy": 1.828902542591095, + "epoch": 0.40162466414385456, + "grad_norm": 10.652735710144043, + "learning_rate": 3.991941496254188e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8681714370846748, + "num_tokens": 155836495.0, + "step": 129560 + }, + { + "entropy": 1.7818224623799324, + "epoch": 0.40165566326890423, + "grad_norm": 8.231464385986328, + "learning_rate": 3.991787446345542e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8553294375538826, + "num_tokens": 155849733.0, + "step": 129570 + }, + { + "entropy": 1.933050660789013, + "epoch": 0.40168666239395395, + "grad_norm": 9.135824203491211, + "learning_rate": 3.991633414269979e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8501686662435531, + "num_tokens": 155861168.0, + "step": 129580 + }, + { + "entropy": 1.9456676498055459, + "epoch": 0.4017176615190036, + "grad_norm": 7.7797393798828125, + "learning_rate": 3.9914794000240604e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8487606376409531, + "num_tokens": 155872569.0, + "step": 129590 + }, + { + "entropy": 1.9395177766680718, + "epoch": 0.40174866064405335, + "grad_norm": 4.4937334060668945, + "learning_rate": 3.9913254036043455e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8409850835800171, + "num_tokens": 155884343.0, + "step": 129600 + }, + { + "entropy": 1.96222285926342, + "epoch": 0.401779659769103, + "grad_norm": 8.145906448364258, + "learning_rate": 3.991171425007396e-06, + "loss": 0.5238, + "mean_token_accuracy": 0.832419815659523, + "num_tokens": 155895310.0, + "step": 129610 + }, + { + "entropy": 1.9075540453195572, + "epoch": 0.40181065889415274, + "grad_norm": 6.813295364379883, + "learning_rate": 3.991017464229776e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8529313713312149, + "num_tokens": 155907503.0, + "step": 129620 + }, + { + "entropy": 1.8687444925308228, + "epoch": 0.4018416580192024, + "grad_norm": 8.512529373168945, + "learning_rate": 3.990863521268047e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8458928510546684, + "num_tokens": 155919609.0, + "step": 129630 + }, + { + "entropy": 1.937071642279625, + "epoch": 0.4018726571442521, + "grad_norm": 8.758728981018066, + "learning_rate": 3.990709596118774e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8510117352008819, + "num_tokens": 155931150.0, + "step": 129640 + }, + { + "entropy": 1.9854602545499802, + "epoch": 0.4019036562693018, + "grad_norm": 7.997261047363281, + "learning_rate": 3.990555688778521e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8390512794256211, + "num_tokens": 155943070.0, + "step": 129650 + }, + { + "entropy": 1.8987844973802566, + "epoch": 0.40193465539435147, + "grad_norm": 3.581861734390259, + "learning_rate": 3.990401799243856e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8575407922267914, + "num_tokens": 155955435.0, + "step": 129660 + }, + { + "entropy": 1.9078197583556176, + "epoch": 0.4019656545194012, + "grad_norm": 8.173437118530273, + "learning_rate": 3.990247927511345e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8537064164876937, + "num_tokens": 155967188.0, + "step": 129670 + }, + { + "entropy": 1.9190190985798836, + "epoch": 0.40199665364445086, + "grad_norm": 9.123544692993164, + "learning_rate": 3.990094073577556e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8527463868260383, + "num_tokens": 155978679.0, + "step": 129680 + }, + { + "entropy": 1.9677730560302735, + "epoch": 0.4020276527695006, + "grad_norm": 8.531991004943848, + "learning_rate": 3.9899402374390585e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8477371469140053, + "num_tokens": 155989818.0, + "step": 129690 + }, + { + "entropy": 1.8901991337537765, + "epoch": 0.40205865189455026, + "grad_norm": 4.090048789978027, + "learning_rate": 3.989786419092422e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8558493494987488, + "num_tokens": 156002187.0, + "step": 129700 + }, + { + "entropy": 1.90907621383667, + "epoch": 0.4020896510196, + "grad_norm": 5.129952907562256, + "learning_rate": 3.989632618534216e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8454459354281425, + "num_tokens": 156014603.0, + "step": 129710 + }, + { + "entropy": 1.821659305691719, + "epoch": 0.40212065014464965, + "grad_norm": 7.665596961975098, + "learning_rate": 3.9894788357610134e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8629834577441216, + "num_tokens": 156028439.0, + "step": 129720 + }, + { + "entropy": 1.8766173496842384, + "epoch": 0.4021516492696994, + "grad_norm": 3.5944769382476807, + "learning_rate": 3.989325070769388e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8606648370623589, + "num_tokens": 156040404.0, + "step": 129730 + }, + { + "entropy": 1.8934729993343353, + "epoch": 0.40218264839474904, + "grad_norm": 3.758676052093506, + "learning_rate": 3.98917132355591e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8599832877516747, + "num_tokens": 156052297.0, + "step": 129740 + }, + { + "entropy": 1.9285873532295228, + "epoch": 0.40221364751979877, + "grad_norm": 7.010026454925537, + "learning_rate": 3.989017594117158e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8508692249655724, + "num_tokens": 156063027.0, + "step": 129750 + }, + { + "entropy": 1.7936880350112916, + "epoch": 0.40224464664484844, + "grad_norm": 9.10521125793457, + "learning_rate": 3.9888638824497034e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8516425848007202, + "num_tokens": 156075651.0, + "step": 129760 + }, + { + "entropy": 1.8739636823534966, + "epoch": 0.40227564576989816, + "grad_norm": 8.865586280822754, + "learning_rate": 3.988710188550125e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8452788576483726, + "num_tokens": 156087306.0, + "step": 129770 + }, + { + "entropy": 1.9241146951913835, + "epoch": 0.40230664489494783, + "grad_norm": 8.022590637207031, + "learning_rate": 3.988556512415e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8422558963298797, + "num_tokens": 156098765.0, + "step": 129780 + }, + { + "entropy": 1.8548906803131104, + "epoch": 0.40233764401999755, + "grad_norm": 6.8973493576049805, + "learning_rate": 3.988402854040903e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8576776921749115, + "num_tokens": 156110825.0, + "step": 129790 + }, + { + "entropy": 1.7588834911584854, + "epoch": 0.4023686431450472, + "grad_norm": 8.585075378417969, + "learning_rate": 3.988249213424419e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8594303146004677, + "num_tokens": 156124449.0, + "step": 129800 + }, + { + "entropy": 1.8105765163898468, + "epoch": 0.40239964227009695, + "grad_norm": 8.616347312927246, + "learning_rate": 3.9880955905621235e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.850447241961956, + "num_tokens": 156137506.0, + "step": 129810 + }, + { + "entropy": 1.8526711270213128, + "epoch": 0.4024306413951466, + "grad_norm": 8.065713882446289, + "learning_rate": 3.9879419854506e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8551761761307717, + "num_tokens": 156149785.0, + "step": 129820 + }, + { + "entropy": 1.8890022948384284, + "epoch": 0.40246164052019634, + "grad_norm": 7.917675495147705, + "learning_rate": 3.987788398086428e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8588599741458893, + "num_tokens": 156160850.0, + "step": 129830 + }, + { + "entropy": 1.784026588499546, + "epoch": 0.402492639645246, + "grad_norm": 9.626664161682129, + "learning_rate": 3.987634828466191e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8714671805500984, + "num_tokens": 156173409.0, + "step": 129840 + }, + { + "entropy": 1.8477544769644738, + "epoch": 0.40252363877029573, + "grad_norm": 8.537195205688477, + "learning_rate": 3.987481276586474e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8426776066422462, + "num_tokens": 156186477.0, + "step": 129850 + }, + { + "entropy": 1.736336489021778, + "epoch": 0.4025546378953454, + "grad_norm": 2.7146239280700684, + "learning_rate": 3.98732774244386e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.862640056014061, + "num_tokens": 156200279.0, + "step": 129860 + }, + { + "entropy": 1.8483411461114883, + "epoch": 0.4025856370203951, + "grad_norm": 3.8119378089904785, + "learning_rate": 3.987174226034936e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8419306710362434, + "num_tokens": 156212592.0, + "step": 129870 + }, + { + "entropy": 1.9207856342196465, + "epoch": 0.4026166361454448, + "grad_norm": 3.5462663173675537, + "learning_rate": 3.987020727356287e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8504367902874946, + "num_tokens": 156223684.0, + "step": 129880 + }, + { + "entropy": 1.8283285796642303, + "epoch": 0.40264763527049446, + "grad_norm": 8.442768096923828, + "learning_rate": 3.9868672464045005e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8510240688920021, + "num_tokens": 156236409.0, + "step": 129890 + }, + { + "entropy": 1.9546339631080627, + "epoch": 0.4026786343955442, + "grad_norm": 7.817316055297852, + "learning_rate": 3.986713783176166e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8490804046392441, + "num_tokens": 156247811.0, + "step": 129900 + }, + { + "entropy": 1.9404812157154083, + "epoch": 0.40270963352059386, + "grad_norm": 8.437518119812012, + "learning_rate": 3.986560337667872e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8467909589409828, + "num_tokens": 156259779.0, + "step": 129910 + }, + { + "entropy": 1.9066607609391213, + "epoch": 0.4027406326456436, + "grad_norm": 7.6331305503845215, + "learning_rate": 3.986406909876207e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8462503507733345, + "num_tokens": 156271585.0, + "step": 129920 + }, + { + "entropy": 1.8766513273119927, + "epoch": 0.40277163177069325, + "grad_norm": 3.2706258296966553, + "learning_rate": 3.986253499797765e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8481630995869637, + "num_tokens": 156284035.0, + "step": 129930 + }, + { + "entropy": 1.8868813574314118, + "epoch": 0.402802630895743, + "grad_norm": 8.877392768859863, + "learning_rate": 3.986100107429135e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8435383632779121, + "num_tokens": 156296009.0, + "step": 129940 + }, + { + "entropy": 1.9167783632874489, + "epoch": 0.40283363002079264, + "grad_norm": 8.852008819580078, + "learning_rate": 3.985946732766913e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8485684707760811, + "num_tokens": 156307643.0, + "step": 129950 + }, + { + "entropy": 1.9228922203183174, + "epoch": 0.40286462914584237, + "grad_norm": 7.26152229309082, + "learning_rate": 3.98579337580769e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8554493680596351, + "num_tokens": 156319030.0, + "step": 129960 + }, + { + "entropy": 1.874145193397999, + "epoch": 0.40289562827089204, + "grad_norm": 8.481700897216797, + "learning_rate": 3.985640036548062e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8435692340135574, + "num_tokens": 156331237.0, + "step": 129970 + }, + { + "entropy": 1.8589432962238788, + "epoch": 0.40292662739594176, + "grad_norm": 7.1247687339782715, + "learning_rate": 3.985486714984625e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8517856851220131, + "num_tokens": 156344722.0, + "step": 129980 + }, + { + "entropy": 1.9143261551856994, + "epoch": 0.40295762652099143, + "grad_norm": 9.23549747467041, + "learning_rate": 3.985333411113975e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8484362363815308, + "num_tokens": 156356528.0, + "step": 129990 + }, + { + "entropy": 1.9343707114458084, + "epoch": 0.40298862564604115, + "grad_norm": 8.890726089477539, + "learning_rate": 3.985180124932709e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8550547063350677, + "num_tokens": 156367827.0, + "step": 130000 + }, + { + "entropy": 1.8906396552920341, + "epoch": 0.4030196247710908, + "grad_norm": 3.963322639465332, + "learning_rate": 3.9850268564374256e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8531676039099694, + "num_tokens": 156379328.0, + "step": 130010 + }, + { + "entropy": 1.76583993434906, + "epoch": 0.40305062389614055, + "grad_norm": 9.098811149597168, + "learning_rate": 3.9848736056247245e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8658165216445923, + "num_tokens": 156392630.0, + "step": 130020 + }, + { + "entropy": 1.838332974910736, + "epoch": 0.4030816230211902, + "grad_norm": 8.506731033325195, + "learning_rate": 3.984720372491206e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8585373058915138, + "num_tokens": 156405536.0, + "step": 130030 + }, + { + "entropy": 1.806590475142002, + "epoch": 0.40311262214623994, + "grad_norm": 5.69447660446167, + "learning_rate": 3.984567157033471e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8464230895042419, + "num_tokens": 156418437.0, + "step": 130040 + }, + { + "entropy": 1.9229248493909836, + "epoch": 0.4031436212712896, + "grad_norm": 8.785764694213867, + "learning_rate": 3.9844139592481195e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8359622538089753, + "num_tokens": 156430560.0, + "step": 130050 + }, + { + "entropy": 1.9032806217670442, + "epoch": 0.40317462039633933, + "grad_norm": 8.77900218963623, + "learning_rate": 3.984260779131759e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8543096274137497, + "num_tokens": 156442730.0, + "step": 130060 + }, + { + "entropy": 1.9101463302969932, + "epoch": 0.403205619521389, + "grad_norm": 7.89210319519043, + "learning_rate": 3.984107616680989e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8459172546863556, + "num_tokens": 156454070.0, + "step": 130070 + }, + { + "entropy": 1.9074820682406426, + "epoch": 0.4032366186464387, + "grad_norm": 9.048776626586914, + "learning_rate": 3.983954471892417e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8488796040415764, + "num_tokens": 156465623.0, + "step": 130080 + }, + { + "entropy": 1.8907459661364556, + "epoch": 0.4032676177714884, + "grad_norm": 9.203822135925293, + "learning_rate": 3.983801344762646e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8531972080469131, + "num_tokens": 156477633.0, + "step": 130090 + }, + { + "entropy": 1.9408058658242227, + "epoch": 0.4032986168965381, + "grad_norm": 8.398517608642578, + "learning_rate": 3.983648235288285e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8516205415129662, + "num_tokens": 156488438.0, + "step": 130100 + }, + { + "entropy": 1.8947420373558999, + "epoch": 0.4033296160215878, + "grad_norm": 7.668551445007324, + "learning_rate": 3.983495143465942e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8498603880405426, + "num_tokens": 156499812.0, + "step": 130110 + }, + { + "entropy": 1.7990976139903068, + "epoch": 0.4033606151466375, + "grad_norm": 3.560288190841675, + "learning_rate": 3.983342069292223e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8639690637588501, + "num_tokens": 156512790.0, + "step": 130120 + }, + { + "entropy": 1.8366239294409752, + "epoch": 0.4033916142716872, + "grad_norm": 9.26343059539795, + "learning_rate": 3.983189012763739e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8648645102977752, + "num_tokens": 156525094.0, + "step": 130130 + }, + { + "entropy": 1.8781366422772408, + "epoch": 0.40342261339673685, + "grad_norm": 7.926459789276123, + "learning_rate": 3.983035973877099e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8611466780304908, + "num_tokens": 156537483.0, + "step": 130140 + }, + { + "entropy": 1.8221728757023812, + "epoch": 0.4034536125217866, + "grad_norm": 3.698610782623291, + "learning_rate": 3.982882952628916e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8600765854120255, + "num_tokens": 156549954.0, + "step": 130150 + }, + { + "entropy": 1.9444189876317979, + "epoch": 0.40348461164683624, + "grad_norm": 8.673882484436035, + "learning_rate": 3.9827299490158e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8447524651885032, + "num_tokens": 156561547.0, + "step": 130160 + }, + { + "entropy": 1.836028276383877, + "epoch": 0.40351561077188597, + "grad_norm": 9.08564567565918, + "learning_rate": 3.982576963034364e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8561201602220535, + "num_tokens": 156574641.0, + "step": 130170 + }, + { + "entropy": 1.918661078810692, + "epoch": 0.40354660989693564, + "grad_norm": 7.405217170715332, + "learning_rate": 3.982423994681225e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8417747050523758, + "num_tokens": 156586063.0, + "step": 130180 + }, + { + "entropy": 1.8970381796360016, + "epoch": 0.40357760902198536, + "grad_norm": 8.172847747802734, + "learning_rate": 3.982271043952995e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8463637545704842, + "num_tokens": 156598100.0, + "step": 130190 + }, + { + "entropy": 1.896133790910244, + "epoch": 0.40360860814703503, + "grad_norm": 2.4702107906341553, + "learning_rate": 3.9821181108462895e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8457187637686729, + "num_tokens": 156609713.0, + "step": 130200 + }, + { + "entropy": 1.8511537820100785, + "epoch": 0.40363960727208475, + "grad_norm": 3.5449514389038086, + "learning_rate": 3.981965195357727e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8625165119767189, + "num_tokens": 156622527.0, + "step": 130210 + }, + { + "entropy": 1.8973491430282592, + "epoch": 0.4036706063971344, + "grad_norm": 7.712673664093018, + "learning_rate": 3.981812297483923e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8516885727643967, + "num_tokens": 156634254.0, + "step": 130220 + }, + { + "entropy": 1.8966005221009254, + "epoch": 0.40370160552218415, + "grad_norm": 8.126500129699707, + "learning_rate": 3.981659417221498e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8538167625665665, + "num_tokens": 156646618.0, + "step": 130230 + }, + { + "entropy": 1.85729219019413, + "epoch": 0.4037326046472338, + "grad_norm": 9.697176933288574, + "learning_rate": 3.98150655456707e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8491919696331024, + "num_tokens": 156659338.0, + "step": 130240 + }, + { + "entropy": 1.8698885142803192, + "epoch": 0.40376360377228354, + "grad_norm": 7.321453094482422, + "learning_rate": 3.981353709517259e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.850645835697651, + "num_tokens": 156671170.0, + "step": 130250 + }, + { + "entropy": 1.8850924864411354, + "epoch": 0.4037946028973332, + "grad_norm": 8.737915992736816, + "learning_rate": 3.9812008820686864e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8499901428818702, + "num_tokens": 156683391.0, + "step": 130260 + }, + { + "entropy": 1.9188417434692382, + "epoch": 0.40382560202238293, + "grad_norm": 8.441498756408691, + "learning_rate": 3.981048072217976e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8469459965825081, + "num_tokens": 156695474.0, + "step": 130270 + }, + { + "entropy": 1.832290355861187, + "epoch": 0.4038566011474326, + "grad_norm": 5.164080619812012, + "learning_rate": 3.980895279961748e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8454208686947823, + "num_tokens": 156708620.0, + "step": 130280 + }, + { + "entropy": 1.9142166703939438, + "epoch": 0.40388760027248233, + "grad_norm": 7.258204936981201, + "learning_rate": 3.980742505296629e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8631983742117881, + "num_tokens": 156720342.0, + "step": 130290 + }, + { + "entropy": 1.9650160878896714, + "epoch": 0.403918599397532, + "grad_norm": 7.455199241638184, + "learning_rate": 3.980589748219241e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.8472874984145164, + "num_tokens": 156732490.0, + "step": 130300 + }, + { + "entropy": 1.816853478550911, + "epoch": 0.4039495985225817, + "grad_norm": 3.841028928756714, + "learning_rate": 3.980437008726212e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8634612441062928, + "num_tokens": 156745943.0, + "step": 130310 + }, + { + "entropy": 1.9299751341342926, + "epoch": 0.4039805976476314, + "grad_norm": 4.5318474769592285, + "learning_rate": 3.980284286814167e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8458565920591354, + "num_tokens": 156757351.0, + "step": 130320 + }, + { + "entropy": 1.8615489616990089, + "epoch": 0.4040115967726811, + "grad_norm": 8.26394271850586, + "learning_rate": 3.980131582479735e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8518011167645454, + "num_tokens": 156768452.0, + "step": 130330 + }, + { + "entropy": 1.9241826206445694, + "epoch": 0.4040425958977308, + "grad_norm": 9.016687393188477, + "learning_rate": 3.979978895719543e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8521223112940788, + "num_tokens": 156779453.0, + "step": 130340 + }, + { + "entropy": 2.0048281461000443, + "epoch": 0.4040735950227805, + "grad_norm": 8.775766372680664, + "learning_rate": 3.979826226530221e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8402179971337318, + "num_tokens": 156789976.0, + "step": 130350 + }, + { + "entropy": 1.8103154242038726, + "epoch": 0.4041045941478302, + "grad_norm": 3.5552315711975098, + "learning_rate": 3.9796735749084e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8577532470226288, + "num_tokens": 156803981.0, + "step": 130360 + }, + { + "entropy": 1.9318371683359146, + "epoch": 0.4041355932728799, + "grad_norm": 8.001313209533691, + "learning_rate": 3.97952094085071e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8499713644385338, + "num_tokens": 156816659.0, + "step": 130370 + }, + { + "entropy": 1.9495342776179314, + "epoch": 0.40416659239792957, + "grad_norm": 4.196635723114014, + "learning_rate": 3.979368324353783e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8435593828558922, + "num_tokens": 156828530.0, + "step": 130380 + }, + { + "entropy": 1.8581201523542403, + "epoch": 0.40419759152297924, + "grad_norm": 7.378039836883545, + "learning_rate": 3.979215725414253e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8562660500407219, + "num_tokens": 156841203.0, + "step": 130390 + }, + { + "entropy": 1.8645180001854897, + "epoch": 0.40422859064802896, + "grad_norm": 7.564363956451416, + "learning_rate": 3.9790631440287516e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8670828878879547, + "num_tokens": 156853244.0, + "step": 130400 + }, + { + "entropy": 1.9289032772183419, + "epoch": 0.40425958977307863, + "grad_norm": 4.7226481437683105, + "learning_rate": 3.978910580193916e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8462576329708099, + "num_tokens": 156864953.0, + "step": 130410 + }, + { + "entropy": 1.9896679311990737, + "epoch": 0.40429058889812836, + "grad_norm": 9.488637924194336, + "learning_rate": 3.978758033906382e-06, + "loss": 0.5219, + "mean_token_accuracy": 0.8406891971826553, + "num_tokens": 156875482.0, + "step": 130420 + }, + { + "entropy": 1.859781025350094, + "epoch": 0.404321588023178, + "grad_norm": 3.904017210006714, + "learning_rate": 3.978605505162784e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8608370333909988, + "num_tokens": 156887299.0, + "step": 130430 + }, + { + "entropy": 1.8310018733143807, + "epoch": 0.40435258714822775, + "grad_norm": 8.23388671875, + "learning_rate": 3.978452993959761e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8504728376865387, + "num_tokens": 156900461.0, + "step": 130440 + }, + { + "entropy": 1.877791903913021, + "epoch": 0.4043835862732774, + "grad_norm": 7.384108066558838, + "learning_rate": 3.978300500293951e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8471254542469978, + "num_tokens": 156913284.0, + "step": 130450 + }, + { + "entropy": 1.6766640424728394, + "epoch": 0.40441458539832714, + "grad_norm": 3.489778995513916, + "learning_rate": 3.978148024161993e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.873348993062973, + "num_tokens": 156928424.0, + "step": 130460 + }, + { + "entropy": 1.8710533007979393, + "epoch": 0.4044455845233768, + "grad_norm": 6.751518726348877, + "learning_rate": 3.977995565560528e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8581561148166656, + "num_tokens": 156940418.0, + "step": 130470 + }, + { + "entropy": 1.8982125908136367, + "epoch": 0.40447658364842654, + "grad_norm": 7.623547554016113, + "learning_rate": 3.977843124486196e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.859375411272049, + "num_tokens": 156952679.0, + "step": 130480 + }, + { + "entropy": 1.9297777190804482, + "epoch": 0.4045075827734762, + "grad_norm": 8.541840553283691, + "learning_rate": 3.977690700935639e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8506446555256844, + "num_tokens": 156964447.0, + "step": 130490 + }, + { + "entropy": 1.8197549387812615, + "epoch": 0.40453858189852593, + "grad_norm": 4.050614833831787, + "learning_rate": 3.9775382949055e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8605873674154282, + "num_tokens": 156977447.0, + "step": 130500 + }, + { + "entropy": 1.7729148700833322, + "epoch": 0.4045695810235756, + "grad_norm": 4.528966903686523, + "learning_rate": 3.977385906392423e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8583493396639824, + "num_tokens": 156991476.0, + "step": 130510 + }, + { + "entropy": 1.9727448597550392, + "epoch": 0.4046005801486253, + "grad_norm": 9.29497241973877, + "learning_rate": 3.977233535393054e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.843626768887043, + "num_tokens": 157002963.0, + "step": 130520 + }, + { + "entropy": 1.9293553322553634, + "epoch": 0.404631579273675, + "grad_norm": 9.896842956542969, + "learning_rate": 3.977081181904036e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8478871420025825, + "num_tokens": 157014701.0, + "step": 130530 + }, + { + "entropy": 1.977966983616352, + "epoch": 0.4046625783987247, + "grad_norm": 8.197452545166016, + "learning_rate": 3.976928845922018e-06, + "loss": 0.5224, + "mean_token_accuracy": 0.8346485048532486, + "num_tokens": 157026114.0, + "step": 130540 + }, + { + "entropy": 1.9490982070565224, + "epoch": 0.4046935775237744, + "grad_norm": 3.955064058303833, + "learning_rate": 3.976776527443644e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8499052435159683, + "num_tokens": 157038105.0, + "step": 130550 + }, + { + "entropy": 1.921256160736084, + "epoch": 0.4047245766488241, + "grad_norm": 7.925784111022949, + "learning_rate": 3.9766242264655655e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.8429896846413613, + "num_tokens": 157050666.0, + "step": 130560 + }, + { + "entropy": 1.9208467230200768, + "epoch": 0.4047555757738738, + "grad_norm": 4.279351711273193, + "learning_rate": 3.976471942984431e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8471018105745316, + "num_tokens": 157062556.0, + "step": 130570 + }, + { + "entropy": 1.8374285951256752, + "epoch": 0.4047865748989235, + "grad_norm": 8.640376091003418, + "learning_rate": 3.976319676996889e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8620464310050011, + "num_tokens": 157075344.0, + "step": 130580 + }, + { + "entropy": 1.9202581122517586, + "epoch": 0.40481757402397317, + "grad_norm": 7.99689245223999, + "learning_rate": 3.976167428499592e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8536014124751091, + "num_tokens": 157086341.0, + "step": 130590 + }, + { + "entropy": 1.9324933484196662, + "epoch": 0.4048485731490229, + "grad_norm": 6.284863471984863, + "learning_rate": 3.976015197489192e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8359458863735199, + "num_tokens": 157098510.0, + "step": 130600 + }, + { + "entropy": 1.811049999296665, + "epoch": 0.40487957227407256, + "grad_norm": 8.513614654541016, + "learning_rate": 3.97586298396234e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8566889658570289, + "num_tokens": 157111635.0, + "step": 130610 + }, + { + "entropy": 1.895810031890869, + "epoch": 0.40491057139912223, + "grad_norm": 8.150436401367188, + "learning_rate": 3.975710787915691e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8592909917235374, + "num_tokens": 157123381.0, + "step": 130620 + }, + { + "entropy": 1.8806810095906257, + "epoch": 0.40494157052417196, + "grad_norm": 9.090899467468262, + "learning_rate": 3.9755586093459e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8572476267814636, + "num_tokens": 157135740.0, + "step": 130630 + }, + { + "entropy": 1.7848129168152809, + "epoch": 0.4049725696492216, + "grad_norm": 4.283092975616455, + "learning_rate": 3.97540644824962e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8615420237183571, + "num_tokens": 157148676.0, + "step": 130640 + }, + { + "entropy": 1.8715970084071158, + "epoch": 0.40500356877427135, + "grad_norm": 9.602254867553711, + "learning_rate": 3.975254304623512e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8585036590695381, + "num_tokens": 157161564.0, + "step": 130650 + }, + { + "entropy": 1.8966206587851047, + "epoch": 0.405034567899321, + "grad_norm": 6.511444091796875, + "learning_rate": 3.975102178464229e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8608822152018547, + "num_tokens": 157174560.0, + "step": 130660 + }, + { + "entropy": 1.9423101127147675, + "epoch": 0.40506556702437074, + "grad_norm": 8.167618751525879, + "learning_rate": 3.974950069768429e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8432308033108711, + "num_tokens": 157186728.0, + "step": 130670 + }, + { + "entropy": 1.9459773391485213, + "epoch": 0.4050965661494204, + "grad_norm": 8.282353401184082, + "learning_rate": 3.974797978532774e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8470408290624618, + "num_tokens": 157198743.0, + "step": 130680 + }, + { + "entropy": 1.8234005078673363, + "epoch": 0.40512756527447014, + "grad_norm": 3.401383399963379, + "learning_rate": 3.974645904753922e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8553168892860412, + "num_tokens": 157211958.0, + "step": 130690 + }, + { + "entropy": 1.9370823100209236, + "epoch": 0.4051585643995198, + "grad_norm": 6.692420959472656, + "learning_rate": 3.974493848428535e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8524964049458503, + "num_tokens": 157223127.0, + "step": 130700 + }, + { + "entropy": 1.9691085010766982, + "epoch": 0.40518956352456953, + "grad_norm": 8.417125701904297, + "learning_rate": 3.974341809553272e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8546971321105957, + "num_tokens": 157234327.0, + "step": 130710 + }, + { + "entropy": 1.8974324196577073, + "epoch": 0.4052205626496192, + "grad_norm": 9.326712608337402, + "learning_rate": 3.974189788124799e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8539463758468628, + "num_tokens": 157246787.0, + "step": 130720 + }, + { + "entropy": 1.9472648695111274, + "epoch": 0.4052515617746689, + "grad_norm": 8.63284683227539, + "learning_rate": 3.974037784139778e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8510000556707382, + "num_tokens": 157258105.0, + "step": 130730 + }, + { + "entropy": 1.9133790254592895, + "epoch": 0.4052825608997186, + "grad_norm": 7.009464263916016, + "learning_rate": 3.973885797594873e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8659856930375099, + "num_tokens": 157269330.0, + "step": 130740 + }, + { + "entropy": 1.9145527601242065, + "epoch": 0.4053135600247683, + "grad_norm": 6.98771333694458, + "learning_rate": 3.973733828486749e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8578495368361473, + "num_tokens": 157282000.0, + "step": 130750 + }, + { + "entropy": 1.9058576181530953, + "epoch": 0.405344559149818, + "grad_norm": 9.95231819152832, + "learning_rate": 3.9735818768120745e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8475971177220345, + "num_tokens": 157293637.0, + "step": 130760 + }, + { + "entropy": 1.789780667424202, + "epoch": 0.4053755582748677, + "grad_norm": 7.046390533447266, + "learning_rate": 3.973429942567513e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8669392168521881, + "num_tokens": 157306793.0, + "step": 130770 + }, + { + "entropy": 1.988757422566414, + "epoch": 0.4054065573999174, + "grad_norm": 9.767016410827637, + "learning_rate": 3.973278025749736e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8483888059854507, + "num_tokens": 157318126.0, + "step": 130780 + }, + { + "entropy": 1.9013223245739936, + "epoch": 0.4054375565249671, + "grad_norm": 4.095386028289795, + "learning_rate": 3.97312612635541e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8486203476786613, + "num_tokens": 157329583.0, + "step": 130790 + }, + { + "entropy": 1.9669806063175201, + "epoch": 0.40546855565001677, + "grad_norm": 7.4656596183776855, + "learning_rate": 3.9729742443812056e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8445022612810135, + "num_tokens": 157340387.0, + "step": 130800 + }, + { + "entropy": 1.8516760841012, + "epoch": 0.4054995547750665, + "grad_norm": Infinity, + "learning_rate": 3.972822379823793e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8624450042843819, + "num_tokens": 157353043.0, + "step": 130810 + }, + { + "entropy": 1.915142248570919, + "epoch": 0.40553055390011616, + "grad_norm": 7.543905258178711, + "learning_rate": 3.972670532679844e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8492273300886154, + "num_tokens": 157365310.0, + "step": 130820 + }, + { + "entropy": 1.8361800596117974, + "epoch": 0.4055615530251659, + "grad_norm": 4.767913341522217, + "learning_rate": 3.9725187029460316e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8620738372206688, + "num_tokens": 157377616.0, + "step": 130830 + }, + { + "entropy": 1.9048438847064972, + "epoch": 0.40559255215021556, + "grad_norm": 8.4035005569458, + "learning_rate": 3.972366890619029e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8537156358361244, + "num_tokens": 157389121.0, + "step": 130840 + }, + { + "entropy": 1.9738389521837234, + "epoch": 0.4056235512752653, + "grad_norm": 8.513267517089844, + "learning_rate": 3.972215095695508e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8514445766806602, + "num_tokens": 157400361.0, + "step": 130850 + }, + { + "entropy": 1.9617274329066277, + "epoch": 0.40565455040031495, + "grad_norm": 4.258702278137207, + "learning_rate": 3.972063318172147e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.847972746193409, + "num_tokens": 157411570.0, + "step": 130860 + }, + { + "entropy": 1.9130688726902008, + "epoch": 0.4056855495253646, + "grad_norm": 7.041831970214844, + "learning_rate": 3.97191155804562e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8498567163944244, + "num_tokens": 157423298.0, + "step": 130870 + }, + { + "entropy": 1.8860970079898833, + "epoch": 0.40571654865041434, + "grad_norm": 7.607229709625244, + "learning_rate": 3.971759815312605e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8536626756191253, + "num_tokens": 157434605.0, + "step": 130880 + }, + { + "entropy": 1.9061809971928596, + "epoch": 0.405747547775464, + "grad_norm": 7.09719705581665, + "learning_rate": 3.971608089969779e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.848254905641079, + "num_tokens": 157446268.0, + "step": 130890 + }, + { + "entropy": 1.7818841926753521, + "epoch": 0.40577854690051374, + "grad_norm": 4.486595630645752, + "learning_rate": 3.971456382013821e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8559565782546997, + "num_tokens": 157460438.0, + "step": 130900 + }, + { + "entropy": 1.763669066131115, + "epoch": 0.4058095460255634, + "grad_norm": 3.9831771850585938, + "learning_rate": 3.97130469144141e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8683875605463982, + "num_tokens": 157475220.0, + "step": 130910 + }, + { + "entropy": 1.8374385461211205, + "epoch": 0.40584054515061313, + "grad_norm": 8.03730583190918, + "learning_rate": 3.9711530182492266e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.858574740588665, + "num_tokens": 157487520.0, + "step": 130920 + }, + { + "entropy": 1.9221992582082748, + "epoch": 0.4058715442756628, + "grad_norm": 9.426704406738281, + "learning_rate": 3.971001362433953e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8404927179217339, + "num_tokens": 157499623.0, + "step": 130930 + }, + { + "entropy": 1.8548153042793274, + "epoch": 0.4059025434007125, + "grad_norm": 4.058870792388916, + "learning_rate": 3.97084972399227e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8509564101696014, + "num_tokens": 157512269.0, + "step": 130940 + }, + { + "entropy": 1.898500706255436, + "epoch": 0.4059335425257622, + "grad_norm": 7.100831985473633, + "learning_rate": 3.970698102920861e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8517476499080658, + "num_tokens": 157524534.0, + "step": 130950 + }, + { + "entropy": 1.938453209400177, + "epoch": 0.4059645416508119, + "grad_norm": 7.448749542236328, + "learning_rate": 3.970546499216411e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8483160063624382, + "num_tokens": 157535978.0, + "step": 130960 + }, + { + "entropy": 1.8814398035407067, + "epoch": 0.4059955407758616, + "grad_norm": 8.682611465454102, + "learning_rate": 3.970394912875604e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8522828727960586, + "num_tokens": 157548563.0, + "step": 130970 + }, + { + "entropy": 1.9689362928271295, + "epoch": 0.4060265399009113, + "grad_norm": 8.23831558227539, + "learning_rate": 3.970243343895126e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8545093983411789, + "num_tokens": 157559741.0, + "step": 130980 + }, + { + "entropy": 1.8692365869879723, + "epoch": 0.406057539025961, + "grad_norm": 8.109051704406738, + "learning_rate": 3.970091792271663e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8523834586143494, + "num_tokens": 157571939.0, + "step": 130990 + }, + { + "entropy": 1.9243786290287972, + "epoch": 0.4060885381510107, + "grad_norm": 10.623024940490723, + "learning_rate": 3.969940258001903e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8421697750687599, + "num_tokens": 157583545.0, + "step": 131000 + }, + { + "entropy": 1.817786581814289, + "epoch": 0.40611953727606037, + "grad_norm": 9.550358772277832, + "learning_rate": 3.969788741082535e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8594713494181633, + "num_tokens": 157596216.0, + "step": 131010 + }, + { + "entropy": 1.9418366000056266, + "epoch": 0.4061505364011101, + "grad_norm": 7.840337753295898, + "learning_rate": 3.969637241510247e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8537692606449128, + "num_tokens": 157607196.0, + "step": 131020 + }, + { + "entropy": 1.8996048077940941, + "epoch": 0.40618153552615976, + "grad_norm": 8.591992378234863, + "learning_rate": 3.9694857592817295e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8468469053506851, + "num_tokens": 157619202.0, + "step": 131030 + }, + { + "entropy": 1.8347910821437836, + "epoch": 0.4062125346512095, + "grad_norm": 3.285649299621582, + "learning_rate": 3.969334294393675e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8462332352995873, + "num_tokens": 157632132.0, + "step": 131040 + }, + { + "entropy": 1.9242204323410987, + "epoch": 0.40624353377625916, + "grad_norm": 4.810262680053711, + "learning_rate": 3.969182846842773e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.8468155965209008, + "num_tokens": 157643575.0, + "step": 131050 + }, + { + "entropy": 1.9612080991268157, + "epoch": 0.4062745329013089, + "grad_norm": 9.51435375213623, + "learning_rate": 3.9690314166257186e-06, + "loss": 0.5294, + "mean_token_accuracy": 0.8334100112318993, + "num_tokens": 157654699.0, + "step": 131060 + }, + { + "entropy": 1.9367137864232062, + "epoch": 0.40630553202635855, + "grad_norm": 9.918806076049805, + "learning_rate": 3.968880003739205e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.851118703186512, + "num_tokens": 157665786.0, + "step": 131070 + }, + { + "entropy": 1.8767028152942657, + "epoch": 0.4063365311514083, + "grad_norm": 10.18786334991455, + "learning_rate": 3.9687286081799244e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8559330344200134, + "num_tokens": 157678031.0, + "step": 131080 + }, + { + "entropy": 1.9314553529024123, + "epoch": 0.40636753027645794, + "grad_norm": 6.053582191467285, + "learning_rate": 3.9685772299445754e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8622308388352394, + "num_tokens": 157689998.0, + "step": 131090 + }, + { + "entropy": 1.6999683201313018, + "epoch": 0.40639852940150767, + "grad_norm": 5.942327499389648, + "learning_rate": 3.9684258690298525e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8629785016179085, + "num_tokens": 157704433.0, + "step": 131100 + }, + { + "entropy": 1.864445061981678, + "epoch": 0.40642952852655734, + "grad_norm": 7.0887346267700195, + "learning_rate": 3.968274525432454e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8601906552910805, + "num_tokens": 157717013.0, + "step": 131110 + }, + { + "entropy": 1.8925062775611878, + "epoch": 0.406460527651607, + "grad_norm": 10.952113151550293, + "learning_rate": 3.968123199149077e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8586850896477699, + "num_tokens": 157729145.0, + "step": 131120 + }, + { + "entropy": 1.9502949953079223, + "epoch": 0.40649152677665673, + "grad_norm": 10.193359375, + "learning_rate": 3.967971890176421e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8508687302470207, + "num_tokens": 157740267.0, + "step": 131130 + }, + { + "entropy": 1.9118008241057396, + "epoch": 0.4065225259017064, + "grad_norm": 3.9366962909698486, + "learning_rate": 3.967820598511186e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8517689287662507, + "num_tokens": 157751681.0, + "step": 131140 + }, + { + "entropy": 1.9390445798635483, + "epoch": 0.4065535250267561, + "grad_norm": 9.202500343322754, + "learning_rate": 3.9676693241500725e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8437588036060333, + "num_tokens": 157762806.0, + "step": 131150 + }, + { + "entropy": 1.873625774681568, + "epoch": 0.4065845241518058, + "grad_norm": 9.123173713684082, + "learning_rate": 3.967518067089782e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8525558322668075, + "num_tokens": 157775344.0, + "step": 131160 + }, + { + "entropy": 1.8773087307810783, + "epoch": 0.4066155232768555, + "grad_norm": 7.2432684898376465, + "learning_rate": 3.967366827327019e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8548357993364334, + "num_tokens": 157788421.0, + "step": 131170 + }, + { + "entropy": 1.9489660397171975, + "epoch": 0.4066465224019052, + "grad_norm": 8.408989906311035, + "learning_rate": 3.9672156048584825e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8430541291832924, + "num_tokens": 157799818.0, + "step": 131180 + }, + { + "entropy": 1.8156598702073097, + "epoch": 0.4066775215269549, + "grad_norm": 3.8361010551452637, + "learning_rate": 3.9670643996808805e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8656172275543212, + "num_tokens": 157812552.0, + "step": 131190 + }, + { + "entropy": 1.876960425078869, + "epoch": 0.4067085206520046, + "grad_norm": 4.139932632446289, + "learning_rate": 3.966913211790917e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8435041248798371, + "num_tokens": 157824792.0, + "step": 131200 + }, + { + "entropy": 1.8027958139777183, + "epoch": 0.4067395197770543, + "grad_norm": 8.876256942749023, + "learning_rate": 3.966762041185298e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8562049359083176, + "num_tokens": 157837760.0, + "step": 131210 + }, + { + "entropy": 1.845954880863428, + "epoch": 0.406770518902104, + "grad_norm": 5.402946949005127, + "learning_rate": 3.966610887860731e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8410774037241936, + "num_tokens": 157850821.0, + "step": 131220 + }, + { + "entropy": 1.9590726420283318, + "epoch": 0.4068015180271537, + "grad_norm": 9.471705436706543, + "learning_rate": 3.966459751813921e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8433647140860557, + "num_tokens": 157862272.0, + "step": 131230 + }, + { + "entropy": 1.8670239843428136, + "epoch": 0.40683251715220337, + "grad_norm": 8.589696884155273, + "learning_rate": 3.966308633041582e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8549284100532532, + "num_tokens": 157874516.0, + "step": 131240 + }, + { + "entropy": 1.8836659103631974, + "epoch": 0.4068635162772531, + "grad_norm": 7.1528449058532715, + "learning_rate": 3.9661575315404185e-06, + "loss": 0.536, + "mean_token_accuracy": 0.8425373405218124, + "num_tokens": 157887098.0, + "step": 131250 + }, + { + "entropy": 1.9439944818615913, + "epoch": 0.40689451540230276, + "grad_norm": 4.756436824798584, + "learning_rate": 3.966006447307143e-06, + "loss": 0.5211, + "mean_token_accuracy": 0.8354384452104568, + "num_tokens": 157898487.0, + "step": 131260 + }, + { + "entropy": 1.9490654364228248, + "epoch": 0.4069255145273525, + "grad_norm": 8.603204727172852, + "learning_rate": 3.965855380338467e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.841651976108551, + "num_tokens": 157909630.0, + "step": 131270 + }, + { + "entropy": 1.831677147746086, + "epoch": 0.40695651365240215, + "grad_norm": 4.0721611976623535, + "learning_rate": 3.965704330631102e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8579563215374947, + "num_tokens": 157922831.0, + "step": 131280 + }, + { + "entropy": 1.8643750965595245, + "epoch": 0.4069875127774519, + "grad_norm": 9.7418851852417, + "learning_rate": 3.965553298181761e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.848840269446373, + "num_tokens": 157934557.0, + "step": 131290 + }, + { + "entropy": 1.9244591280817986, + "epoch": 0.40701851190250155, + "grad_norm": 7.745121955871582, + "learning_rate": 3.965402282987159e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8503209039568901, + "num_tokens": 157945931.0, + "step": 131300 + }, + { + "entropy": 1.866335666179657, + "epoch": 0.40704951102755127, + "grad_norm": 8.668558120727539, + "learning_rate": 3.96525128504401e-06, + "loss": 0.464, + "mean_token_accuracy": 0.849652573466301, + "num_tokens": 157958476.0, + "step": 131310 + }, + { + "entropy": 1.8344866186380386, + "epoch": 0.40708051015260094, + "grad_norm": 7.905651569366455, + "learning_rate": 3.965100304349029e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.859797616302967, + "num_tokens": 157971189.0, + "step": 131320 + }, + { + "entropy": 1.9577922523021698, + "epoch": 0.40711150927765066, + "grad_norm": 7.413038730621338, + "learning_rate": 3.964949340898934e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8549491494894028, + "num_tokens": 157981983.0, + "step": 131330 + }, + { + "entropy": 1.9203011736273765, + "epoch": 0.40714250840270033, + "grad_norm": 9.896458625793457, + "learning_rate": 3.9647983946904416e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8488733306527138, + "num_tokens": 157993609.0, + "step": 131340 + }, + { + "entropy": 1.9292757645249368, + "epoch": 0.40717350752775006, + "grad_norm": 9.621637344360352, + "learning_rate": 3.964647465720271e-06, + "loss": 0.5658, + "mean_token_accuracy": 0.8377192333340645, + "num_tokens": 158005340.0, + "step": 131350 + }, + { + "entropy": 1.9438342347741127, + "epoch": 0.4072045066527997, + "grad_norm": 7.706085681915283, + "learning_rate": 3.964496553985139e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8447881907224655, + "num_tokens": 158016548.0, + "step": 131360 + }, + { + "entropy": 1.825490552186966, + "epoch": 0.4072355057778494, + "grad_norm": 4.330419063568115, + "learning_rate": 3.964345659481768e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8532146289944649, + "num_tokens": 158029148.0, + "step": 131370 + }, + { + "entropy": 1.8673056378960609, + "epoch": 0.4072665049028991, + "grad_norm": 7.6978230476379395, + "learning_rate": 3.964194782206878e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8550721898674964, + "num_tokens": 158041404.0, + "step": 131380 + }, + { + "entropy": 1.8040344282984733, + "epoch": 0.4072975040279488, + "grad_norm": 5.431396007537842, + "learning_rate": 3.964043922157191e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8702194571495057, + "num_tokens": 158054518.0, + "step": 131390 + }, + { + "entropy": 1.8987352877855301, + "epoch": 0.4073285031529985, + "grad_norm": 7.753914833068848, + "learning_rate": 3.963893079329429e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8489699825644493, + "num_tokens": 158065523.0, + "step": 131400 + }, + { + "entropy": 1.918555736541748, + "epoch": 0.4073595022780482, + "grad_norm": 8.709218978881836, + "learning_rate": 3.9637422537203165e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8495317488908768, + "num_tokens": 158077808.0, + "step": 131410 + }, + { + "entropy": 1.9698147997260094, + "epoch": 0.4073905014030979, + "grad_norm": 8.470252990722656, + "learning_rate": 3.963591445326578e-06, + "loss": 0.5221, + "mean_token_accuracy": 0.8412455469369888, + "num_tokens": 158089356.0, + "step": 131420 + }, + { + "entropy": 1.7146087184548378, + "epoch": 0.4074215005281476, + "grad_norm": 3.0368010997772217, + "learning_rate": 3.963440654144938e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8623756617307663, + "num_tokens": 158104069.0, + "step": 131430 + }, + { + "entropy": 1.9334975138306618, + "epoch": 0.4074524996531973, + "grad_norm": 7.6624884605407715, + "learning_rate": 3.963289880172123e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8561296299099922, + "num_tokens": 158115199.0, + "step": 131440 + }, + { + "entropy": 1.9267579466104507, + "epoch": 0.40748349877824697, + "grad_norm": 7.29675817489624, + "learning_rate": 3.96313912340486e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8549336135387421, + "num_tokens": 158127014.0, + "step": 131450 + }, + { + "entropy": 1.8517323270440103, + "epoch": 0.4075144979032967, + "grad_norm": Infinity, + "learning_rate": 3.962988383839877e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8570884570479393, + "num_tokens": 158139840.0, + "step": 131460 + }, + { + "entropy": 1.9198008626699448, + "epoch": 0.40754549702834636, + "grad_norm": 9.076776504516602, + "learning_rate": 3.962837661473903e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8622155413031578, + "num_tokens": 158150997.0, + "step": 131470 + }, + { + "entropy": 1.9445372179150582, + "epoch": 0.4075764961533961, + "grad_norm": 9.01667308807373, + "learning_rate": 3.962686956303667e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.8443640530109405, + "num_tokens": 158162446.0, + "step": 131480 + }, + { + "entropy": 1.8989515826106071, + "epoch": 0.40760749527844575, + "grad_norm": 4.06826639175415, + "learning_rate": 3.9625362683259e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8507105484604836, + "num_tokens": 158174917.0, + "step": 131490 + }, + { + "entropy": 1.8795487105846405, + "epoch": 0.4076384944034955, + "grad_norm": 4.378306865692139, + "learning_rate": 3.962385597537333e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8498224586248397, + "num_tokens": 158187150.0, + "step": 131500 + }, + { + "entropy": 1.709404329955578, + "epoch": 0.40766949352854515, + "grad_norm": 7.759102821350098, + "learning_rate": 3.9622349439346985e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8657671257853508, + "num_tokens": 158201223.0, + "step": 131510 + }, + { + "entropy": 1.9634048074483872, + "epoch": 0.40770049265359487, + "grad_norm": 9.048603057861328, + "learning_rate": 3.962084307514729e-06, + "loss": 0.5274, + "mean_token_accuracy": 0.8406730964779854, + "num_tokens": 158212564.0, + "step": 131520 + }, + { + "entropy": 1.9156591862440109, + "epoch": 0.40773149177864454, + "grad_norm": 8.055933952331543, + "learning_rate": 3.9619336882741595e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8517060205340385, + "num_tokens": 158224398.0, + "step": 131530 + }, + { + "entropy": 1.851639135181904, + "epoch": 0.40776249090369426, + "grad_norm": 4.349390029907227, + "learning_rate": 3.961783086209726e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8623872861266136, + "num_tokens": 158237555.0, + "step": 131540 + }, + { + "entropy": 1.8899920910596848, + "epoch": 0.40779349002874393, + "grad_norm": 6.555295467376709, + "learning_rate": 3.96163250131816e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8546891719102859, + "num_tokens": 158249575.0, + "step": 131550 + }, + { + "entropy": 1.8582070901989938, + "epoch": 0.40782448915379366, + "grad_norm": 8.720921516418457, + "learning_rate": 3.961481933596203e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8513376504182816, + "num_tokens": 158262561.0, + "step": 131560 + }, + { + "entropy": 1.9029300913214684, + "epoch": 0.4078554882788433, + "grad_norm": 7.594869136810303, + "learning_rate": 3.9613313830405895e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8554488077759743, + "num_tokens": 158274086.0, + "step": 131570 + }, + { + "entropy": 1.8785912677645684, + "epoch": 0.40788648740389305, + "grad_norm": 4.259986400604248, + "learning_rate": 3.961180849648059e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8466733306646347, + "num_tokens": 158286621.0, + "step": 131580 + }, + { + "entropy": 1.8832870185375215, + "epoch": 0.4079174865289427, + "grad_norm": 3.7143633365631104, + "learning_rate": 3.961030333415349e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8520120859146119, + "num_tokens": 158298950.0, + "step": 131590 + }, + { + "entropy": 1.8503879860043526, + "epoch": 0.40794848565399244, + "grad_norm": 6.5647478103637695, + "learning_rate": 3.960879834339202e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8644073367118835, + "num_tokens": 158311634.0, + "step": 131600 + }, + { + "entropy": 1.785201308131218, + "epoch": 0.4079794847790421, + "grad_norm": 8.026589393615723, + "learning_rate": 3.960729352416358e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.849951197206974, + "num_tokens": 158325117.0, + "step": 131610 + }, + { + "entropy": 1.9552603572607041, + "epoch": 0.4080104839040918, + "grad_norm": 8.554306983947754, + "learning_rate": 3.960578887643557e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.846280574798584, + "num_tokens": 158336274.0, + "step": 131620 + }, + { + "entropy": 1.8943400636315346, + "epoch": 0.4080414830291415, + "grad_norm": 9.436079978942871, + "learning_rate": 3.960428440017544e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8490863978862763, + "num_tokens": 158347481.0, + "step": 131630 + }, + { + "entropy": 1.8362084731459618, + "epoch": 0.4080724821541912, + "grad_norm": 4.264366626739502, + "learning_rate": 3.960278009535063e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8494585782289505, + "num_tokens": 158360076.0, + "step": 131640 + }, + { + "entropy": 1.9207329094409942, + "epoch": 0.4081034812792409, + "grad_norm": 8.190292358398438, + "learning_rate": 3.960127596192855e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8595984593033791, + "num_tokens": 158371437.0, + "step": 131650 + }, + { + "entropy": 1.9442640289664268, + "epoch": 0.40813448040429057, + "grad_norm": 8.884259223937988, + "learning_rate": 3.959977199987669e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8574367508292198, + "num_tokens": 158382706.0, + "step": 131660 + }, + { + "entropy": 1.8724360197782517, + "epoch": 0.4081654795293403, + "grad_norm": 4.065852642059326, + "learning_rate": 3.959826820916251e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8485496342182159, + "num_tokens": 158394833.0, + "step": 131670 + }, + { + "entropy": 1.9313586950302124, + "epoch": 0.40819647865438996, + "grad_norm": 4.444271087646484, + "learning_rate": 3.9596764589753435e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8469718486070633, + "num_tokens": 158406591.0, + "step": 131680 + }, + { + "entropy": 1.9100508004426957, + "epoch": 0.4082274777794397, + "grad_norm": 4.082368850708008, + "learning_rate": 3.959526114161699e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8586499407887459, + "num_tokens": 158418411.0, + "step": 131690 + }, + { + "entropy": 1.8683311700820924, + "epoch": 0.40825847690448935, + "grad_norm": 3.7615411281585693, + "learning_rate": 3.959375786472065e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8578609451651573, + "num_tokens": 158430806.0, + "step": 131700 + }, + { + "entropy": 1.902983796596527, + "epoch": 0.4082894760295391, + "grad_norm": 6.945703983306885, + "learning_rate": 3.95922547590319e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8584466874599457, + "num_tokens": 158442686.0, + "step": 131710 + }, + { + "entropy": 1.8431276768445968, + "epoch": 0.40832047515458875, + "grad_norm": 8.36220932006836, + "learning_rate": 3.959075182451826e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8638790413737297, + "num_tokens": 158455043.0, + "step": 131720 + }, + { + "entropy": 1.8704046532511711, + "epoch": 0.40835147427963847, + "grad_norm": 8.115221977233887, + "learning_rate": 3.958924906114722e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8541171133518219, + "num_tokens": 158467164.0, + "step": 131730 + }, + { + "entropy": 1.89015693962574, + "epoch": 0.40838247340468814, + "grad_norm": 8.641586303710938, + "learning_rate": 3.958774646888633e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.8427566275000572, + "num_tokens": 158479113.0, + "step": 131740 + }, + { + "entropy": 1.917370368540287, + "epoch": 0.40841347252973786, + "grad_norm": 8.092529296875, + "learning_rate": 3.958624404770311e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8460395202040673, + "num_tokens": 158490501.0, + "step": 131750 + }, + { + "entropy": 1.8727830082178116, + "epoch": 0.40844447165478753, + "grad_norm": 4.2208404541015625, + "learning_rate": 3.95847417975651e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.854328741133213, + "num_tokens": 158502344.0, + "step": 131760 + }, + { + "entropy": 1.8874020084738732, + "epoch": 0.40847547077983726, + "grad_norm": 8.900503158569336, + "learning_rate": 3.958323971843983e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8399735167622566, + "num_tokens": 158514713.0, + "step": 131770 + }, + { + "entropy": 1.9528047412633895, + "epoch": 0.4085064699048869, + "grad_norm": 6.548738956451416, + "learning_rate": 3.958173781029487e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8562960177659988, + "num_tokens": 158526164.0, + "step": 131780 + }, + { + "entropy": 1.8597876712679864, + "epoch": 0.40853746902993665, + "grad_norm": 4.265164375305176, + "learning_rate": 3.95802360730978e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.861919678747654, + "num_tokens": 158538663.0, + "step": 131790 + }, + { + "entropy": 1.874061642587185, + "epoch": 0.4085684681549863, + "grad_norm": 3.2929224967956543, + "learning_rate": 3.957873450681617e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8565281003713607, + "num_tokens": 158550773.0, + "step": 131800 + }, + { + "entropy": 1.943674847483635, + "epoch": 0.40859946728003604, + "grad_norm": 9.038628578186035, + "learning_rate": 3.9577233111417575e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8586441352963448, + "num_tokens": 158562087.0, + "step": 131810 + }, + { + "entropy": 1.88618975430727, + "epoch": 0.4086304664050857, + "grad_norm": 8.164410591125488, + "learning_rate": 3.95757318868696e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.8438228338956832, + "num_tokens": 158573774.0, + "step": 131820 + }, + { + "entropy": 1.9306119337677956, + "epoch": 0.40866146553013544, + "grad_norm": 9.125570297241211, + "learning_rate": 3.957423083313984e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.839359101653099, + "num_tokens": 158585857.0, + "step": 131830 + }, + { + "entropy": 1.9744738072156907, + "epoch": 0.4086924646551851, + "grad_norm": 8.555429458618164, + "learning_rate": 3.957272995019592e-06, + "loss": 0.5472, + "mean_token_accuracy": 0.8404243916273118, + "num_tokens": 158597937.0, + "step": 131840 + }, + { + "entropy": 1.9038583174347878, + "epoch": 0.40872346378023483, + "grad_norm": 2.9242498874664307, + "learning_rate": 3.9571229238005436e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8589347749948502, + "num_tokens": 158610210.0, + "step": 131850 + }, + { + "entropy": 1.9494069427251817, + "epoch": 0.4087544629052845, + "grad_norm": 3.915905237197876, + "learning_rate": 3.956972869653602e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8493471398949624, + "num_tokens": 158621519.0, + "step": 131860 + }, + { + "entropy": 1.8807632595300674, + "epoch": 0.40878546203033417, + "grad_norm": 7.95693302154541, + "learning_rate": 3.956822832575532e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8504085868597031, + "num_tokens": 158633877.0, + "step": 131870 + }, + { + "entropy": 1.971236227452755, + "epoch": 0.4088164611553839, + "grad_norm": 7.221638202667236, + "learning_rate": 3.956672812563096e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.8400038599967956, + "num_tokens": 158645327.0, + "step": 131880 + }, + { + "entropy": 1.9444902956485748, + "epoch": 0.40884746028043356, + "grad_norm": 8.52728271484375, + "learning_rate": 3.956522809613061e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8471096143126488, + "num_tokens": 158657499.0, + "step": 131890 + }, + { + "entropy": 1.8855510473251342, + "epoch": 0.4088784594054833, + "grad_norm": 6.880878448486328, + "learning_rate": 3.95637282372219e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.856865806877613, + "num_tokens": 158669407.0, + "step": 131900 + }, + { + "entropy": 1.9185251086950301, + "epoch": 0.40890945853053295, + "grad_norm": 8.515263557434082, + "learning_rate": 3.956222854887252e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8487884759902954, + "num_tokens": 158681430.0, + "step": 131910 + }, + { + "entropy": 1.8058866739273072, + "epoch": 0.4089404576555827, + "grad_norm": 3.930187463760376, + "learning_rate": 3.956072903105014e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8495476976037025, + "num_tokens": 158695298.0, + "step": 131920 + }, + { + "entropy": 1.9411882251501082, + "epoch": 0.40897145678063235, + "grad_norm": 6.896012306213379, + "learning_rate": 3.955922968372246e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8471732020378113, + "num_tokens": 158706166.0, + "step": 131930 + }, + { + "entropy": 1.9487490490078927, + "epoch": 0.40900245590568207, + "grad_norm": 7.6823015213012695, + "learning_rate": 3.955773050685715e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8591760620474815, + "num_tokens": 158717557.0, + "step": 131940 + }, + { + "entropy": 1.855629739165306, + "epoch": 0.40903345503073174, + "grad_norm": 7.680805683135986, + "learning_rate": 3.955623150042193e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8519311249256134, + "num_tokens": 158730328.0, + "step": 131950 + }, + { + "entropy": 1.9119280502200127, + "epoch": 0.40906445415578147, + "grad_norm": 7.97050666809082, + "learning_rate": 3.9554732664384495e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8517009600996971, + "num_tokens": 158742681.0, + "step": 131960 + }, + { + "entropy": 1.8887418761849404, + "epoch": 0.40909545328083113, + "grad_norm": 4.2701802253723145, + "learning_rate": 3.955323399871258e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8548057928681374, + "num_tokens": 158754792.0, + "step": 131970 + }, + { + "entropy": 1.955525654554367, + "epoch": 0.40912645240588086, + "grad_norm": 8.67664623260498, + "learning_rate": 3.955173550337391e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.8389379262924195, + "num_tokens": 158766595.0, + "step": 131980 + }, + { + "entropy": 1.9655676484107971, + "epoch": 0.4091574515309305, + "grad_norm": 7.483652591705322, + "learning_rate": 3.955023717833621e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8467064067721367, + "num_tokens": 158777624.0, + "step": 131990 + }, + { + "entropy": 1.8954958975315095, + "epoch": 0.40918845065598025, + "grad_norm": 8.60129165649414, + "learning_rate": 3.954873902356724e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8588397979736329, + "num_tokens": 158789808.0, + "step": 132000 + }, + { + "entropy": 1.9237147703766824, + "epoch": 0.4092194497810299, + "grad_norm": 7.778072357177734, + "learning_rate": 3.9547241039034745e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8559079930186272, + "num_tokens": 158801021.0, + "step": 132010 + }, + { + "entropy": 1.9441052988171577, + "epoch": 0.40925044890607964, + "grad_norm": 8.119583129882812, + "learning_rate": 3.954574322470649e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8437977254390716, + "num_tokens": 158812670.0, + "step": 132020 + }, + { + "entropy": 1.9320714622735977, + "epoch": 0.4092814480311293, + "grad_norm": 7.461652755737305, + "learning_rate": 3.954424558055025e-06, + "loss": 0.568, + "mean_token_accuracy": 0.842699658870697, + "num_tokens": 158823864.0, + "step": 132030 + }, + { + "entropy": 1.9182468011975289, + "epoch": 0.40931244715617904, + "grad_norm": 7.672018051147461, + "learning_rate": 3.954274810653379e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8570101261138916, + "num_tokens": 158835728.0, + "step": 132040 + }, + { + "entropy": 1.9373611778020858, + "epoch": 0.4093434462812287, + "grad_norm": 8.900008201599121, + "learning_rate": 3.954125080262492e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8497121125459671, + "num_tokens": 158846813.0, + "step": 132050 + }, + { + "entropy": 1.8692992970347404, + "epoch": 0.40937444540627843, + "grad_norm": 7.517188549041748, + "learning_rate": 3.953975366879141e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8590711817145348, + "num_tokens": 158858240.0, + "step": 132060 + }, + { + "entropy": 1.8589076712727546, + "epoch": 0.4094054445313281, + "grad_norm": 9.378162384033203, + "learning_rate": 3.953825670500109e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.861855249106884, + "num_tokens": 158870488.0, + "step": 132070 + }, + { + "entropy": 1.9007334470748902, + "epoch": 0.4094364436563778, + "grad_norm": 8.594630241394043, + "learning_rate": 3.953675991122176e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8480967715382576, + "num_tokens": 158882023.0, + "step": 132080 + }, + { + "entropy": 1.9315433949232101, + "epoch": 0.4094674427814275, + "grad_norm": 3.873143196105957, + "learning_rate": 3.953526328742123e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8507410883903503, + "num_tokens": 158893185.0, + "step": 132090 + }, + { + "entropy": 1.9150833919644357, + "epoch": 0.4094984419064772, + "grad_norm": 3.768460273742676, + "learning_rate": 3.953376683356738e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8455236718058586, + "num_tokens": 158904520.0, + "step": 132100 + }, + { + "entropy": 1.8527635991573335, + "epoch": 0.4095294410315269, + "grad_norm": 8.320259094238281, + "learning_rate": 3.953227054962798e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8663365572690964, + "num_tokens": 158917450.0, + "step": 132110 + }, + { + "entropy": 1.8949743598699569, + "epoch": 0.40956044015657656, + "grad_norm": 3.7683377265930176, + "learning_rate": 3.953077443557093e-06, + "loss": 0.5338, + "mean_token_accuracy": 0.8330464109778404, + "num_tokens": 158929795.0, + "step": 132120 + }, + { + "entropy": 1.9238438218832017, + "epoch": 0.4095914392816263, + "grad_norm": 8.981724739074707, + "learning_rate": 3.952927849136406e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8446515381336213, + "num_tokens": 158941331.0, + "step": 132130 + }, + { + "entropy": 1.9227786138653755, + "epoch": 0.40962243840667595, + "grad_norm": 7.715068817138672, + "learning_rate": 3.952778271697524e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8444668635725975, + "num_tokens": 158953028.0, + "step": 132140 + }, + { + "entropy": 1.9023156434297561, + "epoch": 0.4096534375317257, + "grad_norm": 8.244248390197754, + "learning_rate": 3.952628711237235e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.850898091495037, + "num_tokens": 158964269.0, + "step": 132150 + }, + { + "entropy": 1.8805926099419594, + "epoch": 0.40968443665677534, + "grad_norm": 7.6930999755859375, + "learning_rate": 3.952479167752328e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.863546060025692, + "num_tokens": 158976310.0, + "step": 132160 + }, + { + "entropy": 1.7959831669926642, + "epoch": 0.40971543578182507, + "grad_norm": 5.07602596282959, + "learning_rate": 3.952329641239589e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8648563221096992, + "num_tokens": 158989932.0, + "step": 132170 + }, + { + "entropy": 1.9297270089387895, + "epoch": 0.40974643490687473, + "grad_norm": 7.6279096603393555, + "learning_rate": 3.9521801316958105e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8553159907460213, + "num_tokens": 159001128.0, + "step": 132180 + }, + { + "entropy": 1.8945249140262603, + "epoch": 0.40977743403192446, + "grad_norm": 7.788156032562256, + "learning_rate": 3.952030639117782e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8605994209647179, + "num_tokens": 159012619.0, + "step": 132190 + }, + { + "entropy": 1.9229301661252975, + "epoch": 0.40980843315697413, + "grad_norm": 9.636686325073242, + "learning_rate": 3.951881163502295e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8494541838765144, + "num_tokens": 159023540.0, + "step": 132200 + }, + { + "entropy": 1.8150003015995027, + "epoch": 0.40983943228202385, + "grad_norm": 8.98440933227539, + "learning_rate": 3.951731704846143e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8665079712867737, + "num_tokens": 159036264.0, + "step": 132210 + }, + { + "entropy": 1.9748639404773711, + "epoch": 0.4098704314070735, + "grad_norm": 10.900146484375, + "learning_rate": 3.951582263146119e-06, + "loss": 0.5727, + "mean_token_accuracy": 0.8254036799073219, + "num_tokens": 159046762.0, + "step": 132220 + }, + { + "entropy": 1.8873142629861832, + "epoch": 0.40990143053212325, + "grad_norm": 9.89717960357666, + "learning_rate": 3.951432838399017e-06, + "loss": 0.5366, + "mean_token_accuracy": 0.8460609570145607, + "num_tokens": 159058574.0, + "step": 132230 + }, + { + "entropy": 1.8014112293720246, + "epoch": 0.4099324296571729, + "grad_norm": 8.128996849060059, + "learning_rate": 3.95128343060163e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8637454330921173, + "num_tokens": 159072392.0, + "step": 132240 + }, + { + "entropy": 1.8816646337509155, + "epoch": 0.40996342878222264, + "grad_norm": 2.441240072250366, + "learning_rate": 3.9511340397507555e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8551323056221009, + "num_tokens": 159084913.0, + "step": 132250 + }, + { + "entropy": 1.8489706605672835, + "epoch": 0.4099944279072723, + "grad_norm": 4.065392971038818, + "learning_rate": 3.950984665843191e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8606104895472526, + "num_tokens": 159097664.0, + "step": 132260 + }, + { + "entropy": 1.8708397269248962, + "epoch": 0.41002542703232203, + "grad_norm": 4.442748069763184, + "learning_rate": 3.950835308875733e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8600472211837769, + "num_tokens": 159109240.0, + "step": 132270 + }, + { + "entropy": 1.8376126080751418, + "epoch": 0.4100564261573717, + "grad_norm": 7.40627908706665, + "learning_rate": 3.9506859688451805e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8633526057004929, + "num_tokens": 159121773.0, + "step": 132280 + }, + { + "entropy": 1.9250367239117623, + "epoch": 0.4100874252824214, + "grad_norm": 7.536020755767822, + "learning_rate": 3.950536645748332e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8445648729801178, + "num_tokens": 159133238.0, + "step": 132290 + }, + { + "entropy": 1.8859627723693848, + "epoch": 0.4101184244074711, + "grad_norm": 10.275099754333496, + "learning_rate": 3.950387339581987e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8571148261427879, + "num_tokens": 159144881.0, + "step": 132300 + }, + { + "entropy": 1.8043443158268928, + "epoch": 0.4101494235325208, + "grad_norm": 6.609226703643799, + "learning_rate": 3.950238050342948e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8644479662179947, + "num_tokens": 159157919.0, + "step": 132310 + }, + { + "entropy": 1.9193567991256715, + "epoch": 0.4101804226575705, + "grad_norm": 8.553771018981934, + "learning_rate": 3.950088778028016e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.8534865245223046, + "num_tokens": 159169473.0, + "step": 132320 + }, + { + "entropy": 1.9148662850260734, + "epoch": 0.4102114217826202, + "grad_norm": 5.254544258117676, + "learning_rate": 3.949939522633992e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8425288826227189, + "num_tokens": 159181088.0, + "step": 132330 + }, + { + "entropy": 1.8751920118927956, + "epoch": 0.4102424209076699, + "grad_norm": 8.47884750366211, + "learning_rate": 3.9497902841576826e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8611443415284157, + "num_tokens": 159192624.0, + "step": 132340 + }, + { + "entropy": 1.9114878982305528, + "epoch": 0.41027342003271955, + "grad_norm": 8.459266662597656, + "learning_rate": 3.949641062595889e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8466789424419403, + "num_tokens": 159203363.0, + "step": 132350 + }, + { + "entropy": 1.9088921546936035, + "epoch": 0.4103044191577693, + "grad_norm": 7.341428756713867, + "learning_rate": 3.949491857945419e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8548379436135292, + "num_tokens": 159215263.0, + "step": 132360 + }, + { + "entropy": 1.8097696974873543, + "epoch": 0.41033541828281894, + "grad_norm": 8.788670539855957, + "learning_rate": 3.949342670203077e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8639630541205406, + "num_tokens": 159227890.0, + "step": 132370 + }, + { + "entropy": 1.921763353049755, + "epoch": 0.41036641740786867, + "grad_norm": 8.037771224975586, + "learning_rate": 3.94919349936567e-06, + "loss": 0.5278, + "mean_token_accuracy": 0.8411315530538559, + "num_tokens": 159239371.0, + "step": 132380 + }, + { + "entropy": 1.9325927823781968, + "epoch": 0.41039741653291834, + "grad_norm": 9.67178726196289, + "learning_rate": 3.949044345430004e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8497194886207581, + "num_tokens": 159249796.0, + "step": 132390 + }, + { + "entropy": 1.8051844477653503, + "epoch": 0.41042841565796806, + "grad_norm": 5.3989691734313965, + "learning_rate": 3.94889520839289e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8578594639897347, + "num_tokens": 159262693.0, + "step": 132400 + }, + { + "entropy": 1.9161362245678901, + "epoch": 0.41045941478301773, + "grad_norm": 8.300895690917969, + "learning_rate": 3.948746088251138e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.853403514623642, + "num_tokens": 159274712.0, + "step": 132410 + }, + { + "entropy": 1.8416450381278993, + "epoch": 0.41049041390806745, + "grad_norm": 6.615303039550781, + "learning_rate": 3.948596985001556e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8720286816358567, + "num_tokens": 159287211.0, + "step": 132420 + }, + { + "entropy": 1.935446311533451, + "epoch": 0.4105214130331171, + "grad_norm": 8.131443977355957, + "learning_rate": 3.948447898640955e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8410095199942589, + "num_tokens": 159298375.0, + "step": 132430 + }, + { + "entropy": 1.8042462676763535, + "epoch": 0.41055241215816685, + "grad_norm": 6.6756911277771, + "learning_rate": 3.948298829166149e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8577197238802909, + "num_tokens": 159311942.0, + "step": 132440 + }, + { + "entropy": 1.8535395577549933, + "epoch": 0.4105834112832165, + "grad_norm": 8.319269180297852, + "learning_rate": 3.948149776573948e-06, + "loss": 0.5231, + "mean_token_accuracy": 0.841600701212883, + "num_tokens": 159325565.0, + "step": 132450 + }, + { + "entropy": 1.853706520795822, + "epoch": 0.41061441040826624, + "grad_norm": 7.631629467010498, + "learning_rate": 3.948000740861168e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8647748947143554, + "num_tokens": 159337661.0, + "step": 132460 + }, + { + "entropy": 1.8631751976907254, + "epoch": 0.4106454095333159, + "grad_norm": 3.5813517570495605, + "learning_rate": 3.947851722024622e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8632732674479484, + "num_tokens": 159349920.0, + "step": 132470 + }, + { + "entropy": 1.9363053843379021, + "epoch": 0.41067640865836563, + "grad_norm": 7.310238361358643, + "learning_rate": 3.947702720061125e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8465033307671547, + "num_tokens": 159361594.0, + "step": 132480 + }, + { + "entropy": 1.8684279143810272, + "epoch": 0.4107074077834153, + "grad_norm": 9.23641300201416, + "learning_rate": 3.947553734967494e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8531864553689956, + "num_tokens": 159373909.0, + "step": 132490 + }, + { + "entropy": 1.9086994245648383, + "epoch": 0.410738406908465, + "grad_norm": 9.247127532958984, + "learning_rate": 3.947404766740546e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8483163312077522, + "num_tokens": 159386422.0, + "step": 132500 + }, + { + "entropy": 1.9499953478574752, + "epoch": 0.4107694060335147, + "grad_norm": 7.84163236618042, + "learning_rate": 3.947255815377098e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8528471052646637, + "num_tokens": 159397202.0, + "step": 132510 + }, + { + "entropy": 1.8840179577469827, + "epoch": 0.4108004051585644, + "grad_norm": 4.282788276672363, + "learning_rate": 3.947106880873969e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8478341817855835, + "num_tokens": 159410046.0, + "step": 132520 + }, + { + "entropy": 1.7982640989124774, + "epoch": 0.4108314042836141, + "grad_norm": 4.207269191741943, + "learning_rate": 3.946957963227978e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8666106954216957, + "num_tokens": 159422809.0, + "step": 132530 + }, + { + "entropy": 1.8728843182325363, + "epoch": 0.4108624034086638, + "grad_norm": 4.20776891708374, + "learning_rate": 3.946809062435946e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8539015829563141, + "num_tokens": 159435471.0, + "step": 132540 + }, + { + "entropy": 1.8800769940018653, + "epoch": 0.4108934025337135, + "grad_norm": 8.50875473022461, + "learning_rate": 3.9466601784946935e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8496467992663383, + "num_tokens": 159448258.0, + "step": 132550 + }, + { + "entropy": 1.824411703646183, + "epoch": 0.4109244016587632, + "grad_norm": 7.829589366912842, + "learning_rate": 3.946511311401043e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8696093156933784, + "num_tokens": 159460857.0, + "step": 132560 + }, + { + "entropy": 1.8826001703739166, + "epoch": 0.4109554007838129, + "grad_norm": 8.163921356201172, + "learning_rate": 3.946362461151816e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8533704668283463, + "num_tokens": 159473481.0, + "step": 132570 + }, + { + "entropy": 1.841157278418541, + "epoch": 0.4109863999088626, + "grad_norm": 8.525252342224121, + "learning_rate": 3.946213627743839e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8639968425035477, + "num_tokens": 159485282.0, + "step": 132580 + }, + { + "entropy": 1.903964453935623, + "epoch": 0.41101739903391227, + "grad_norm": 8.637951850891113, + "learning_rate": 3.9460648111739346e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8393364146351814, + "num_tokens": 159496784.0, + "step": 132590 + }, + { + "entropy": 1.8756361320614814, + "epoch": 0.41104839815896194, + "grad_norm": 9.585840225219727, + "learning_rate": 3.945916011438926e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8499387681484223, + "num_tokens": 159508357.0, + "step": 132600 + }, + { + "entropy": 1.8971466958522796, + "epoch": 0.41107939728401166, + "grad_norm": 7.960050106048584, + "learning_rate": 3.945767228535644e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8593509659171105, + "num_tokens": 159519968.0, + "step": 132610 + }, + { + "entropy": 1.928882573544979, + "epoch": 0.41111039640906133, + "grad_norm": 7.798891544342041, + "learning_rate": 3.9456184624609115e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8493491113185883, + "num_tokens": 159531286.0, + "step": 132620 + }, + { + "entropy": 1.9064328223466873, + "epoch": 0.41114139553411105, + "grad_norm": 8.213712692260742, + "learning_rate": 3.945469713211559e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.844736622273922, + "num_tokens": 159543308.0, + "step": 132630 + }, + { + "entropy": 1.898288056254387, + "epoch": 0.4111723946591607, + "grad_norm": 7.8646039962768555, + "learning_rate": 3.945320980784413e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.8415489286184311, + "num_tokens": 159555394.0, + "step": 132640 + }, + { + "entropy": 1.8438436210155487, + "epoch": 0.41120339378421045, + "grad_norm": 8.93295955657959, + "learning_rate": 3.945172265176305e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8616420149803161, + "num_tokens": 159568585.0, + "step": 132650 + }, + { + "entropy": 1.909870770573616, + "epoch": 0.4112343929092601, + "grad_norm": 3.4213240146636963, + "learning_rate": 3.945023566384064e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.856622938811779, + "num_tokens": 159580433.0, + "step": 132660 + }, + { + "entropy": 1.9142403885722161, + "epoch": 0.41126539203430984, + "grad_norm": 4.372912883758545, + "learning_rate": 3.944874884404522e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8533108174800873, + "num_tokens": 159592231.0, + "step": 132670 + }, + { + "entropy": 1.9566717997193337, + "epoch": 0.4112963911593595, + "grad_norm": 7.756010055541992, + "learning_rate": 3.944726219234511e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8435320198535919, + "num_tokens": 159603617.0, + "step": 132680 + }, + { + "entropy": 1.7626475676894189, + "epoch": 0.41132739028440923, + "grad_norm": 9.682427406311035, + "learning_rate": 3.944577570870863e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8564101874828338, + "num_tokens": 159617560.0, + "step": 132690 + }, + { + "entropy": 1.9389828056097032, + "epoch": 0.4113583894094589, + "grad_norm": 7.8840179443359375, + "learning_rate": 3.944428939310412e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8546505197882652, + "num_tokens": 159628901.0, + "step": 132700 + }, + { + "entropy": 1.7881031468510629, + "epoch": 0.4113893885345086, + "grad_norm": 3.9923946857452393, + "learning_rate": 3.944280324549993e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8698825523257255, + "num_tokens": 159642790.0, + "step": 132710 + }, + { + "entropy": 1.9210022300481797, + "epoch": 0.4114203876595583, + "grad_norm": 6.981124401092529, + "learning_rate": 3.944131726586441e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.842510312795639, + "num_tokens": 159654565.0, + "step": 132720 + }, + { + "entropy": 1.7577592477202415, + "epoch": 0.411451386784608, + "grad_norm": 7.697649002075195, + "learning_rate": 3.943983145416592e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8635148763656616, + "num_tokens": 159668041.0, + "step": 132730 + }, + { + "entropy": 1.9331092268228531, + "epoch": 0.4114823859096577, + "grad_norm": 7.486753463745117, + "learning_rate": 3.943834581037284e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8566341996192932, + "num_tokens": 159679025.0, + "step": 132740 + }, + { + "entropy": 1.8456541523337364, + "epoch": 0.4115133850347074, + "grad_norm": 8.025050163269043, + "learning_rate": 3.9436860334453535e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8562669888138771, + "num_tokens": 159691580.0, + "step": 132750 + }, + { + "entropy": 1.8954646542668343, + "epoch": 0.4115443841597571, + "grad_norm": 8.532896995544434, + "learning_rate": 3.94353750263764e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8576274499297142, + "num_tokens": 159702748.0, + "step": 132760 + }, + { + "entropy": 1.8293071150779725, + "epoch": 0.4115753832848068, + "grad_norm": 3.7488701343536377, + "learning_rate": 3.943388988610982e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8620394229888916, + "num_tokens": 159715325.0, + "step": 132770 + }, + { + "entropy": 1.8163642302155494, + "epoch": 0.4116063824098565, + "grad_norm": 11.977995872497559, + "learning_rate": 3.943240491362222e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8522105798125267, + "num_tokens": 159728453.0, + "step": 132780 + }, + { + "entropy": 1.8184501871466636, + "epoch": 0.4116373815349062, + "grad_norm": 7.925955772399902, + "learning_rate": 3.9430920108882e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8594678580760956, + "num_tokens": 159741671.0, + "step": 132790 + }, + { + "entropy": 1.8316495031118394, + "epoch": 0.41166838065995587, + "grad_norm": 8.928302764892578, + "learning_rate": 3.942943547185757e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.86448425501585, + "num_tokens": 159754290.0, + "step": 132800 + }, + { + "entropy": 1.8046483889222145, + "epoch": 0.4116993797850056, + "grad_norm": 3.8450210094451904, + "learning_rate": 3.942795100251737e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8553688034415246, + "num_tokens": 159766765.0, + "step": 132810 + }, + { + "entropy": 1.888925837725401, + "epoch": 0.41173037891005526, + "grad_norm": 9.118193626403809, + "learning_rate": 3.942646670082983e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8464773207902908, + "num_tokens": 159779056.0, + "step": 132820 + }, + { + "entropy": 1.900699371099472, + "epoch": 0.411761378035105, + "grad_norm": 7.870625972747803, + "learning_rate": 3.942498256676342e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8564991891384125, + "num_tokens": 159790237.0, + "step": 132830 + }, + { + "entropy": 1.9068083673715592, + "epoch": 0.41179237716015465, + "grad_norm": 9.596370697021484, + "learning_rate": 3.942349860028655e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8504379868507386, + "num_tokens": 159802324.0, + "step": 132840 + }, + { + "entropy": 1.95052922219038, + "epoch": 0.4118233762852043, + "grad_norm": 7.9511494636535645, + "learning_rate": 3.942201480136772e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8542814716696739, + "num_tokens": 159813411.0, + "step": 132850 + }, + { + "entropy": 1.9478548653423786, + "epoch": 0.41185437541025405, + "grad_norm": 7.628299713134766, + "learning_rate": 3.942053116997537e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8497453570365906, + "num_tokens": 159824950.0, + "step": 132860 + }, + { + "entropy": 1.9837032228708267, + "epoch": 0.4118853745353037, + "grad_norm": 8.014751434326172, + "learning_rate": 3.941904770607801e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.8386216104030609, + "num_tokens": 159836448.0, + "step": 132870 + }, + { + "entropy": 1.881002813577652, + "epoch": 0.41191637366035344, + "grad_norm": 7.482250213623047, + "learning_rate": 3.94175644096441e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8465662628412247, + "num_tokens": 159847845.0, + "step": 132880 + }, + { + "entropy": 1.8887635886669158, + "epoch": 0.4119473727854031, + "grad_norm": 3.9271512031555176, + "learning_rate": 3.941608128064214e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8607404991984368, + "num_tokens": 159860022.0, + "step": 132890 + }, + { + "entropy": 1.9076422840356826, + "epoch": 0.41197837191045283, + "grad_norm": 4.137758255004883, + "learning_rate": 3.9414598319040645e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8553347662091255, + "num_tokens": 159872269.0, + "step": 132900 + }, + { + "entropy": 1.8967055141925813, + "epoch": 0.4120093710355025, + "grad_norm": 6.800586700439453, + "learning_rate": 3.941311552480813e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8462317153811455, + "num_tokens": 159884036.0, + "step": 132910 + }, + { + "entropy": 1.9517858356237412, + "epoch": 0.4120403701605522, + "grad_norm": 8.87108039855957, + "learning_rate": 3.941163289791309e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8600587204098702, + "num_tokens": 159896130.0, + "step": 132920 + }, + { + "entropy": 1.9215604767203331, + "epoch": 0.4120713692856019, + "grad_norm": 8.849936485290527, + "learning_rate": 3.941015043832408e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8451809346675873, + "num_tokens": 159908040.0, + "step": 132930 + }, + { + "entropy": 1.9565720595419407, + "epoch": 0.4121023684106516, + "grad_norm": 7.92255973815918, + "learning_rate": 3.940866814600961e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8439367547631264, + "num_tokens": 159919326.0, + "step": 132940 + }, + { + "entropy": 1.9593399614095688, + "epoch": 0.4121333675357013, + "grad_norm": 7.588318824768066, + "learning_rate": 3.940718602093826e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.850217179954052, + "num_tokens": 159930913.0, + "step": 132950 + }, + { + "entropy": 1.9544413655996322, + "epoch": 0.412164366660751, + "grad_norm": 8.658806800842285, + "learning_rate": 3.9405704063078546e-06, + "loss": 0.5432, + "mean_token_accuracy": 0.841646321117878, + "num_tokens": 159942302.0, + "step": 132960 + }, + { + "entropy": 1.9482817143201827, + "epoch": 0.4121953657858007, + "grad_norm": 9.174203872680664, + "learning_rate": 3.940422227239905e-06, + "loss": 0.5234, + "mean_token_accuracy": 0.8407966732978821, + "num_tokens": 159953706.0, + "step": 132970 + }, + { + "entropy": 1.8826787516474723, + "epoch": 0.4122263649108504, + "grad_norm": 10.95289421081543, + "learning_rate": 3.9402740648868335e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.84808798879385, + "num_tokens": 159965894.0, + "step": 132980 + }, + { + "entropy": 1.8592276245355606, + "epoch": 0.4122573640359001, + "grad_norm": 9.670906066894531, + "learning_rate": 3.940125919245498e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8619933322072029, + "num_tokens": 159978522.0, + "step": 132990 + }, + { + "entropy": 1.9317376971244813, + "epoch": 0.4122883631609498, + "grad_norm": 7.894320011138916, + "learning_rate": 3.939977790312759e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8445814654231072, + "num_tokens": 159989620.0, + "step": 133000 + }, + { + "entropy": 2.002483421564102, + "epoch": 0.41231936228599947, + "grad_norm": 9.11700439453125, + "learning_rate": 3.939829678085473e-06, + "loss": 0.5269, + "mean_token_accuracy": 0.8350201919674873, + "num_tokens": 160001293.0, + "step": 133010 + }, + { + "entropy": 1.914002077281475, + "epoch": 0.4123503614110492, + "grad_norm": 8.952300071716309, + "learning_rate": 3.939681582560501e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8454202443361283, + "num_tokens": 160013020.0, + "step": 133020 + }, + { + "entropy": 1.9585654482245445, + "epoch": 0.41238136053609886, + "grad_norm": 7.388771057128906, + "learning_rate": 3.939533503734705e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8444287464022636, + "num_tokens": 160024299.0, + "step": 133030 + }, + { + "entropy": 1.880546286702156, + "epoch": 0.4124123596611486, + "grad_norm": 3.908165454864502, + "learning_rate": 3.939385441604947e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.859189136326313, + "num_tokens": 160036416.0, + "step": 133040 + }, + { + "entropy": 1.8767894744873046, + "epoch": 0.41244335878619826, + "grad_norm": 7.4038472175598145, + "learning_rate": 3.939237396168088e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8633315816521645, + "num_tokens": 160048717.0, + "step": 133050 + }, + { + "entropy": 1.9026233911514283, + "epoch": 0.412474357911248, + "grad_norm": 8.312832832336426, + "learning_rate": 3.939089367420993e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.852534967660904, + "num_tokens": 160060661.0, + "step": 133060 + }, + { + "entropy": 1.8906838029623032, + "epoch": 0.41250535703629765, + "grad_norm": 7.901620864868164, + "learning_rate": 3.938941355360527e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8649371221661568, + "num_tokens": 160073055.0, + "step": 133070 + }, + { + "entropy": 1.9295494481921196, + "epoch": 0.4125363561613474, + "grad_norm": 7.450974941253662, + "learning_rate": 3.938793359983554e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8515502735972404, + "num_tokens": 160084739.0, + "step": 133080 + }, + { + "entropy": 1.8649974435567855, + "epoch": 0.41256735528639704, + "grad_norm": 3.9373440742492676, + "learning_rate": 3.93864538128694e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8602189511060715, + "num_tokens": 160097401.0, + "step": 133090 + }, + { + "entropy": 1.88185980245471, + "epoch": 0.4125983544114467, + "grad_norm": 8.116096496582031, + "learning_rate": 3.938497419267553e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8672322928905487, + "num_tokens": 160109518.0, + "step": 133100 + }, + { + "entropy": 1.9526149734854699, + "epoch": 0.41262935353649643, + "grad_norm": 10.256512641906738, + "learning_rate": 3.938349473922259e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8465583026409149, + "num_tokens": 160120718.0, + "step": 133110 + }, + { + "entropy": 1.8862829342484475, + "epoch": 0.4126603526615461, + "grad_norm": 7.902219295501709, + "learning_rate": 3.938201545247929e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8557077690958976, + "num_tokens": 160133234.0, + "step": 133120 + }, + { + "entropy": 1.9045727148652076, + "epoch": 0.41269135178659583, + "grad_norm": 8.710145950317383, + "learning_rate": 3.93805363324143e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8461684197187423, + "num_tokens": 160145332.0, + "step": 133130 + }, + { + "entropy": 1.8291308224201202, + "epoch": 0.4127223509116455, + "grad_norm": 9.70020866394043, + "learning_rate": 3.937905737899633e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8510018140077591, + "num_tokens": 160158147.0, + "step": 133140 + }, + { + "entropy": 1.8897850081324576, + "epoch": 0.4127533500366952, + "grad_norm": 8.668664932250977, + "learning_rate": 3.937757859219409e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8535225465893745, + "num_tokens": 160169771.0, + "step": 133150 + }, + { + "entropy": 1.9326857790350913, + "epoch": 0.4127843491617449, + "grad_norm": 10.146557807922363, + "learning_rate": 3.93760999719763e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8501642525196076, + "num_tokens": 160180380.0, + "step": 133160 + }, + { + "entropy": 1.8924714922904968, + "epoch": 0.4128153482867946, + "grad_norm": 8.729730606079102, + "learning_rate": 3.937462151831168e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8599536821246148, + "num_tokens": 160192793.0, + "step": 133170 + }, + { + "entropy": 1.8484364092350005, + "epoch": 0.4128463474118443, + "grad_norm": 7.995754718780518, + "learning_rate": 3.937314323116897e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8631056323647499, + "num_tokens": 160204883.0, + "step": 133180 + }, + { + "entropy": 1.8777808651328087, + "epoch": 0.412877346536894, + "grad_norm": 9.567119598388672, + "learning_rate": 3.937166511051691e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8523745030164719, + "num_tokens": 160217041.0, + "step": 133190 + }, + { + "entropy": 1.8994059637188911, + "epoch": 0.4129083456619437, + "grad_norm": 7.305065155029297, + "learning_rate": 3.937018715632426e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8568759083747863, + "num_tokens": 160229204.0, + "step": 133200 + }, + { + "entropy": 1.8381987407803535, + "epoch": 0.4129393447869934, + "grad_norm": 7.50701379776001, + "learning_rate": 3.936870936855977e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8586773455142975, + "num_tokens": 160242271.0, + "step": 133210 + }, + { + "entropy": 1.9254462212324142, + "epoch": 0.41297034391204307, + "grad_norm": 9.528829574584961, + "learning_rate": 3.93672317471922e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8490356847643852, + "num_tokens": 160253561.0, + "step": 133220 + }, + { + "entropy": 1.9055677652359009, + "epoch": 0.4130013430370928, + "grad_norm": 7.700775146484375, + "learning_rate": 3.9365754292190345e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8636551991105079, + "num_tokens": 160264851.0, + "step": 133230 + }, + { + "entropy": 1.9295484229922295, + "epoch": 0.41303234216214246, + "grad_norm": 8.420736312866211, + "learning_rate": 3.936427700352297e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8591367319226265, + "num_tokens": 160276017.0, + "step": 133240 + }, + { + "entropy": 1.9808582752943038, + "epoch": 0.4130633412871922, + "grad_norm": 6.451159954071045, + "learning_rate": 3.936279988115888e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8498171672224999, + "num_tokens": 160286922.0, + "step": 133250 + }, + { + "entropy": 1.8775882482528687, + "epoch": 0.41309434041224186, + "grad_norm": 8.08848762512207, + "learning_rate": 3.936132292506687e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8411854594945908, + "num_tokens": 160299889.0, + "step": 133260 + }, + { + "entropy": 1.9185838371515274, + "epoch": 0.4131253395372916, + "grad_norm": 8.169017791748047, + "learning_rate": 3.935984613521574e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8455637603998184, + "num_tokens": 160311587.0, + "step": 133270 + }, + { + "entropy": 1.927768488228321, + "epoch": 0.41315633866234125, + "grad_norm": 8.6692476272583, + "learning_rate": 3.9358369511574325e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8541040673851967, + "num_tokens": 160322859.0, + "step": 133280 + }, + { + "entropy": 1.980207970738411, + "epoch": 0.413187337787391, + "grad_norm": 9.063236236572266, + "learning_rate": 3.935689305411144e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.8438897222280503, + "num_tokens": 160333949.0, + "step": 133290 + }, + { + "entropy": 1.8348941326141357, + "epoch": 0.41321833691244064, + "grad_norm": 11.424354553222656, + "learning_rate": 3.935541676279592e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.857143734395504, + "num_tokens": 160346373.0, + "step": 133300 + }, + { + "entropy": 1.9375634253025056, + "epoch": 0.41324933603749037, + "grad_norm": 7.8653435707092285, + "learning_rate": 3.93539406375966e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8492831483483314, + "num_tokens": 160357451.0, + "step": 133310 + }, + { + "entropy": 1.8438090533018112, + "epoch": 0.41328033516254004, + "grad_norm": 8.466445922851562, + "learning_rate": 3.9352464678482325e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8501735106110573, + "num_tokens": 160370838.0, + "step": 133320 + }, + { + "entropy": 1.9386936947703362, + "epoch": 0.41331133428758976, + "grad_norm": 8.103391647338867, + "learning_rate": 3.935098888542198e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8613961502909661, + "num_tokens": 160382175.0, + "step": 133330 + }, + { + "entropy": 1.8318694144487382, + "epoch": 0.41334233341263943, + "grad_norm": 8.630756378173828, + "learning_rate": 3.934951325838439e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8484767213463783, + "num_tokens": 160395178.0, + "step": 133340 + }, + { + "entropy": 1.9260124281048774, + "epoch": 0.4133733325376891, + "grad_norm": 7.492244720458984, + "learning_rate": 3.934803779733846e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8476378932595253, + "num_tokens": 160407014.0, + "step": 133350 + }, + { + "entropy": 1.8849865958094596, + "epoch": 0.4134043316627388, + "grad_norm": 3.519080400466919, + "learning_rate": 3.934656250225307e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8558978796005249, + "num_tokens": 160419398.0, + "step": 133360 + }, + { + "entropy": 1.8688034132122993, + "epoch": 0.4134353307877885, + "grad_norm": 6.8715033531188965, + "learning_rate": 3.934508737309709e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8502852901816368, + "num_tokens": 160431982.0, + "step": 133370 + }, + { + "entropy": 1.8360135570168494, + "epoch": 0.4134663299128382, + "grad_norm": 6.848766326904297, + "learning_rate": 3.934361240983944e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8653775498270988, + "num_tokens": 160444385.0, + "step": 133380 + }, + { + "entropy": 1.9299522519111634, + "epoch": 0.4134973290378879, + "grad_norm": 4.2813496589660645, + "learning_rate": 3.934213761244901e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8370076134800911, + "num_tokens": 160456540.0, + "step": 133390 + }, + { + "entropy": 1.8286241069436073, + "epoch": 0.4135283281629376, + "grad_norm": 8.64360237121582, + "learning_rate": 3.934066298089472e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8586912482976914, + "num_tokens": 160469565.0, + "step": 133400 + }, + { + "entropy": 1.9575915843248368, + "epoch": 0.4135593272879873, + "grad_norm": 5.0459370613098145, + "learning_rate": 3.93391885151455e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8582988694310189, + "num_tokens": 160480566.0, + "step": 133410 + }, + { + "entropy": 1.9278161972761154, + "epoch": 0.413590326413037, + "grad_norm": 7.705987930297852, + "learning_rate": 3.933771421517027e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8494553253054619, + "num_tokens": 160492405.0, + "step": 133420 + }, + { + "entropy": 1.9024669751524925, + "epoch": 0.41362132553808667, + "grad_norm": 7.893537998199463, + "learning_rate": 3.9336240080937985e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8514198035001754, + "num_tokens": 160504029.0, + "step": 133430 + }, + { + "entropy": 1.830211453139782, + "epoch": 0.4136523246631364, + "grad_norm": 5.1870222091674805, + "learning_rate": 3.933476611241757e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.85890783816576, + "num_tokens": 160516627.0, + "step": 133440 + }, + { + "entropy": 1.8257996559143066, + "epoch": 0.41368332378818606, + "grad_norm": 3.705263376235962, + "learning_rate": 3.9333292309578e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8598002254962921, + "num_tokens": 160530211.0, + "step": 133450 + }, + { + "entropy": 1.9126117467880248, + "epoch": 0.4137143229132358, + "grad_norm": 4.153369903564453, + "learning_rate": 3.933181867238822e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8488003075122833, + "num_tokens": 160541810.0, + "step": 133460 + }, + { + "entropy": 1.9465106219053268, + "epoch": 0.41374532203828546, + "grad_norm": 8.59892463684082, + "learning_rate": 3.933034520081723e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8590655192732811, + "num_tokens": 160552736.0, + "step": 133470 + }, + { + "entropy": 1.9508576810359954, + "epoch": 0.4137763211633352, + "grad_norm": 8.499300003051758, + "learning_rate": 3.9328871894833975e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8416604056954384, + "num_tokens": 160564088.0, + "step": 133480 + }, + { + "entropy": 1.9252055808901787, + "epoch": 0.41380732028838485, + "grad_norm": 8.029823303222656, + "learning_rate": 3.932739875440747e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8455364406108856, + "num_tokens": 160575490.0, + "step": 133490 + }, + { + "entropy": 1.8071866802871228, + "epoch": 0.4138383194134346, + "grad_norm": 8.960620880126953, + "learning_rate": 3.93259257795067e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8656040906906128, + "num_tokens": 160588680.0, + "step": 133500 + }, + { + "entropy": 1.877330508828163, + "epoch": 0.41386931853848424, + "grad_norm": 8.935325622558594, + "learning_rate": 3.932445297010065e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8558456063270569, + "num_tokens": 160600946.0, + "step": 133510 + }, + { + "entropy": 1.9224560901522636, + "epoch": 0.41390031766353397, + "grad_norm": 7.983498573303223, + "learning_rate": 3.932298032615838e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8463954851031303, + "num_tokens": 160612788.0, + "step": 133520 + }, + { + "entropy": 1.8346376448869706, + "epoch": 0.41393131678858364, + "grad_norm": 8.071921348571777, + "learning_rate": 3.932150784764887e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8672112032771111, + "num_tokens": 160625526.0, + "step": 133530 + }, + { + "entropy": 1.931402738392353, + "epoch": 0.41396231591363336, + "grad_norm": 3.571261405944824, + "learning_rate": 3.932003553454117e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8496660724282264, + "num_tokens": 160637476.0, + "step": 133540 + }, + { + "entropy": 1.9502887561917306, + "epoch": 0.41399331503868303, + "grad_norm": 7.512312412261963, + "learning_rate": 3.93185633868043e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8570262104272842, + "num_tokens": 160648694.0, + "step": 133550 + }, + { + "entropy": 1.9070443853735923, + "epoch": 0.41402431416373275, + "grad_norm": 8.191246032714844, + "learning_rate": 3.931709140440732e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8478936731815339, + "num_tokens": 160660627.0, + "step": 133560 + }, + { + "entropy": 1.9022893160581589, + "epoch": 0.4140553132887824, + "grad_norm": 7.333972454071045, + "learning_rate": 3.931561958731927e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8530973836779594, + "num_tokens": 160672648.0, + "step": 133570 + }, + { + "entropy": 1.8672500133514405, + "epoch": 0.41408631241383215, + "grad_norm": 7.397851467132568, + "learning_rate": 3.931414793550921e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8604272216558456, + "num_tokens": 160685095.0, + "step": 133580 + }, + { + "entropy": 1.911836712062359, + "epoch": 0.4141173115388818, + "grad_norm": 9.264906883239746, + "learning_rate": 3.9312676448946225e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8507508277893067, + "num_tokens": 160697135.0, + "step": 133590 + }, + { + "entropy": 1.892936834692955, + "epoch": 0.4141483106639315, + "grad_norm": 8.400348663330078, + "learning_rate": 3.931120512759939e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8533075451850891, + "num_tokens": 160708171.0, + "step": 133600 + }, + { + "entropy": 1.9460464030504228, + "epoch": 0.4141793097889812, + "grad_norm": 7.992114067077637, + "learning_rate": 3.930973397143777e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8554316237568855, + "num_tokens": 160719263.0, + "step": 133610 + }, + { + "entropy": 1.8571830540895462, + "epoch": 0.4142103089140309, + "grad_norm": 9.004242897033691, + "learning_rate": 3.930826298043048e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.868439619243145, + "num_tokens": 160731482.0, + "step": 133620 + }, + { + "entropy": 1.9615523904561996, + "epoch": 0.4142413080390806, + "grad_norm": 7.82743501663208, + "learning_rate": 3.930679215454661e-06, + "loss": 0.5313, + "mean_token_accuracy": 0.8472507506608963, + "num_tokens": 160741961.0, + "step": 133630 + }, + { + "entropy": 1.8198735401034356, + "epoch": 0.41427230716413027, + "grad_norm": 5.923557281494141, + "learning_rate": 3.9305321493755265e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8585488334298134, + "num_tokens": 160755417.0, + "step": 133640 + }, + { + "entropy": 1.8547986596822739, + "epoch": 0.41430330628918, + "grad_norm": 9.227656364440918, + "learning_rate": 3.9303850998025586e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8587556973099708, + "num_tokens": 160767718.0, + "step": 133650 + }, + { + "entropy": 1.8604234397411346, + "epoch": 0.41433430541422966, + "grad_norm": 7.426607131958008, + "learning_rate": 3.930238066732667e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8567753449082375, + "num_tokens": 160780130.0, + "step": 133660 + }, + { + "entropy": 1.9464622527360915, + "epoch": 0.4143653045392794, + "grad_norm": 8.832720756530762, + "learning_rate": 3.930091050162768e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8490612745285034, + "num_tokens": 160790549.0, + "step": 133670 + }, + { + "entropy": 1.738626065850258, + "epoch": 0.41439630366432906, + "grad_norm": 8.568602561950684, + "learning_rate": 3.9299440500897725e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8719424903392792, + "num_tokens": 160804288.0, + "step": 133680 + }, + { + "entropy": 1.8175345674157142, + "epoch": 0.4144273027893788, + "grad_norm": 3.655850410461426, + "learning_rate": 3.929797066510598e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8615160465240479, + "num_tokens": 160816494.0, + "step": 133690 + }, + { + "entropy": 1.9714103803038596, + "epoch": 0.41445830191442845, + "grad_norm": 3.3211865425109863, + "learning_rate": 3.92965009942216e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8397148564457894, + "num_tokens": 160827569.0, + "step": 133700 + }, + { + "entropy": 1.8605849146842957, + "epoch": 0.4144893010394782, + "grad_norm": 4.2350921630859375, + "learning_rate": 3.929503148821374e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8473623186349869, + "num_tokens": 160840031.0, + "step": 133710 + }, + { + "entropy": 1.8704538196325302, + "epoch": 0.41452030016452784, + "grad_norm": 4.739353656768799, + "learning_rate": 3.929356214705158e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8485003530979156, + "num_tokens": 160851772.0, + "step": 133720 + }, + { + "entropy": 1.8266652062535287, + "epoch": 0.41455129928957757, + "grad_norm": 8.387866020202637, + "learning_rate": 3.92920929707043e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8575938984751701, + "num_tokens": 160864401.0, + "step": 133730 + }, + { + "entropy": 1.8190097853541374, + "epoch": 0.41458229841462724, + "grad_norm": 2.620790719985962, + "learning_rate": 3.92906239591411e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8680200964212418, + "num_tokens": 160877175.0, + "step": 133740 + }, + { + "entropy": 1.9274157121777535, + "epoch": 0.41461329753967696, + "grad_norm": 8.17121696472168, + "learning_rate": 3.928915511233117e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8486859872937202, + "num_tokens": 160889187.0, + "step": 133750 + }, + { + "entropy": 1.921906155347824, + "epoch": 0.41464429666472663, + "grad_norm": 8.626646041870117, + "learning_rate": 3.928768643024372e-06, + "loss": 0.5145, + "mean_token_accuracy": 0.8409358784556389, + "num_tokens": 160899766.0, + "step": 133760 + }, + { + "entropy": 1.8961732387542725, + "epoch": 0.41467529578977635, + "grad_norm": 7.050107955932617, + "learning_rate": 3.928621791284796e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8544517681002617, + "num_tokens": 160910866.0, + "step": 133770 + }, + { + "entropy": 1.9284267202019691, + "epoch": 0.414706294914826, + "grad_norm": 4.148862361907959, + "learning_rate": 3.928474956011312e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8424947530031204, + "num_tokens": 160922592.0, + "step": 133780 + }, + { + "entropy": 1.881701409816742, + "epoch": 0.41473729403987575, + "grad_norm": 6.7715229988098145, + "learning_rate": 3.928328137200842e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8563426598906517, + "num_tokens": 160934482.0, + "step": 133790 + }, + { + "entropy": 1.7748121902346612, + "epoch": 0.4147682931649254, + "grad_norm": 9.0849609375, + "learning_rate": 3.92818133485031e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.870860530436039, + "num_tokens": 160947645.0, + "step": 133800 + }, + { + "entropy": 1.9011081397533416, + "epoch": 0.41479929228997514, + "grad_norm": 8.376455307006836, + "learning_rate": 3.928034548956642e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8554087609052659, + "num_tokens": 160959336.0, + "step": 133810 + }, + { + "entropy": 1.937306745350361, + "epoch": 0.4148302914150248, + "grad_norm": 9.280498504638672, + "learning_rate": 3.927887779516763e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8521208077669143, + "num_tokens": 160970927.0, + "step": 133820 + }, + { + "entropy": 1.8931998923420905, + "epoch": 0.4148612905400745, + "grad_norm": 4.04230260848999, + "learning_rate": 3.927741026527598e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8493711724877357, + "num_tokens": 160982582.0, + "step": 133830 + }, + { + "entropy": 1.862531739473343, + "epoch": 0.4148922896651242, + "grad_norm": 8.157683372497559, + "learning_rate": 3.927594289986076e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.849953505396843, + "num_tokens": 160994545.0, + "step": 133840 + }, + { + "entropy": 1.8835390016436577, + "epoch": 0.41492328879017387, + "grad_norm": 8.619124412536621, + "learning_rate": 3.927447569889122e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8389230579137802, + "num_tokens": 161007106.0, + "step": 133850 + }, + { + "entropy": 1.8973518058657646, + "epoch": 0.4149542879152236, + "grad_norm": 7.793274402618408, + "learning_rate": 3.9273008662336685e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.8410894647240639, + "num_tokens": 161018629.0, + "step": 133860 + }, + { + "entropy": 1.9573092699050902, + "epoch": 0.41498528704027327, + "grad_norm": 7.558224678039551, + "learning_rate": 3.927154179016643e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8450547724962234, + "num_tokens": 161029814.0, + "step": 133870 + }, + { + "entropy": 1.8571864694356919, + "epoch": 0.415016286165323, + "grad_norm": 8.728129386901855, + "learning_rate": 3.927007508234975e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8491732507944107, + "num_tokens": 161041934.0, + "step": 133880 + }, + { + "entropy": 1.877596764266491, + "epoch": 0.41504728529037266, + "grad_norm": 4.276613235473633, + "learning_rate": 3.9268608538855965e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8503337040543556, + "num_tokens": 161053951.0, + "step": 133890 + }, + { + "entropy": 1.8341388761997224, + "epoch": 0.4150782844154224, + "grad_norm": 4.777545928955078, + "learning_rate": 3.92671421596544e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8406883478164673, + "num_tokens": 161067201.0, + "step": 133900 + }, + { + "entropy": 1.8835119605064392, + "epoch": 0.41510928354047205, + "grad_norm": 7.586372375488281, + "learning_rate": 3.926567594471437e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8480971708893776, + "num_tokens": 161079821.0, + "step": 133910 + }, + { + "entropy": 1.817841324210167, + "epoch": 0.4151402826655218, + "grad_norm": 8.434521675109863, + "learning_rate": 3.926420989400522e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8662325829267502, + "num_tokens": 161092583.0, + "step": 133920 + }, + { + "entropy": 1.9291996493935586, + "epoch": 0.41517128179057144, + "grad_norm": 8.39786434173584, + "learning_rate": 3.926274400749629e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8544291496276856, + "num_tokens": 161103847.0, + "step": 133930 + }, + { + "entropy": 1.8668764278292656, + "epoch": 0.41520228091562117, + "grad_norm": 3.983625650405884, + "learning_rate": 3.926127828515693e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8636351123452186, + "num_tokens": 161115696.0, + "step": 133940 + }, + { + "entropy": 1.8500849679112434, + "epoch": 0.41523328004067084, + "grad_norm": 11.573062896728516, + "learning_rate": 3.9259812726956495e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8647431343793869, + "num_tokens": 161127966.0, + "step": 133950 + }, + { + "entropy": 1.8055422961711884, + "epoch": 0.41526427916572056, + "grad_norm": 2.1831018924713135, + "learning_rate": 3.925834733286436e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8400851160287857, + "num_tokens": 161141745.0, + "step": 133960 + }, + { + "entropy": 1.8568698644638062, + "epoch": 0.41529527829077023, + "grad_norm": 7.404766082763672, + "learning_rate": 3.92568821028499e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8579725489020348, + "num_tokens": 161153901.0, + "step": 133970 + }, + { + "entropy": 1.9167630136013032, + "epoch": 0.41532627741581996, + "grad_norm": 7.15609884262085, + "learning_rate": 3.925541703688249e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8486376807093621, + "num_tokens": 161165538.0, + "step": 133980 + }, + { + "entropy": 1.8915724635124207, + "epoch": 0.4153572765408696, + "grad_norm": 7.963951587677002, + "learning_rate": 3.925395213493153e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8474412530660629, + "num_tokens": 161178272.0, + "step": 133990 + }, + { + "entropy": 1.9221931904554368, + "epoch": 0.41538827566591935, + "grad_norm": 7.879276752471924, + "learning_rate": 3.9252487396966406e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8496794193983078, + "num_tokens": 161189021.0, + "step": 134000 + }, + { + "entropy": 1.8921538457274436, + "epoch": 0.415419274790969, + "grad_norm": 8.086331367492676, + "learning_rate": 3.925102282295654e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8499061942100525, + "num_tokens": 161201131.0, + "step": 134010 + }, + { + "entropy": 1.825336940586567, + "epoch": 0.41545027391601874, + "grad_norm": 3.680633068084717, + "learning_rate": 3.924955841287134e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8611365631222725, + "num_tokens": 161213630.0, + "step": 134020 + }, + { + "entropy": 1.8338982105255126, + "epoch": 0.4154812730410684, + "grad_norm": 9.535050392150879, + "learning_rate": 3.924809416668023e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8613386616110802, + "num_tokens": 161226206.0, + "step": 134030 + }, + { + "entropy": 1.8270452991127968, + "epoch": 0.41551227216611814, + "grad_norm": 4.551232814788818, + "learning_rate": 3.924663008435264e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8666022554039955, + "num_tokens": 161238486.0, + "step": 134040 + }, + { + "entropy": 1.9449387162923812, + "epoch": 0.4155432712911678, + "grad_norm": 8.532154083251953, + "learning_rate": 3.924516616585802e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8555340111255646, + "num_tokens": 161249333.0, + "step": 134050 + }, + { + "entropy": 1.9312722623348235, + "epoch": 0.41557427041621753, + "grad_norm": 8.082392692565918, + "learning_rate": 3.9243702411165805e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.8388245224952697, + "num_tokens": 161260572.0, + "step": 134060 + }, + { + "entropy": 1.7703733541071416, + "epoch": 0.4156052695412672, + "grad_norm": 8.54536247253418, + "learning_rate": 3.924223882024544e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8662910461425781, + "num_tokens": 161274273.0, + "step": 134070 + }, + { + "entropy": 1.8904980316758155, + "epoch": 0.41563626866631687, + "grad_norm": 9.833793640136719, + "learning_rate": 3.9240775393066405e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8553065106272697, + "num_tokens": 161285970.0, + "step": 134080 + }, + { + "entropy": 1.887739597260952, + "epoch": 0.4156672677913666, + "grad_norm": 6.797054767608643, + "learning_rate": 3.923931212959817e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8516652911901474, + "num_tokens": 161297693.0, + "step": 134090 + }, + { + "entropy": 1.9516427367925644, + "epoch": 0.41569826691641626, + "grad_norm": 7.91145658493042, + "learning_rate": 3.923784902981022e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.855071696639061, + "num_tokens": 161308121.0, + "step": 134100 + }, + { + "entropy": 1.9453662782907486, + "epoch": 0.415729266041466, + "grad_norm": 7.931567668914795, + "learning_rate": 3.923638609367202e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.8384775072336197, + "num_tokens": 161319934.0, + "step": 134110 + }, + { + "entropy": 1.9178491979837418, + "epoch": 0.41576026516651565, + "grad_norm": 8.443523406982422, + "learning_rate": 3.923492332115307e-06, + "loss": 0.5374, + "mean_token_accuracy": 0.8489273399114609, + "num_tokens": 161330638.0, + "step": 134120 + }, + { + "entropy": 1.9393232375383378, + "epoch": 0.4157912642915654, + "grad_norm": 6.289927005767822, + "learning_rate": 3.9233460712222895e-06, + "loss": 0.5225, + "mean_token_accuracy": 0.8360320463776588, + "num_tokens": 161342902.0, + "step": 134130 + }, + { + "entropy": 1.9341855004429818, + "epoch": 0.41582226341661505, + "grad_norm": 8.161822319030762, + "learning_rate": 3.923199826685099e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.850852632522583, + "num_tokens": 161354270.0, + "step": 134140 + }, + { + "entropy": 1.8818897396326064, + "epoch": 0.41585326254166477, + "grad_norm": 9.941109657287598, + "learning_rate": 3.9230535985006865e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8523253738880158, + "num_tokens": 161366256.0, + "step": 134150 + }, + { + "entropy": 1.878204520046711, + "epoch": 0.41588426166671444, + "grad_norm": 9.152496337890625, + "learning_rate": 3.9229073866660056e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8530999973416329, + "num_tokens": 161378072.0, + "step": 134160 + }, + { + "entropy": 1.9943836033344269, + "epoch": 0.41591526079176416, + "grad_norm": 8.155722618103027, + "learning_rate": 3.92276119117801e-06, + "loss": 0.5221, + "mean_token_accuracy": 0.8379778906702995, + "num_tokens": 161389288.0, + "step": 134170 + }, + { + "entropy": 1.8447602733969688, + "epoch": 0.41594625991681383, + "grad_norm": 8.624749183654785, + "learning_rate": 3.922615012033654e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8538846731185913, + "num_tokens": 161401938.0, + "step": 134180 + }, + { + "entropy": 1.8731971263885498, + "epoch": 0.41597725904186356, + "grad_norm": 8.428763389587402, + "learning_rate": 3.922468849229893e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8539308935403824, + "num_tokens": 161413429.0, + "step": 134190 + }, + { + "entropy": 1.9091638535261155, + "epoch": 0.4160082581669132, + "grad_norm": 8.18406867980957, + "learning_rate": 3.922322702763682e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8525436207652092, + "num_tokens": 161424862.0, + "step": 134200 + }, + { + "entropy": 1.8178348287940025, + "epoch": 0.41603925729196295, + "grad_norm": 5.807666778564453, + "learning_rate": 3.922176572631976e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8637724488973617, + "num_tokens": 161437571.0, + "step": 134210 + }, + { + "entropy": 1.8581533119082452, + "epoch": 0.4160702564170126, + "grad_norm": 5.807939052581787, + "learning_rate": 3.922030458831736e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8546082183718682, + "num_tokens": 161450359.0, + "step": 134220 + }, + { + "entropy": 1.9059478402137757, + "epoch": 0.41610125554206234, + "grad_norm": 8.28518009185791, + "learning_rate": 3.921884361359918e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8530061066150665, + "num_tokens": 161461493.0, + "step": 134230 + }, + { + "entropy": 1.8699321061372758, + "epoch": 0.416132254667112, + "grad_norm": 7.9905195236206055, + "learning_rate": 3.921738280213482e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8566524386405945, + "num_tokens": 161473498.0, + "step": 134240 + }, + { + "entropy": 1.88876011967659, + "epoch": 0.41616325379216174, + "grad_norm": 8.742656707763672, + "learning_rate": 3.921592215389386e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8500191062688828, + "num_tokens": 161485605.0, + "step": 134250 + }, + { + "entropy": 1.9045383304357528, + "epoch": 0.4161942529172114, + "grad_norm": 9.43719482421875, + "learning_rate": 3.921446166884594e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8534532591700554, + "num_tokens": 161496981.0, + "step": 134260 + }, + { + "entropy": 1.9324941888451577, + "epoch": 0.41622525204226113, + "grad_norm": 7.1453680992126465, + "learning_rate": 3.921300134696064e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8457109674811363, + "num_tokens": 161508264.0, + "step": 134270 + }, + { + "entropy": 1.8439267560839654, + "epoch": 0.4162562511673108, + "grad_norm": 3.715588331222534, + "learning_rate": 3.921154118820759e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8561937183141708, + "num_tokens": 161521748.0, + "step": 134280 + }, + { + "entropy": 1.995403453707695, + "epoch": 0.4162872502923605, + "grad_norm": 8.698075294494629, + "learning_rate": 3.921008119255644e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8444655641913414, + "num_tokens": 161532336.0, + "step": 134290 + }, + { + "entropy": 1.8547970443964004, + "epoch": 0.4163182494174102, + "grad_norm": 7.284455299377441, + "learning_rate": 3.9208621359976816e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8494389861822128, + "num_tokens": 161545319.0, + "step": 134300 + }, + { + "entropy": 1.9544886738061904, + "epoch": 0.4163492485424599, + "grad_norm": 8.434039115905762, + "learning_rate": 3.920716169043834e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8425055891275406, + "num_tokens": 161556597.0, + "step": 134310 + }, + { + "entropy": 1.8419823780655862, + "epoch": 0.4163802476675096, + "grad_norm": 8.092065811157227, + "learning_rate": 3.9205702183910696e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8516080200672149, + "num_tokens": 161570185.0, + "step": 134320 + }, + { + "entropy": 1.8836676687002183, + "epoch": 0.41641124679255925, + "grad_norm": 7.677006721496582, + "learning_rate": 3.920424284036354e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8599972873926163, + "num_tokens": 161582993.0, + "step": 134330 + }, + { + "entropy": 1.910847471654415, + "epoch": 0.416442245917609, + "grad_norm": 10.631965637207031, + "learning_rate": 3.920278365976653e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.8386124089360237, + "num_tokens": 161594430.0, + "step": 134340 + }, + { + "entropy": 1.867373377084732, + "epoch": 0.41647324504265865, + "grad_norm": 4.6483259201049805, + "learning_rate": 3.920132464208936e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8458052203059196, + "num_tokens": 161607226.0, + "step": 134350 + }, + { + "entropy": 1.9246284693479538, + "epoch": 0.41650424416770837, + "grad_norm": 7.288817405700684, + "learning_rate": 3.91998657873017e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8457208350300789, + "num_tokens": 161618272.0, + "step": 134360 + }, + { + "entropy": 1.8862150296568871, + "epoch": 0.41653524329275804, + "grad_norm": 5.131035804748535, + "learning_rate": 3.919840709537325e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8593840152025223, + "num_tokens": 161630204.0, + "step": 134370 + }, + { + "entropy": 1.876247802376747, + "epoch": 0.41656624241780776, + "grad_norm": 10.8263578414917, + "learning_rate": 3.919694856627371e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.869242000579834, + "num_tokens": 161641771.0, + "step": 134380 + }, + { + "entropy": 1.9201571598649025, + "epoch": 0.41659724154285743, + "grad_norm": 8.346663475036621, + "learning_rate": 3.919549019997278e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8538030579686164, + "num_tokens": 161653401.0, + "step": 134390 + }, + { + "entropy": 1.8522045940160752, + "epoch": 0.41662824066790716, + "grad_norm": 3.982300281524658, + "learning_rate": 3.919403199644019e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8470421150326729, + "num_tokens": 161665679.0, + "step": 134400 + }, + { + "entropy": 1.8318135902285575, + "epoch": 0.4166592397929568, + "grad_norm": 4.39650297164917, + "learning_rate": 3.919257395564566e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8538282006978989, + "num_tokens": 161678834.0, + "step": 134410 + }, + { + "entropy": 1.8755167797207832, + "epoch": 0.41669023891800655, + "grad_norm": 9.182366371154785, + "learning_rate": 3.919111607755892e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.847786869108677, + "num_tokens": 161691040.0, + "step": 134420 + }, + { + "entropy": 1.921954096853733, + "epoch": 0.4167212380430562, + "grad_norm": 8.545275688171387, + "learning_rate": 3.918965836214972e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8338611707091331, + "num_tokens": 161703955.0, + "step": 134430 + }, + { + "entropy": 1.8536706238985061, + "epoch": 0.41675223716810594, + "grad_norm": 7.954626560211182, + "learning_rate": 3.918820080938779e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8524566516280174, + "num_tokens": 161716705.0, + "step": 134440 + }, + { + "entropy": 1.907846173644066, + "epoch": 0.4167832362931556, + "grad_norm": 6.290383338928223, + "learning_rate": 3.91867434192429e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8462353229522706, + "num_tokens": 161728755.0, + "step": 134450 + }, + { + "entropy": 1.9126424625515939, + "epoch": 0.41681423541820534, + "grad_norm": 7.351926803588867, + "learning_rate": 3.918528619168481e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8547870278358459, + "num_tokens": 161740140.0, + "step": 134460 + }, + { + "entropy": 1.7981932133436203, + "epoch": 0.416845234543255, + "grad_norm": 10.335866928100586, + "learning_rate": 3.9183829126683305e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8719327822327614, + "num_tokens": 161753546.0, + "step": 134470 + }, + { + "entropy": 1.9288191676139832, + "epoch": 0.41687623366830473, + "grad_norm": 8.642077445983887, + "learning_rate": 3.918237222420813e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8515138506889344, + "num_tokens": 161764200.0, + "step": 134480 + }, + { + "entropy": 1.8795452415943146, + "epoch": 0.4169072327933544, + "grad_norm": 7.725874900817871, + "learning_rate": 3.918091548422912e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8545924738049507, + "num_tokens": 161776635.0, + "step": 134490 + }, + { + "entropy": 1.9160576537251472, + "epoch": 0.4169382319184041, + "grad_norm": 4.713511943817139, + "learning_rate": 3.9179458906716036e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.854782110452652, + "num_tokens": 161789062.0, + "step": 134500 + }, + { + "entropy": 1.891368383169174, + "epoch": 0.4169692310434538, + "grad_norm": 7.320746898651123, + "learning_rate": 3.917800249163869e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8455578476190567, + "num_tokens": 161801067.0, + "step": 134510 + }, + { + "entropy": 1.9198919370770455, + "epoch": 0.4170002301685035, + "grad_norm": 4.034561634063721, + "learning_rate": 3.917654623896689e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8582830354571342, + "num_tokens": 161813219.0, + "step": 134520 + }, + { + "entropy": 1.8274074256420136, + "epoch": 0.4170312292935532, + "grad_norm": 3.6742751598358154, + "learning_rate": 3.917509014867046e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8572375267744065, + "num_tokens": 161826737.0, + "step": 134530 + }, + { + "entropy": 1.96164468228817, + "epoch": 0.4170622284186029, + "grad_norm": 7.9983367919921875, + "learning_rate": 3.917363422071923e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8470546841621399, + "num_tokens": 161837950.0, + "step": 134540 + }, + { + "entropy": 1.840873746573925, + "epoch": 0.4170932275436526, + "grad_norm": 8.481100082397461, + "learning_rate": 3.917217845508302e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8508628994226456, + "num_tokens": 161851286.0, + "step": 134550 + }, + { + "entropy": 1.8318819746375083, + "epoch": 0.4171242266687023, + "grad_norm": 6.554941654205322, + "learning_rate": 3.917072285173169e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8574759662151337, + "num_tokens": 161864090.0, + "step": 134560 + }, + { + "entropy": 1.8796166345477103, + "epoch": 0.41715522579375197, + "grad_norm": 7.1165266036987305, + "learning_rate": 3.916926741063509e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8539829269051552, + "num_tokens": 161876656.0, + "step": 134570 + }, + { + "entropy": 1.8293213650584221, + "epoch": 0.41718622491880164, + "grad_norm": 8.498753547668457, + "learning_rate": 3.916781213176306e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8652872815728188, + "num_tokens": 161889042.0, + "step": 134580 + }, + { + "entropy": 1.9550884038209915, + "epoch": 0.41721722404385136, + "grad_norm": 9.21761417388916, + "learning_rate": 3.916635701508549e-06, + "loss": 0.5468, + "mean_token_accuracy": 0.8403035715222359, + "num_tokens": 161900992.0, + "step": 134590 + }, + { + "entropy": 1.9226959884166717, + "epoch": 0.41724822316890103, + "grad_norm": 7.0631842613220215, + "learning_rate": 3.916490206057224e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8605365335941315, + "num_tokens": 161912060.0, + "step": 134600 + }, + { + "entropy": 1.7682531923055649, + "epoch": 0.41727922229395076, + "grad_norm": 8.758580207824707, + "learning_rate": 3.91634472681932e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8549866124987602, + "num_tokens": 161926308.0, + "step": 134610 + }, + { + "entropy": 1.805106556415558, + "epoch": 0.4173102214190004, + "grad_norm": 4.0905070304870605, + "learning_rate": 3.916199263791824e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8517909824848175, + "num_tokens": 161939441.0, + "step": 134620 + }, + { + "entropy": 1.7959918841719626, + "epoch": 0.41734122054405015, + "grad_norm": 9.999496459960938, + "learning_rate": 3.916053816971728e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.863648708164692, + "num_tokens": 161952525.0, + "step": 134630 + }, + { + "entropy": 1.8762843772768973, + "epoch": 0.4173722196690998, + "grad_norm": 3.626180648803711, + "learning_rate": 3.9159083863560204e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8647943899035454, + "num_tokens": 161964249.0, + "step": 134640 + }, + { + "entropy": 1.9006209999322892, + "epoch": 0.41740321879414954, + "grad_norm": 8.766674041748047, + "learning_rate": 3.915762971941694e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.8434425577521324, + "num_tokens": 161975450.0, + "step": 134650 + }, + { + "entropy": 1.959898342192173, + "epoch": 0.4174342179191992, + "grad_norm": 7.614290714263916, + "learning_rate": 3.915617573725742e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.847202044725418, + "num_tokens": 161986492.0, + "step": 134660 + }, + { + "entropy": 1.9258347421884536, + "epoch": 0.41746521704424894, + "grad_norm": 7.911265850067139, + "learning_rate": 3.9154721917051546e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8465585187077522, + "num_tokens": 161997495.0, + "step": 134670 + }, + { + "entropy": 1.8868989422917366, + "epoch": 0.4174962161692986, + "grad_norm": 3.8985397815704346, + "learning_rate": 3.915326825876927e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8470399528741837, + "num_tokens": 162009972.0, + "step": 134680 + }, + { + "entropy": 1.961172890663147, + "epoch": 0.41752721529434833, + "grad_norm": 9.122645378112793, + "learning_rate": 3.915181476238054e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8451171442866325, + "num_tokens": 162021139.0, + "step": 134690 + }, + { + "entropy": 1.867299085855484, + "epoch": 0.417558214419398, + "grad_norm": 7.1753034591674805, + "learning_rate": 3.91503614278553e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8628281652927399, + "num_tokens": 162033229.0, + "step": 134700 + }, + { + "entropy": 1.8105697840452195, + "epoch": 0.4175892135444477, + "grad_norm": 3.7438528537750244, + "learning_rate": 3.91489082551635e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8544130697846413, + "num_tokens": 162046064.0, + "step": 134710 + }, + { + "entropy": 1.916566787660122, + "epoch": 0.4176202126694974, + "grad_norm": 8.405454635620117, + "learning_rate": 3.914745524427513e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8509135708212853, + "num_tokens": 162056998.0, + "step": 134720 + }, + { + "entropy": 1.883594536781311, + "epoch": 0.4176512117945471, + "grad_norm": 9.41535472869873, + "learning_rate": 3.914600239516016e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8488730236887931, + "num_tokens": 162068897.0, + "step": 134730 + }, + { + "entropy": 1.8739972546696664, + "epoch": 0.4176822109195968, + "grad_norm": 4.211245059967041, + "learning_rate": 3.9144549707788556e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8516407638788224, + "num_tokens": 162081030.0, + "step": 134740 + }, + { + "entropy": 1.9788037657737731, + "epoch": 0.4177132100446465, + "grad_norm": 8.905488967895508, + "learning_rate": 3.914309718213034e-06, + "loss": 0.5444, + "mean_token_accuracy": 0.8368604257702827, + "num_tokens": 162091716.0, + "step": 134750 + }, + { + "entropy": 1.8498676016926765, + "epoch": 0.4177442091696962, + "grad_norm": 2.3638017177581787, + "learning_rate": 3.914164481815549e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8562711879611016, + "num_tokens": 162103721.0, + "step": 134760 + }, + { + "entropy": 1.9256371967494488, + "epoch": 0.4177752082947459, + "grad_norm": 8.89646053314209, + "learning_rate": 3.9140192615833995e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8388048008084297, + "num_tokens": 162115431.0, + "step": 134770 + }, + { + "entropy": 1.8755139395594598, + "epoch": 0.41780620741979557, + "grad_norm": 7.007622718811035, + "learning_rate": 3.91387405751359e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8632483765482902, + "num_tokens": 162127975.0, + "step": 134780 + }, + { + "entropy": 1.865597426891327, + "epoch": 0.4178372065448453, + "grad_norm": 4.80448579788208, + "learning_rate": 3.913728869603122e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8586938053369522, + "num_tokens": 162139430.0, + "step": 134790 + }, + { + "entropy": 1.9837687581777572, + "epoch": 0.41786820566989497, + "grad_norm": 7.113491058349609, + "learning_rate": 3.913583697848999e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.8430691197514534, + "num_tokens": 162150117.0, + "step": 134800 + }, + { + "entropy": 1.8740553870797156, + "epoch": 0.4178992047949447, + "grad_norm": 8.047861099243164, + "learning_rate": 3.913438542248223e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8576863974332809, + "num_tokens": 162161930.0, + "step": 134810 + }, + { + "entropy": 1.955976441502571, + "epoch": 0.41793020391999436, + "grad_norm": 8.8277006149292, + "learning_rate": 3.913293402797799e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8523634567856788, + "num_tokens": 162173265.0, + "step": 134820 + }, + { + "entropy": 1.9804795682430267, + "epoch": 0.417961203045044, + "grad_norm": 7.893826007843018, + "learning_rate": 3.913148279494734e-06, + "loss": 0.5503, + "mean_token_accuracy": 0.8313609048724174, + "num_tokens": 162184039.0, + "step": 134830 + }, + { + "entropy": 1.8900707513093948, + "epoch": 0.41799220217009375, + "grad_norm": 7.8856329917907715, + "learning_rate": 3.913003172336033e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8358192846179009, + "num_tokens": 162196357.0, + "step": 134840 + }, + { + "entropy": 1.93702944368124, + "epoch": 0.4180232012951434, + "grad_norm": 7.011838912963867, + "learning_rate": 3.912858081318703e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.854140405356884, + "num_tokens": 162207437.0, + "step": 134850 + }, + { + "entropy": 1.9064744263887405, + "epoch": 0.41805420042019314, + "grad_norm": 3.6196608543395996, + "learning_rate": 3.912713006439751e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.8377542212605477, + "num_tokens": 162218620.0, + "step": 134860 + }, + { + "entropy": 1.8770883545279502, + "epoch": 0.4180851995452428, + "grad_norm": 8.111139297485352, + "learning_rate": 3.912567947696187e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.858944533765316, + "num_tokens": 162230501.0, + "step": 134870 + }, + { + "entropy": 1.9460731491446495, + "epoch": 0.41811619867029254, + "grad_norm": 10.321117401123047, + "learning_rate": 3.912422905085019e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8436010986566543, + "num_tokens": 162241864.0, + "step": 134880 + }, + { + "entropy": 1.9751074135303497, + "epoch": 0.4181471977953422, + "grad_norm": 7.4099273681640625, + "learning_rate": 3.912277878603257e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.8479305684566498, + "num_tokens": 162252709.0, + "step": 134890 + }, + { + "entropy": 1.8740838050842286, + "epoch": 0.41817819692039193, + "grad_norm": 3.3563060760498047, + "learning_rate": 3.9121328682479126e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8611055985093117, + "num_tokens": 162264722.0, + "step": 134900 + }, + { + "entropy": 1.8472968205809592, + "epoch": 0.4182091960454416, + "grad_norm": 4.131076812744141, + "learning_rate": 3.911987874015997e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8558681800961494, + "num_tokens": 162277616.0, + "step": 134910 + }, + { + "entropy": 1.9546231135725975, + "epoch": 0.4182401951704913, + "grad_norm": 3.539958953857422, + "learning_rate": 3.911842895904522e-06, + "loss": 0.5274, + "mean_token_accuracy": 0.8383591964840889, + "num_tokens": 162288986.0, + "step": 134920 + }, + { + "entropy": 1.8494961842894555, + "epoch": 0.418271194295541, + "grad_norm": 7.521032810211182, + "learning_rate": 3.911697933910501e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8557368025183678, + "num_tokens": 162301754.0, + "step": 134930 + }, + { + "entropy": 1.8642336532473565, + "epoch": 0.4183021934205907, + "grad_norm": 4.32247257232666, + "learning_rate": 3.911552988030949e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8557508587837219, + "num_tokens": 162313972.0, + "step": 134940 + }, + { + "entropy": 1.94805389046669, + "epoch": 0.4183331925456404, + "grad_norm": 9.484874725341797, + "learning_rate": 3.911408058262879e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8523627325892449, + "num_tokens": 162324752.0, + "step": 134950 + }, + { + "entropy": 1.8355311632156373, + "epoch": 0.4183641916706901, + "grad_norm": 16.279993057250977, + "learning_rate": 3.911263144603306e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8622775912284851, + "num_tokens": 162337826.0, + "step": 134960 + }, + { + "entropy": 1.8673854805529118, + "epoch": 0.4183951907957398, + "grad_norm": 7.603078365325928, + "learning_rate": 3.911118247049249e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8558634921908379, + "num_tokens": 162350433.0, + "step": 134970 + }, + { + "entropy": 1.9007538348436355, + "epoch": 0.4184261899207895, + "grad_norm": 7.9676337242126465, + "learning_rate": 3.9109733655977225e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.8374831721186637, + "num_tokens": 162362674.0, + "step": 134980 + }, + { + "entropy": 1.8434744045138358, + "epoch": 0.4184571890458392, + "grad_norm": 7.775720119476318, + "learning_rate": 3.910828500245745e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8564092546701432, + "num_tokens": 162375250.0, + "step": 134990 + }, + { + "entropy": 1.918935863673687, + "epoch": 0.4184881881708889, + "grad_norm": 7.319066524505615, + "learning_rate": 3.910683650990335e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.861502180993557, + "num_tokens": 162386475.0, + "step": 135000 + }, + { + "entropy": 1.8640062302350997, + "epoch": 0.41851918729593857, + "grad_norm": 8.106133460998535, + "learning_rate": 3.910538817828512e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.854218167066574, + "num_tokens": 162398652.0, + "step": 135010 + }, + { + "entropy": 1.7948240421712398, + "epoch": 0.4185501864209883, + "grad_norm": 3.9371562004089355, + "learning_rate": 3.910394000757297e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8555122867226601, + "num_tokens": 162411080.0, + "step": 135020 + }, + { + "entropy": 1.8870450854301453, + "epoch": 0.41858118554603796, + "grad_norm": 7.394895076751709, + "learning_rate": 3.910249199773708e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8639406561851501, + "num_tokens": 162422198.0, + "step": 135030 + }, + { + "entropy": 1.928053417801857, + "epoch": 0.4186121846710877, + "grad_norm": 7.574821472167969, + "learning_rate": 3.910104414874769e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.84600929915905, + "num_tokens": 162432961.0, + "step": 135040 + }, + { + "entropy": 1.9174908056855202, + "epoch": 0.41864318379613735, + "grad_norm": 8.21877384185791, + "learning_rate": 3.909959646057503e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8488267034292221, + "num_tokens": 162443874.0, + "step": 135050 + }, + { + "entropy": 1.8227450162172318, + "epoch": 0.4186741829211871, + "grad_norm": 9.875720024108887, + "learning_rate": 3.90981489331893e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8520313039422035, + "num_tokens": 162457203.0, + "step": 135060 + }, + { + "entropy": 1.8200043380260467, + "epoch": 0.41870518204623675, + "grad_norm": 10.07118034362793, + "learning_rate": 3.909670156656076e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8482141956686974, + "num_tokens": 162469740.0, + "step": 135070 + }, + { + "entropy": 1.9454358339309692, + "epoch": 0.4187361811712864, + "grad_norm": 9.135685920715332, + "learning_rate": 3.909525436065966e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8446780741214752, + "num_tokens": 162480950.0, + "step": 135080 + }, + { + "entropy": 1.802914521098137, + "epoch": 0.41876718029633614, + "grad_norm": 7.378044605255127, + "learning_rate": 3.909380731545625e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8730568781495094, + "num_tokens": 162493924.0, + "step": 135090 + }, + { + "entropy": 1.762102383375168, + "epoch": 0.4187981794213858, + "grad_norm": 8.075687408447266, + "learning_rate": 3.90923604309208e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8703906610608101, + "num_tokens": 162507010.0, + "step": 135100 + }, + { + "entropy": 1.890143983066082, + "epoch": 0.41882917854643553, + "grad_norm": 8.960722923278809, + "learning_rate": 3.9090913707023566e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8449656277894974, + "num_tokens": 162518570.0, + "step": 135110 + }, + { + "entropy": 1.8910117477178574, + "epoch": 0.4188601776714852, + "grad_norm": 3.260122776031494, + "learning_rate": 3.908946714373483e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8450547441840172, + "num_tokens": 162530768.0, + "step": 135120 + }, + { + "entropy": 1.8741901010274886, + "epoch": 0.4188911767965349, + "grad_norm": 3.713733196258545, + "learning_rate": 3.908802074102489e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8599935069680213, + "num_tokens": 162542560.0, + "step": 135130 + }, + { + "entropy": 1.8225472882390021, + "epoch": 0.4189221759215846, + "grad_norm": 2.8528435230255127, + "learning_rate": 3.908657449886402e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8527352377772331, + "num_tokens": 162555416.0, + "step": 135140 + }, + { + "entropy": 1.9285516187548637, + "epoch": 0.4189531750466343, + "grad_norm": 9.028520584106445, + "learning_rate": 3.908512841722253e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8442423656582833, + "num_tokens": 162566844.0, + "step": 135150 + }, + { + "entropy": 1.9082002013921737, + "epoch": 0.418984174171684, + "grad_norm": 8.118371963500977, + "learning_rate": 3.908368249607073e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.8459517538547516, + "num_tokens": 162578692.0, + "step": 135160 + }, + { + "entropy": 1.9068205058574677, + "epoch": 0.4190151732967337, + "grad_norm": 8.231268882751465, + "learning_rate": 3.908223673537895e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8569748342037201, + "num_tokens": 162590372.0, + "step": 135170 + }, + { + "entropy": 1.7947299778461456, + "epoch": 0.4190461724217834, + "grad_norm": 7.387755393981934, + "learning_rate": 3.908079113511748e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8611501380801201, + "num_tokens": 162603085.0, + "step": 135180 + }, + { + "entropy": 1.830442163348198, + "epoch": 0.4190771715468331, + "grad_norm": 3.5146117210388184, + "learning_rate": 3.907934569525668e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8458912119269371, + "num_tokens": 162616037.0, + "step": 135190 + }, + { + "entropy": 1.8631732419133187, + "epoch": 0.4191081706718828, + "grad_norm": 8.939651489257812, + "learning_rate": 3.907790041576687e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8587908774614335, + "num_tokens": 162627961.0, + "step": 135200 + }, + { + "entropy": 1.891756534576416, + "epoch": 0.4191391697969325, + "grad_norm": 7.691137313842773, + "learning_rate": 3.907645529661842e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8453534230589866, + "num_tokens": 162640527.0, + "step": 135210 + }, + { + "entropy": 1.880522905290127, + "epoch": 0.41917016892198217, + "grad_norm": 8.83800983428955, + "learning_rate": 3.907501033778167e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8536536455154419, + "num_tokens": 162652160.0, + "step": 135220 + }, + { + "entropy": 1.9340814992785453, + "epoch": 0.4192011680470319, + "grad_norm": 9.312424659729004, + "learning_rate": 3.907356553922698e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8431622445583343, + "num_tokens": 162663724.0, + "step": 135230 + }, + { + "entropy": 1.7783160164952279, + "epoch": 0.41923216717208156, + "grad_norm": 8.399624824523926, + "learning_rate": 3.907212090092472e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8627290308475495, + "num_tokens": 162677173.0, + "step": 135240 + }, + { + "entropy": 1.9121920481324195, + "epoch": 0.4192631662971313, + "grad_norm": 4.327278137207031, + "learning_rate": 3.907067642284528e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8431808009743691, + "num_tokens": 162688069.0, + "step": 135250 + }, + { + "entropy": 1.90309626609087, + "epoch": 0.41929416542218095, + "grad_norm": 7.39229679107666, + "learning_rate": 3.906923210495903e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8486498475074769, + "num_tokens": 162699438.0, + "step": 135260 + }, + { + "entropy": 1.7719358682632447, + "epoch": 0.4193251645472307, + "grad_norm": 2.3753769397735596, + "learning_rate": 3.9067787947236376e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8691380977630615, + "num_tokens": 162711906.0, + "step": 135270 + }, + { + "entropy": 1.7657958284020423, + "epoch": 0.41935616367228035, + "grad_norm": 8.348288536071777, + "learning_rate": 3.906634394964771e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8593086928129197, + "num_tokens": 162725490.0, + "step": 135280 + }, + { + "entropy": 1.8791421324014663, + "epoch": 0.41938716279733007, + "grad_norm": 5.080158233642578, + "learning_rate": 3.906490011216344e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8483076050877572, + "num_tokens": 162737225.0, + "step": 135290 + }, + { + "entropy": 1.77224540412426, + "epoch": 0.41941816192237974, + "grad_norm": 9.42541790008545, + "learning_rate": 3.9063456434754005e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8711794734001159, + "num_tokens": 162750989.0, + "step": 135300 + }, + { + "entropy": 1.909542678296566, + "epoch": 0.41944916104742946, + "grad_norm": 8.59212589263916, + "learning_rate": 3.906201291738979e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8415026575326919, + "num_tokens": 162762817.0, + "step": 135310 + }, + { + "entropy": 1.8708134442567825, + "epoch": 0.41948016017247913, + "grad_norm": 7.089857578277588, + "learning_rate": 3.906056956004125e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8642296716570854, + "num_tokens": 162774634.0, + "step": 135320 + }, + { + "entropy": 1.8708177879452705, + "epoch": 0.4195111592975288, + "grad_norm": 7.2748823165893555, + "learning_rate": 3.905912636267882e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8618823811411858, + "num_tokens": 162786925.0, + "step": 135330 + }, + { + "entropy": 1.9178523600101471, + "epoch": 0.4195421584225785, + "grad_norm": 7.701486110687256, + "learning_rate": 3.905768332527295e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.8496042251586914, + "num_tokens": 162799600.0, + "step": 135340 + }, + { + "entropy": 1.8545253962278365, + "epoch": 0.4195731575476282, + "grad_norm": 8.929347038269043, + "learning_rate": 3.905624044779408e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8579249680042267, + "num_tokens": 162811881.0, + "step": 135350 + }, + { + "entropy": 1.8590249836444854, + "epoch": 0.4196041566726779, + "grad_norm": 7.88853645324707, + "learning_rate": 3.905479773021269e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.843219818174839, + "num_tokens": 162824189.0, + "step": 135360 + }, + { + "entropy": 1.8993327513337135, + "epoch": 0.4196351557977276, + "grad_norm": 7.894437789916992, + "learning_rate": 3.905335517249924e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8462705582380294, + "num_tokens": 162836157.0, + "step": 135370 + }, + { + "entropy": 1.924989990890026, + "epoch": 0.4196661549227773, + "grad_norm": 3.602142333984375, + "learning_rate": 3.9051912774624215e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8598493561148643, + "num_tokens": 162847849.0, + "step": 135380 + }, + { + "entropy": 1.8464587591588497, + "epoch": 0.419697154047827, + "grad_norm": 9.821682929992676, + "learning_rate": 3.905047053655809e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8622051507234574, + "num_tokens": 162860582.0, + "step": 135390 + }, + { + "entropy": 1.8456949055194856, + "epoch": 0.4197281531728767, + "grad_norm": 3.930516242980957, + "learning_rate": 3.904902845827135e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8577687725424766, + "num_tokens": 162873376.0, + "step": 135400 + }, + { + "entropy": 1.805974441766739, + "epoch": 0.4197591522979264, + "grad_norm": 7.337070941925049, + "learning_rate": 3.904758653973452e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8626691713929177, + "num_tokens": 162886100.0, + "step": 135410 + }, + { + "entropy": 1.9164967209100723, + "epoch": 0.4197901514229761, + "grad_norm": 9.788459777832031, + "learning_rate": 3.904614478091809e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8476973131299019, + "num_tokens": 162897812.0, + "step": 135420 + }, + { + "entropy": 1.9670021116733551, + "epoch": 0.41982115054802577, + "grad_norm": 9.283044815063477, + "learning_rate": 3.904470318179257e-06, + "loss": 0.534, + "mean_token_accuracy": 0.8294244706630707, + "num_tokens": 162909541.0, + "step": 135430 + }, + { + "entropy": 1.9219158321619034, + "epoch": 0.4198521496730755, + "grad_norm": 8.37589168548584, + "learning_rate": 3.9043261742328505e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8363066896796226, + "num_tokens": 162921516.0, + "step": 135440 + }, + { + "entropy": 1.953237357735634, + "epoch": 0.41988314879812516, + "grad_norm": 9.618903160095215, + "learning_rate": 3.904182046249641e-06, + "loss": 0.534, + "mean_token_accuracy": 0.8366630434989929, + "num_tokens": 162932956.0, + "step": 135450 + }, + { + "entropy": 1.9217358708381653, + "epoch": 0.4199141479231749, + "grad_norm": 7.9734649658203125, + "learning_rate": 3.904037934226683e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8531175389885902, + "num_tokens": 162944286.0, + "step": 135460 + }, + { + "entropy": 1.799736338853836, + "epoch": 0.41994514704822455, + "grad_norm": 8.599648475646973, + "learning_rate": 3.903893838161029e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8724782764911652, + "num_tokens": 162957587.0, + "step": 135470 + }, + { + "entropy": 1.8858862951397897, + "epoch": 0.4199761461732743, + "grad_norm": 3.428628444671631, + "learning_rate": 3.903749758049738e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8496242195367814, + "num_tokens": 162969224.0, + "step": 135480 + }, + { + "entropy": 1.8270292654633522, + "epoch": 0.42000714529832395, + "grad_norm": 2.5544705390930176, + "learning_rate": 3.903605693889863e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8701824352145195, + "num_tokens": 162982342.0, + "step": 135490 + }, + { + "entropy": 1.8943888053297997, + "epoch": 0.42003814442337367, + "grad_norm": 9.184906005859375, + "learning_rate": 3.90346164567846e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8437836706638336, + "num_tokens": 162994089.0, + "step": 135500 + }, + { + "entropy": 1.8288681022822857, + "epoch": 0.42006914354842334, + "grad_norm": 8.831469535827637, + "learning_rate": 3.903317613412592e-06, + "loss": 0.394, + "mean_token_accuracy": 0.867976401746273, + "num_tokens": 163006600.0, + "step": 135510 + }, + { + "entropy": 1.9004840448498725, + "epoch": 0.42010014267347306, + "grad_norm": 8.030474662780762, + "learning_rate": 3.903173597089313e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8580782458186149, + "num_tokens": 163017668.0, + "step": 135520 + }, + { + "entropy": 1.9328984439373016, + "epoch": 0.42013114179852273, + "grad_norm": 8.515178680419922, + "learning_rate": 3.903029596705682e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8430827930569649, + "num_tokens": 163029533.0, + "step": 135530 + }, + { + "entropy": 1.903455564379692, + "epoch": 0.42016214092357246, + "grad_norm": 8.266940116882324, + "learning_rate": 3.90288561225876e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8574344128370285, + "num_tokens": 163041861.0, + "step": 135540 + }, + { + "entropy": 1.7817657575011254, + "epoch": 0.4201931400486221, + "grad_norm": 8.015380859375, + "learning_rate": 3.902741643745609e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8572424992918968, + "num_tokens": 163054619.0, + "step": 135550 + }, + { + "entropy": 1.8124181941151618, + "epoch": 0.4202241391736718, + "grad_norm": 5.867700576782227, + "learning_rate": 3.902597691163288e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8469098672270775, + "num_tokens": 163067835.0, + "step": 135560 + }, + { + "entropy": 1.9572464644908905, + "epoch": 0.4202551382987215, + "grad_norm": 8.415164947509766, + "learning_rate": 3.902453754508861e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8418419510126114, + "num_tokens": 163079318.0, + "step": 135570 + }, + { + "entropy": 1.9142266780138015, + "epoch": 0.4202861374237712, + "grad_norm": 7.7311577796936035, + "learning_rate": 3.902309833779389e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8493567198514939, + "num_tokens": 163090456.0, + "step": 135580 + }, + { + "entropy": 1.9059449434280396, + "epoch": 0.4203171365488209, + "grad_norm": 7.837739944458008, + "learning_rate": 3.902165928971938e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.857314033806324, + "num_tokens": 163101560.0, + "step": 135590 + }, + { + "entropy": 1.9108774542808533, + "epoch": 0.4203481356738706, + "grad_norm": 7.050215244293213, + "learning_rate": 3.90202204008357e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8525308519601822, + "num_tokens": 163113176.0, + "step": 135600 + }, + { + "entropy": 1.8393698036670685, + "epoch": 0.4203791347989203, + "grad_norm": 3.4507510662078857, + "learning_rate": 3.901878167111353e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8612019449472428, + "num_tokens": 163126007.0, + "step": 135610 + }, + { + "entropy": 1.9607517927885056, + "epoch": 0.42041013392397, + "grad_norm": 7.289004325866699, + "learning_rate": 3.9017343100523505e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.8425389587879181, + "num_tokens": 163136804.0, + "step": 135620 + }, + { + "entropy": 1.8896097630262374, + "epoch": 0.4204411330490197, + "grad_norm": 8.581795692443848, + "learning_rate": 3.90159046890363e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8448723956942559, + "num_tokens": 163148832.0, + "step": 135630 + }, + { + "entropy": 1.922749936580658, + "epoch": 0.42047213217406937, + "grad_norm": 8.489492416381836, + "learning_rate": 3.90144664366226e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8492430374026299, + "num_tokens": 163160997.0, + "step": 135640 + }, + { + "entropy": 1.9693446904420853, + "epoch": 0.4205031312991191, + "grad_norm": 9.86866283416748, + "learning_rate": 3.9013028343253065e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8453862652182579, + "num_tokens": 163172061.0, + "step": 135650 + }, + { + "entropy": 1.9001538813114167, + "epoch": 0.42053413042416876, + "grad_norm": 7.654898166656494, + "learning_rate": 3.901159040889842e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8523498460650444, + "num_tokens": 163184244.0, + "step": 135660 + }, + { + "entropy": 1.945447364449501, + "epoch": 0.4205651295492185, + "grad_norm": 8.955438613891602, + "learning_rate": 3.901015263352933e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8504986420273781, + "num_tokens": 163195436.0, + "step": 135670 + }, + { + "entropy": 1.9072694763541223, + "epoch": 0.42059612867426815, + "grad_norm": 8.15773868560791, + "learning_rate": 3.900871501711651e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8516003981232643, + "num_tokens": 163206912.0, + "step": 135680 + }, + { + "entropy": 1.8923516809940337, + "epoch": 0.4206271277993179, + "grad_norm": 8.769623756408691, + "learning_rate": 3.900727755963067e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.848148649930954, + "num_tokens": 163218689.0, + "step": 135690 + }, + { + "entropy": 1.8840610541403293, + "epoch": 0.42065812692436755, + "grad_norm": 7.660909652709961, + "learning_rate": 3.9005840261042535e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8552731052041054, + "num_tokens": 163231024.0, + "step": 135700 + }, + { + "entropy": 1.903901606798172, + "epoch": 0.4206891260494173, + "grad_norm": 3.740813732147217, + "learning_rate": 3.900440312132283e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8453383758664131, + "num_tokens": 163242740.0, + "step": 135710 + }, + { + "entropy": 1.9122828841209412, + "epoch": 0.42072012517446694, + "grad_norm": 4.723738670349121, + "learning_rate": 3.90029661404423e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8527891039848328, + "num_tokens": 163254707.0, + "step": 135720 + }, + { + "entropy": 1.9014658220112324, + "epoch": 0.42075112429951667, + "grad_norm": 3.560215711593628, + "learning_rate": 3.900152931837168e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8482464477419853, + "num_tokens": 163266816.0, + "step": 135730 + }, + { + "entropy": 1.9249530613422394, + "epoch": 0.42078212342456633, + "grad_norm": 8.004035949707031, + "learning_rate": 3.900009265508172e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8406677231192589, + "num_tokens": 163278761.0, + "step": 135740 + }, + { + "entropy": 1.8970685094594955, + "epoch": 0.42081312254961606, + "grad_norm": 8.880062103271484, + "learning_rate": 3.899865615054318e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8497609063982964, + "num_tokens": 163291090.0, + "step": 135750 + }, + { + "entropy": 1.828018756210804, + "epoch": 0.4208441216746657, + "grad_norm": 7.22142219543457, + "learning_rate": 3.8997219804726815e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8671692669391632, + "num_tokens": 163302749.0, + "step": 135760 + }, + { + "entropy": 1.8527932271361351, + "epoch": 0.42087512079971545, + "grad_norm": 7.9586286544799805, + "learning_rate": 3.899578361760341e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8436411425471306, + "num_tokens": 163315230.0, + "step": 135770 + }, + { + "entropy": 1.8658724144101142, + "epoch": 0.4209061199247651, + "grad_norm": 4.109507083892822, + "learning_rate": 3.899434758914374e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8581762701272965, + "num_tokens": 163327266.0, + "step": 135780 + }, + { + "entropy": 1.842541829496622, + "epoch": 0.42093711904981485, + "grad_norm": 9.778244972229004, + "learning_rate": 3.89929117193186e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8542135626077652, + "num_tokens": 163339658.0, + "step": 135790 + }, + { + "entropy": 1.8465433612465858, + "epoch": 0.4209681181748645, + "grad_norm": 8.6669282913208, + "learning_rate": 3.899147600809877e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8487094223499299, + "num_tokens": 163352124.0, + "step": 135800 + }, + { + "entropy": 1.90444867759943, + "epoch": 0.4209991172999142, + "grad_norm": 8.977686882019043, + "learning_rate": 3.899004045545507e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8524438515305519, + "num_tokens": 163363934.0, + "step": 135810 + }, + { + "entropy": 1.8843096554279328, + "epoch": 0.4210301164249639, + "grad_norm": 8.393902778625488, + "learning_rate": 3.898860506135832e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8609971582889557, + "num_tokens": 163375865.0, + "step": 135820 + }, + { + "entropy": 1.9457574129104613, + "epoch": 0.4210611155500136, + "grad_norm": 7.407272815704346, + "learning_rate": 3.898716982577929e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8447777807712555, + "num_tokens": 163386634.0, + "step": 135830 + }, + { + "entropy": 1.916204272210598, + "epoch": 0.4210921146750633, + "grad_norm": 7.169795513153076, + "learning_rate": 3.898573474868886e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8451360926032067, + "num_tokens": 163398136.0, + "step": 135840 + }, + { + "entropy": 1.887849335372448, + "epoch": 0.42112311380011297, + "grad_norm": 9.207273483276367, + "learning_rate": 3.898429983005783e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.849080765247345, + "num_tokens": 163409693.0, + "step": 135850 + }, + { + "entropy": 1.8300122573971749, + "epoch": 0.4211541129251627, + "grad_norm": 8.275792121887207, + "learning_rate": 3.898286506985706e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8691477358341217, + "num_tokens": 163422481.0, + "step": 135860 + }, + { + "entropy": 1.9414837792515756, + "epoch": 0.42118511205021236, + "grad_norm": 9.043680191040039, + "learning_rate": 3.898143046805739e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8596002608537674, + "num_tokens": 163433827.0, + "step": 135870 + }, + { + "entropy": 1.8458356723189353, + "epoch": 0.4212161111752621, + "grad_norm": 8.126672744750977, + "learning_rate": 3.897999602462968e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8593410521745681, + "num_tokens": 163446168.0, + "step": 135880 + }, + { + "entropy": 1.925001895427704, + "epoch": 0.42124711030031176, + "grad_norm": 8.667831420898438, + "learning_rate": 3.897856173954477e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8532567232847214, + "num_tokens": 163457240.0, + "step": 135890 + }, + { + "entropy": 1.8693645611405372, + "epoch": 0.4212781094253615, + "grad_norm": 6.114764213562012, + "learning_rate": 3.897712761277357e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8476710587739944, + "num_tokens": 163470439.0, + "step": 135900 + }, + { + "entropy": 1.928558248281479, + "epoch": 0.42130910855041115, + "grad_norm": 9.09399127960205, + "learning_rate": 3.897569364428692e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8500400453805923, + "num_tokens": 163481986.0, + "step": 135910 + }, + { + "entropy": 1.9676057755947114, + "epoch": 0.4213401076754609, + "grad_norm": 7.342076301574707, + "learning_rate": 3.897425983405573e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8489060357213021, + "num_tokens": 163492836.0, + "step": 135920 + }, + { + "entropy": 1.8425328716635705, + "epoch": 0.42137110680051054, + "grad_norm": 4.709118843078613, + "learning_rate": 3.897282618205089e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8543885082006455, + "num_tokens": 163505935.0, + "step": 135930 + }, + { + "entropy": 1.875008788704872, + "epoch": 0.42140210592556027, + "grad_norm": 2.8072144985198975, + "learning_rate": 3.89713926882433e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8433866515755654, + "num_tokens": 163518513.0, + "step": 135940 + }, + { + "entropy": 1.907314045727253, + "epoch": 0.42143310505060994, + "grad_norm": 7.362760543823242, + "learning_rate": 3.896995935260386e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8597197920084, + "num_tokens": 163531303.0, + "step": 135950 + }, + { + "entropy": 1.9355842828750611, + "epoch": 0.42146410417565966, + "grad_norm": 3.6717031002044678, + "learning_rate": 3.896852617510349e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8499437302350998, + "num_tokens": 163542827.0, + "step": 135960 + }, + { + "entropy": 1.9380091413855554, + "epoch": 0.42149510330070933, + "grad_norm": 7.6753621101379395, + "learning_rate": 3.896709315571311e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.834736093878746, + "num_tokens": 163555197.0, + "step": 135970 + }, + { + "entropy": 1.8424546226859093, + "epoch": 0.42152610242575905, + "grad_norm": 3.7391586303710938, + "learning_rate": 3.896566029440366e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.865011602640152, + "num_tokens": 163567600.0, + "step": 135980 + }, + { + "entropy": 1.830755239725113, + "epoch": 0.4215571015508087, + "grad_norm": 10.804282188415527, + "learning_rate": 3.8964227591146075e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8500369563698769, + "num_tokens": 163580961.0, + "step": 135990 + }, + { + "entropy": 1.8739851251244546, + "epoch": 0.42158810067585845, + "grad_norm": 8.843622207641602, + "learning_rate": 3.89627950459113e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.853018419444561, + "num_tokens": 163593529.0, + "step": 136000 + }, + { + "entropy": 1.8963138684630394, + "epoch": 0.4216190998009081, + "grad_norm": 7.0039191246032715, + "learning_rate": 3.8961362658670284e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.849344827234745, + "num_tokens": 163605267.0, + "step": 136010 + }, + { + "entropy": 1.8308332130312919, + "epoch": 0.42165009892595784, + "grad_norm": 9.128500938415527, + "learning_rate": 3.895993042939398e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8566051840782165, + "num_tokens": 163618128.0, + "step": 136020 + }, + { + "entropy": 1.9336943060159684, + "epoch": 0.4216810980510075, + "grad_norm": 8.359414100646973, + "learning_rate": 3.895849835805338e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8571832865476608, + "num_tokens": 163629610.0, + "step": 136030 + }, + { + "entropy": 1.81222113519907, + "epoch": 0.42171209717605723, + "grad_norm": 7.943345546722412, + "learning_rate": 3.8957066444619444e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.86582732796669, + "num_tokens": 163642383.0, + "step": 136040 + }, + { + "entropy": 1.8568004190921783, + "epoch": 0.4217430963011069, + "grad_norm": 5.962125778198242, + "learning_rate": 3.895563468906315e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8517604455351829, + "num_tokens": 163654588.0, + "step": 136050 + }, + { + "entropy": 1.9352711230516433, + "epoch": 0.42177409542615657, + "grad_norm": 9.666674613952637, + "learning_rate": 3.89542030913555e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8422815144062042, + "num_tokens": 163665434.0, + "step": 136060 + }, + { + "entropy": 1.9193247556686401, + "epoch": 0.4218050945512063, + "grad_norm": 9.703880310058594, + "learning_rate": 3.895277165146748e-06, + "loss": 0.517, + "mean_token_accuracy": 0.8462235674262046, + "num_tokens": 163677012.0, + "step": 136070 + }, + { + "entropy": 1.9230276107788087, + "epoch": 0.42183609367625596, + "grad_norm": 9.869771003723145, + "learning_rate": 3.895134036937011e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8501724734902382, + "num_tokens": 163687569.0, + "step": 136080 + }, + { + "entropy": 1.9550448417663575, + "epoch": 0.4218670928013057, + "grad_norm": 8.047429084777832, + "learning_rate": 3.89499092450344e-06, + "loss": 0.5219, + "mean_token_accuracy": 0.843185929954052, + "num_tokens": 163698183.0, + "step": 136090 + }, + { + "entropy": 1.8611886352300644, + "epoch": 0.42189809192635536, + "grad_norm": 8.042689323425293, + "learning_rate": 3.894847827843135e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8485627174377441, + "num_tokens": 163710589.0, + "step": 136100 + }, + { + "entropy": 1.8729781568050385, + "epoch": 0.4219290910514051, + "grad_norm": 8.031086921691895, + "learning_rate": 3.894704746953201e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8670966506004334, + "num_tokens": 163722848.0, + "step": 136110 + }, + { + "entropy": 1.9010607331991196, + "epoch": 0.42196009017645475, + "grad_norm": 8.790847778320312, + "learning_rate": 3.894561681830741e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.85305155813694, + "num_tokens": 163734838.0, + "step": 136120 + }, + { + "entropy": 1.869620206952095, + "epoch": 0.4219910893015045, + "grad_norm": 7.369499206542969, + "learning_rate": 3.89441863247286e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8668390452861786, + "num_tokens": 163747691.0, + "step": 136130 + }, + { + "entropy": 1.8736939072608947, + "epoch": 0.42202208842655414, + "grad_norm": 8.191452026367188, + "learning_rate": 3.89427559887666e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8507094740867615, + "num_tokens": 163759715.0, + "step": 136140 + }, + { + "entropy": 1.891453741490841, + "epoch": 0.42205308755160387, + "grad_norm": 4.542218208312988, + "learning_rate": 3.89413258103925e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8499180749058723, + "num_tokens": 163772049.0, + "step": 136150 + }, + { + "entropy": 1.8985579848289489, + "epoch": 0.42208408667665354, + "grad_norm": 6.974556922912598, + "learning_rate": 3.8939895789577355e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.866258729994297, + "num_tokens": 163783799.0, + "step": 136160 + }, + { + "entropy": 1.8413588017225266, + "epoch": 0.42211508580170326, + "grad_norm": 3.5806539058685303, + "learning_rate": 3.893846592629224e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.856754244863987, + "num_tokens": 163796194.0, + "step": 136170 + }, + { + "entropy": 1.8396148562431336, + "epoch": 0.42214608492675293, + "grad_norm": 7.897609710693359, + "learning_rate": 3.893703622050822e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8517332747578621, + "num_tokens": 163808614.0, + "step": 136180 + }, + { + "entropy": 1.8847883358597755, + "epoch": 0.42217708405180265, + "grad_norm": 8.880624771118164, + "learning_rate": 3.89356066721964e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.848950457572937, + "num_tokens": 163820626.0, + "step": 136190 + }, + { + "entropy": 1.9313714131712914, + "epoch": 0.4222080831768523, + "grad_norm": 7.545229911804199, + "learning_rate": 3.893417728132786e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8502359837293625, + "num_tokens": 163832281.0, + "step": 136200 + }, + { + "entropy": 1.942906777560711, + "epoch": 0.42223908230190205, + "grad_norm": 8.471320152282715, + "learning_rate": 3.8932748047873715e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8459069982171059, + "num_tokens": 163843591.0, + "step": 136210 + }, + { + "entropy": 1.8104972153902055, + "epoch": 0.4222700814269517, + "grad_norm": 7.06924295425415, + "learning_rate": 3.893131897180506e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8590885251760483, + "num_tokens": 163856366.0, + "step": 136220 + }, + { + "entropy": 1.8111203506588935, + "epoch": 0.42230108055200144, + "grad_norm": 3.74467396736145, + "learning_rate": 3.892989005309303e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8676251381635666, + "num_tokens": 163870058.0, + "step": 136230 + }, + { + "entropy": 1.8452250599861144, + "epoch": 0.4223320796770511, + "grad_norm": 9.069093704223633, + "learning_rate": 3.892846129170875e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8483367681503295, + "num_tokens": 163883160.0, + "step": 136240 + }, + { + "entropy": 1.9414851903915404, + "epoch": 0.42236307880210083, + "grad_norm": 9.163724899291992, + "learning_rate": 3.892703268762333e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.8401763945817947, + "num_tokens": 163894584.0, + "step": 136250 + }, + { + "entropy": 1.8744617268443107, + "epoch": 0.4223940779271505, + "grad_norm": 4.958732604980469, + "learning_rate": 3.892560424080792e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8564569279551506, + "num_tokens": 163906968.0, + "step": 136260 + }, + { + "entropy": 1.9449343591928483, + "epoch": 0.4224250770522002, + "grad_norm": 7.974160671234131, + "learning_rate": 3.892417595123367e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8489760622382164, + "num_tokens": 163918188.0, + "step": 136270 + }, + { + "entropy": 1.8679928988218308, + "epoch": 0.4224560761772499, + "grad_norm": 9.126168251037598, + "learning_rate": 3.892274781887172e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8490209579467773, + "num_tokens": 163930662.0, + "step": 136280 + }, + { + "entropy": 1.8945604875683784, + "epoch": 0.4224870753022996, + "grad_norm": 7.6865363121032715, + "learning_rate": 3.892131984369326e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8592929482460022, + "num_tokens": 163942417.0, + "step": 136290 + }, + { + "entropy": 1.9043464064598083, + "epoch": 0.4225180744273493, + "grad_norm": 9.015311241149902, + "learning_rate": 3.891989202566944e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8410688266158104, + "num_tokens": 163955136.0, + "step": 136300 + }, + { + "entropy": 1.9317906364798545, + "epoch": 0.42254907355239896, + "grad_norm": 9.661110877990723, + "learning_rate": 3.891846436477144e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8499211475253106, + "num_tokens": 163966625.0, + "step": 136310 + }, + { + "entropy": 1.9347480967640878, + "epoch": 0.4225800726774487, + "grad_norm": 4.00605583190918, + "learning_rate": 3.891703686097043e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8386645168066025, + "num_tokens": 163978413.0, + "step": 136320 + }, + { + "entropy": 1.8491806223988534, + "epoch": 0.42261107180249835, + "grad_norm": 6.554616451263428, + "learning_rate": 3.891560951423763e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8566528141498566, + "num_tokens": 163990762.0, + "step": 136330 + }, + { + "entropy": 1.9310165911912918, + "epoch": 0.4226420709275481, + "grad_norm": 10.993410110473633, + "learning_rate": 3.891418232454421e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.8411554425954819, + "num_tokens": 164003208.0, + "step": 136340 + }, + { + "entropy": 1.8688613146543502, + "epoch": 0.42267307005259774, + "grad_norm": 4.234386920928955, + "learning_rate": 3.891275529186138e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8530001133680344, + "num_tokens": 164015901.0, + "step": 136350 + }, + { + "entropy": 1.8955225050449371, + "epoch": 0.42270406917764747, + "grad_norm": 7.75908899307251, + "learning_rate": 3.891132841616038e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8615424484014511, + "num_tokens": 164027911.0, + "step": 136360 + }, + { + "entropy": 1.90107059776783, + "epoch": 0.42273506830269714, + "grad_norm": 4.065981388092041, + "learning_rate": 3.890990169741239e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8597174286842346, + "num_tokens": 164039942.0, + "step": 136370 + }, + { + "entropy": 1.971032439172268, + "epoch": 0.42276606742774686, + "grad_norm": 8.14873218536377, + "learning_rate": 3.890847513558867e-06, + "loss": 0.5376, + "mean_token_accuracy": 0.8287710756063461, + "num_tokens": 164051132.0, + "step": 136380 + }, + { + "entropy": 1.933982428908348, + "epoch": 0.42279706655279653, + "grad_norm": 9.093382835388184, + "learning_rate": 3.890704873066045e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8538150131702423, + "num_tokens": 164063375.0, + "step": 136390 + }, + { + "entropy": 1.9692260310053826, + "epoch": 0.42282806567784625, + "grad_norm": 7.917427062988281, + "learning_rate": 3.890562248259897e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.842354828119278, + "num_tokens": 164074799.0, + "step": 136400 + }, + { + "entropy": 1.949564391374588, + "epoch": 0.4228590648028959, + "grad_norm": 8.609591484069824, + "learning_rate": 3.890419639137547e-06, + "loss": 0.5208, + "mean_token_accuracy": 0.8415541231632233, + "num_tokens": 164086636.0, + "step": 136410 + }, + { + "entropy": 1.8990302249789237, + "epoch": 0.42289006392794565, + "grad_norm": 8.789817810058594, + "learning_rate": 3.890277045696122e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8372734472155571, + "num_tokens": 164099437.0, + "step": 136420 + }, + { + "entropy": 1.9440916180610657, + "epoch": 0.4229210630529953, + "grad_norm": 8.302007675170898, + "learning_rate": 3.890134467932746e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8474587753415108, + "num_tokens": 164111134.0, + "step": 136430 + }, + { + "entropy": 1.9455933913588523, + "epoch": 0.42295206217804504, + "grad_norm": 9.274413108825684, + "learning_rate": 3.889991905844551e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.84911008477211, + "num_tokens": 164122818.0, + "step": 136440 + }, + { + "entropy": 1.8401254639029503, + "epoch": 0.4229830613030947, + "grad_norm": 7.885955810546875, + "learning_rate": 3.88984935942866e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8547663778066635, + "num_tokens": 164135328.0, + "step": 136450 + }, + { + "entropy": 1.8858144536614418, + "epoch": 0.42301406042814443, + "grad_norm": 7.3441338539123535, + "learning_rate": 3.8897068286822046e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8430177375674248, + "num_tokens": 164147713.0, + "step": 136460 + }, + { + "entropy": 1.9624179631471634, + "epoch": 0.4230450595531941, + "grad_norm": 7.776828289031982, + "learning_rate": 3.8895643136023146e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8557393714785576, + "num_tokens": 164159157.0, + "step": 136470 + }, + { + "entropy": 1.9207000091671944, + "epoch": 0.4230760586782438, + "grad_norm": 8.962823867797852, + "learning_rate": 3.889421814186118e-06, + "loss": 0.5688, + "mean_token_accuracy": 0.8365709602832794, + "num_tokens": 164170834.0, + "step": 136480 + }, + { + "entropy": 1.9185936331748963, + "epoch": 0.4231070578032935, + "grad_norm": 6.344710826873779, + "learning_rate": 3.889279330430746e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8482104346156121, + "num_tokens": 164183232.0, + "step": 136490 + }, + { + "entropy": 1.908946332335472, + "epoch": 0.4231380569283432, + "grad_norm": 7.712813854217529, + "learning_rate": 3.889136862333333e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8569162204861641, + "num_tokens": 164195812.0, + "step": 136500 + }, + { + "entropy": 1.9538255900144577, + "epoch": 0.4231690560533929, + "grad_norm": 8.661401748657227, + "learning_rate": 3.888994409891007e-06, + "loss": 0.5378, + "mean_token_accuracy": 0.8376554921269417, + "num_tokens": 164207031.0, + "step": 136510 + }, + { + "entropy": 1.9065537318587302, + "epoch": 0.4232000551784426, + "grad_norm": 3.5878145694732666, + "learning_rate": 3.8888519731009065e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8541804388165474, + "num_tokens": 164219113.0, + "step": 136520 + }, + { + "entropy": 1.9570256814360618, + "epoch": 0.4232310543034923, + "grad_norm": 11.070653915405273, + "learning_rate": 3.8887095519601594e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8492454007267952, + "num_tokens": 164230241.0, + "step": 136530 + }, + { + "entropy": 1.9088216736912726, + "epoch": 0.423262053428542, + "grad_norm": 5.85700798034668, + "learning_rate": 3.888567146465905e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8500758215785027, + "num_tokens": 164242548.0, + "step": 136540 + }, + { + "entropy": 1.9074143424630166, + "epoch": 0.4232930525535917, + "grad_norm": 7.5828938484191895, + "learning_rate": 3.888424756615277e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8519628167152404, + "num_tokens": 164254254.0, + "step": 136550 + }, + { + "entropy": 1.958798161149025, + "epoch": 0.42332405167864134, + "grad_norm": 7.560995101928711, + "learning_rate": 3.888282382405411e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8531105071306229, + "num_tokens": 164265550.0, + "step": 136560 + }, + { + "entropy": 1.9580877602100373, + "epoch": 0.42335505080369107, + "grad_norm": 7.767574787139893, + "learning_rate": 3.888140023833444e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8387601390480995, + "num_tokens": 164277156.0, + "step": 136570 + }, + { + "entropy": 1.9372137054800986, + "epoch": 0.42338604992874074, + "grad_norm": 7.815626621246338, + "learning_rate": 3.887997680896513e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8588192850351334, + "num_tokens": 164288771.0, + "step": 136580 + }, + { + "entropy": 1.8666774332523346, + "epoch": 0.42341704905379046, + "grad_norm": 11.0514554977417, + "learning_rate": 3.887855353591757e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8574911624193191, + "num_tokens": 164302139.0, + "step": 136590 + }, + { + "entropy": 1.8457847326993941, + "epoch": 0.42344804817884013, + "grad_norm": 3.481797695159912, + "learning_rate": 3.887713041916315e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8640109539031983, + "num_tokens": 164315100.0, + "step": 136600 + }, + { + "entropy": 1.8275635659694671, + "epoch": 0.42347904730388986, + "grad_norm": 7.928697109222412, + "learning_rate": 3.887570745867327e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8522553309798241, + "num_tokens": 164328806.0, + "step": 136610 + }, + { + "entropy": 1.945831647515297, + "epoch": 0.4235100464289395, + "grad_norm": 8.89621353149414, + "learning_rate": 3.887428465441934e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8539063900709152, + "num_tokens": 164340003.0, + "step": 136620 + }, + { + "entropy": 1.9486369535326957, + "epoch": 0.42354104555398925, + "grad_norm": 9.575989723205566, + "learning_rate": 3.8872862006372745e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8562672764062882, + "num_tokens": 164350643.0, + "step": 136630 + }, + { + "entropy": 1.8524845391511917, + "epoch": 0.4235720446790389, + "grad_norm": 6.633891582489014, + "learning_rate": 3.887143951450493e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8535214066505432, + "num_tokens": 164363583.0, + "step": 136640 + }, + { + "entropy": 1.833214531838894, + "epoch": 0.42360304380408864, + "grad_norm": 7.453033924102783, + "learning_rate": 3.887001717878731e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8589364722371101, + "num_tokens": 164377000.0, + "step": 136650 + }, + { + "entropy": 1.7080215141177177, + "epoch": 0.4236340429291383, + "grad_norm": 7.440715312957764, + "learning_rate": 3.886859499919133e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8675283506512642, + "num_tokens": 164390708.0, + "step": 136660 + }, + { + "entropy": 1.8073251724243165, + "epoch": 0.42366504205418803, + "grad_norm": 4.673215389251709, + "learning_rate": 3.886717297568841e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8596639856696129, + "num_tokens": 164404186.0, + "step": 136670 + }, + { + "entropy": 1.7819759905338288, + "epoch": 0.4236960411792377, + "grad_norm": 4.5533552169799805, + "learning_rate": 3.8865751108250015e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8706469982862473, + "num_tokens": 164418745.0, + "step": 136680 + }, + { + "entropy": 1.931021724641323, + "epoch": 0.42372704030428743, + "grad_norm": 7.136261463165283, + "learning_rate": 3.88643293968476e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.851832240819931, + "num_tokens": 164431143.0, + "step": 136690 + }, + { + "entropy": 1.9522256717085837, + "epoch": 0.4237580394293371, + "grad_norm": 6.650088787078857, + "learning_rate": 3.886290784145263e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.8387707456946373, + "num_tokens": 164442769.0, + "step": 136700 + }, + { + "entropy": 1.9686541602015495, + "epoch": 0.4237890385543868, + "grad_norm": 4.377057075500488, + "learning_rate": 3.886148644203657e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8511147424578667, + "num_tokens": 164454299.0, + "step": 136710 + }, + { + "entropy": 1.907273495197296, + "epoch": 0.4238200376794365, + "grad_norm": 9.434427261352539, + "learning_rate": 3.886006519857088e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8593376144766808, + "num_tokens": 164466567.0, + "step": 136720 + }, + { + "entropy": 1.8965537667274475, + "epoch": 0.4238510368044862, + "grad_norm": 7.356637477874756, + "learning_rate": 3.885864411102708e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8546531453728676, + "num_tokens": 164478726.0, + "step": 136730 + }, + { + "entropy": 1.8856528677046298, + "epoch": 0.4238820359295359, + "grad_norm": 9.142583847045898, + "learning_rate": 3.885722317937665e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8572150781750679, + "num_tokens": 164491306.0, + "step": 136740 + }, + { + "entropy": 1.8360870122909545, + "epoch": 0.4239130350545856, + "grad_norm": 9.105968475341797, + "learning_rate": 3.885580240359107e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.851330541074276, + "num_tokens": 164503683.0, + "step": 136750 + }, + { + "entropy": 1.926066693663597, + "epoch": 0.4239440341796353, + "grad_norm": 6.350192546844482, + "learning_rate": 3.885438178364187e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8608698025345802, + "num_tokens": 164515517.0, + "step": 136760 + }, + { + "entropy": 1.9915336534380912, + "epoch": 0.423975033304685, + "grad_norm": 7.947946548461914, + "learning_rate": 3.885296131950055e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8465639933943748, + "num_tokens": 164527079.0, + "step": 136770 + }, + { + "entropy": 1.9105818301439286, + "epoch": 0.42400603242973467, + "grad_norm": 9.564684867858887, + "learning_rate": 3.885154101113865e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8596938267350197, + "num_tokens": 164538814.0, + "step": 136780 + }, + { + "entropy": 1.9386409044265747, + "epoch": 0.4240370315547844, + "grad_norm": 10.250873565673828, + "learning_rate": 3.885012085852768e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.853357446193695, + "num_tokens": 164550954.0, + "step": 136790 + }, + { + "entropy": 1.9958611935377122, + "epoch": 0.42406803067983406, + "grad_norm": 8.585638999938965, + "learning_rate": 3.884870086163918e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.852650736272335, + "num_tokens": 164561994.0, + "step": 136800 + }, + { + "entropy": 1.8948075383901597, + "epoch": 0.42409902980488373, + "grad_norm": 8.89138412475586, + "learning_rate": 3.88472810204447e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8488554358482361, + "num_tokens": 164573952.0, + "step": 136810 + }, + { + "entropy": 1.9540094196796418, + "epoch": 0.42413002892993346, + "grad_norm": 7.017063140869141, + "learning_rate": 3.884586133491578e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8478411644697189, + "num_tokens": 164585607.0, + "step": 136820 + }, + { + "entropy": 1.8501681357622146, + "epoch": 0.4241610280549831, + "grad_norm": 8.608563423156738, + "learning_rate": 3.884444180502399e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8567686438560486, + "num_tokens": 164598434.0, + "step": 136830 + }, + { + "entropy": 1.9179615780711174, + "epoch": 0.42419202718003285, + "grad_norm": 7.781816482543945, + "learning_rate": 3.884302243074088e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8461335703730584, + "num_tokens": 164609861.0, + "step": 136840 + }, + { + "entropy": 1.9359031677246095, + "epoch": 0.4242230263050825, + "grad_norm": 9.121841430664062, + "learning_rate": 3.884160321203804e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8523103460669518, + "num_tokens": 164622151.0, + "step": 136850 + }, + { + "entropy": 1.923108348250389, + "epoch": 0.42425402543013224, + "grad_norm": 3.747391700744629, + "learning_rate": 3.884018414888704e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.856217896938324, + "num_tokens": 164633849.0, + "step": 136860 + }, + { + "entropy": 2.0000748217105864, + "epoch": 0.4242850245551819, + "grad_norm": 9.04723834991455, + "learning_rate": 3.883876524125946e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8461816415190697, + "num_tokens": 164645091.0, + "step": 136870 + }, + { + "entropy": 1.9932536423206328, + "epoch": 0.42431602368023164, + "grad_norm": 9.16733169555664, + "learning_rate": 3.883734648912692e-06, + "loss": 0.5414, + "mean_token_accuracy": 0.8346955180168152, + "num_tokens": 164655606.0, + "step": 136880 + }, + { + "entropy": 1.936814832687378, + "epoch": 0.4243470228052813, + "grad_norm": 7.927580833435059, + "learning_rate": 3.883592789246098e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.848675799369812, + "num_tokens": 164667161.0, + "step": 136890 + }, + { + "entropy": 1.9285539746284486, + "epoch": 0.42437802193033103, + "grad_norm": 3.780477523803711, + "learning_rate": 3.883450945123329e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8493298560380935, + "num_tokens": 164678581.0, + "step": 136900 + }, + { + "entropy": 1.8320277720689773, + "epoch": 0.4244090210553807, + "grad_norm": 9.713822364807129, + "learning_rate": 3.883309116541545e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8653510987758637, + "num_tokens": 164690881.0, + "step": 136910 + }, + { + "entropy": 1.9016039952635766, + "epoch": 0.4244400201804304, + "grad_norm": 8.08753776550293, + "learning_rate": 3.883167303497907e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8574401244521141, + "num_tokens": 164703031.0, + "step": 136920 + }, + { + "entropy": 1.8986100777983665, + "epoch": 0.4244710193054801, + "grad_norm": 3.745185375213623, + "learning_rate": 3.88302550598958e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8573842152953148, + "num_tokens": 164715311.0, + "step": 136930 + }, + { + "entropy": 1.9534535899758338, + "epoch": 0.4245020184305298, + "grad_norm": 10.045483589172363, + "learning_rate": 3.882883724013727e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8531460225582123, + "num_tokens": 164727139.0, + "step": 136940 + }, + { + "entropy": 1.931221941113472, + "epoch": 0.4245330175555795, + "grad_norm": 9.615496635437012, + "learning_rate": 3.882741957567513e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8572171211242676, + "num_tokens": 164738964.0, + "step": 136950 + }, + { + "entropy": 1.9113097980618476, + "epoch": 0.4245640166806292, + "grad_norm": 4.518940448760986, + "learning_rate": 3.882600206648101e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8442388892173767, + "num_tokens": 164750751.0, + "step": 136960 + }, + { + "entropy": 1.884780551493168, + "epoch": 0.4245950158056789, + "grad_norm": 10.810199737548828, + "learning_rate": 3.88245847125266e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8605505630373955, + "num_tokens": 164762720.0, + "step": 136970 + }, + { + "entropy": 1.8848048835992812, + "epoch": 0.4246260149307286, + "grad_norm": 7.640275478363037, + "learning_rate": 3.882316751378355e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8543965950608253, + "num_tokens": 164774673.0, + "step": 136980 + }, + { + "entropy": 1.9594868689775466, + "epoch": 0.42465701405577827, + "grad_norm": 8.8289213180542, + "learning_rate": 3.882175047022354e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8498617529869079, + "num_tokens": 164785684.0, + "step": 136990 + }, + { + "entropy": 1.9494032636284828, + "epoch": 0.424688013180828, + "grad_norm": 9.075950622558594, + "learning_rate": 3.8820333581818245e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8439081847667694, + "num_tokens": 164797327.0, + "step": 137000 + }, + { + "entropy": 1.8996917128562927, + "epoch": 0.42471901230587766, + "grad_norm": 9.116379737854004, + "learning_rate": 3.881891684853936e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8461435765028, + "num_tokens": 164809654.0, + "step": 137010 + }, + { + "entropy": 1.927696566283703, + "epoch": 0.4247500114309274, + "grad_norm": 8.284031867980957, + "learning_rate": 3.881750027035857e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8543187946081161, + "num_tokens": 164820690.0, + "step": 137020 + }, + { + "entropy": 1.773264393210411, + "epoch": 0.42478101055597706, + "grad_norm": 7.941432476043701, + "learning_rate": 3.88160838472476e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8707416027784347, + "num_tokens": 164834760.0, + "step": 137030 + }, + { + "entropy": 1.798407319188118, + "epoch": 0.4248120096810267, + "grad_norm": 3.9334545135498047, + "learning_rate": 3.881466757917814e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8698904514312744, + "num_tokens": 164848105.0, + "step": 137040 + }, + { + "entropy": 1.8838206291198731, + "epoch": 0.42484300880607645, + "grad_norm": 7.942371845245361, + "learning_rate": 3.8813251466121905e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8569322526454926, + "num_tokens": 164860645.0, + "step": 137050 + }, + { + "entropy": 1.9465039163827895, + "epoch": 0.4248740079311261, + "grad_norm": 9.43044662475586, + "learning_rate": 3.881183550805064e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8527957573533058, + "num_tokens": 164871767.0, + "step": 137060 + }, + { + "entropy": 1.948851892352104, + "epoch": 0.42490500705617584, + "grad_norm": 7.4881110191345215, + "learning_rate": 3.881041970493606e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8495731383562088, + "num_tokens": 164883389.0, + "step": 137070 + }, + { + "entropy": 1.7817671418190002, + "epoch": 0.4249360061812255, + "grad_norm": 3.924825429916382, + "learning_rate": 3.88090040567499e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8674560770392418, + "num_tokens": 164896540.0, + "step": 137080 + }, + { + "entropy": 1.9194593280553818, + "epoch": 0.42496700530627524, + "grad_norm": 7.824845790863037, + "learning_rate": 3.8807588563463924e-06, + "loss": 0.5412, + "mean_token_accuracy": 0.8526005268096923, + "num_tokens": 164908834.0, + "step": 137090 + }, + { + "entropy": 1.9632477343082428, + "epoch": 0.4249980044313249, + "grad_norm": 7.450333595275879, + "learning_rate": 3.880617322504987e-06, + "loss": 0.5201, + "mean_token_accuracy": 0.8478045627474785, + "num_tokens": 164919727.0, + "step": 137100 + }, + { + "entropy": 1.9021240234375, + "epoch": 0.42502900355637463, + "grad_norm": 8.733455657958984, + "learning_rate": 3.8804758041479515e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.853900471329689, + "num_tokens": 164931228.0, + "step": 137110 + }, + { + "entropy": 1.9155962690711021, + "epoch": 0.4250600026814243, + "grad_norm": 8.310453414916992, + "learning_rate": 3.880334301272461e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.8396357014775276, + "num_tokens": 164943673.0, + "step": 137120 + }, + { + "entropy": 1.8234332576394081, + "epoch": 0.425091001806474, + "grad_norm": 8.392141342163086, + "learning_rate": 3.880192813875693e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8463705331087112, + "num_tokens": 164957038.0, + "step": 137130 + }, + { + "entropy": 1.8326028779149055, + "epoch": 0.4251220009315237, + "grad_norm": 4.345094203948975, + "learning_rate": 3.880051341954828e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.870023638010025, + "num_tokens": 164970423.0, + "step": 137140 + }, + { + "entropy": 1.881044802069664, + "epoch": 0.4251530000565734, + "grad_norm": 8.87633991241455, + "learning_rate": 3.879909885507042e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8461710780858993, + "num_tokens": 164982236.0, + "step": 137150 + }, + { + "entropy": 1.8549317836761474, + "epoch": 0.4251839991816231, + "grad_norm": 9.388912200927734, + "learning_rate": 3.879768444529517e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8555863380432129, + "num_tokens": 164995126.0, + "step": 137160 + }, + { + "entropy": 1.8838744007050992, + "epoch": 0.4252149983066728, + "grad_norm": 7.678390979766846, + "learning_rate": 3.879627019019431e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8485071495175361, + "num_tokens": 165007465.0, + "step": 137170 + }, + { + "entropy": 1.9211634308099748, + "epoch": 0.4252459974317225, + "grad_norm": 8.58484172821045, + "learning_rate": 3.879485608973968e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8675820276141166, + "num_tokens": 165018031.0, + "step": 137180 + }, + { + "entropy": 1.7676224395632745, + "epoch": 0.4252769965567722, + "grad_norm": 8.151896476745605, + "learning_rate": 3.879344214390308e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8637456342577934, + "num_tokens": 165032409.0, + "step": 137190 + }, + { + "entropy": 1.883620023727417, + "epoch": 0.42530799568182187, + "grad_norm": 7.578486442565918, + "learning_rate": 3.879202835265633e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8562824308872223, + "num_tokens": 165044847.0, + "step": 137200 + }, + { + "entropy": 1.8436495438218117, + "epoch": 0.4253389948068716, + "grad_norm": 8.562501907348633, + "learning_rate": 3.879061471597127e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8575531423091889, + "num_tokens": 165058103.0, + "step": 137210 + }, + { + "entropy": 1.8444740131497384, + "epoch": 0.42536999393192126, + "grad_norm": 7.836605072021484, + "learning_rate": 3.878920123381976e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8577763974666596, + "num_tokens": 165070717.0, + "step": 137220 + }, + { + "entropy": 1.747816914319992, + "epoch": 0.425400993056971, + "grad_norm": 8.393543243408203, + "learning_rate": 3.878778790617362e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8659502550959587, + "num_tokens": 165085002.0, + "step": 137230 + }, + { + "entropy": 1.9642114371061326, + "epoch": 0.42543199218202066, + "grad_norm": 8.948190689086914, + "learning_rate": 3.87863747330047e-06, + "loss": 0.5287, + "mean_token_accuracy": 0.8416478574275971, + "num_tokens": 165095906.0, + "step": 137240 + }, + { + "entropy": 1.8498766794800758, + "epoch": 0.4254629913070704, + "grad_norm": 4.120171070098877, + "learning_rate": 3.8784961714284885e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8605809271335602, + "num_tokens": 165108454.0, + "step": 137250 + }, + { + "entropy": 1.9248084783554078, + "epoch": 0.42549399043212005, + "grad_norm": 3.3840432167053223, + "learning_rate": 3.878354884998603e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8541036680340767, + "num_tokens": 165120547.0, + "step": 137260 + }, + { + "entropy": 1.8695127993822098, + "epoch": 0.4255249895571698, + "grad_norm": 7.736443519592285, + "learning_rate": 3.878213614008001e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8537966668605804, + "num_tokens": 165132994.0, + "step": 137270 + }, + { + "entropy": 1.889861761033535, + "epoch": 0.42555598868221944, + "grad_norm": 9.705860137939453, + "learning_rate": 3.878072358453872e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8553769558668136, + "num_tokens": 165144856.0, + "step": 137280 + }, + { + "entropy": 1.8385322004556657, + "epoch": 0.4255869878072691, + "grad_norm": 4.066993236541748, + "learning_rate": 3.877931118333403e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8684134423732758, + "num_tokens": 165157887.0, + "step": 137290 + }, + { + "entropy": 1.9168552801012992, + "epoch": 0.42561798693231884, + "grad_norm": 7.822579383850098, + "learning_rate": 3.877789893643785e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8533562257885933, + "num_tokens": 165170433.0, + "step": 137300 + }, + { + "entropy": 1.8463567778468133, + "epoch": 0.4256489860573685, + "grad_norm": 7.959437847137451, + "learning_rate": 3.877648684382209e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8728025883436203, + "num_tokens": 165183224.0, + "step": 137310 + }, + { + "entropy": 1.896780176460743, + "epoch": 0.42567998518241823, + "grad_norm": 3.448885679244995, + "learning_rate": 3.877507490545866e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8589633658528328, + "num_tokens": 165194990.0, + "step": 137320 + }, + { + "entropy": 1.9273283243179322, + "epoch": 0.4257109843074679, + "grad_norm": 8.690948486328125, + "learning_rate": 3.877366312131946e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8396705284714698, + "num_tokens": 165207611.0, + "step": 137330 + }, + { + "entropy": 1.9326757207512855, + "epoch": 0.4257419834325176, + "grad_norm": 9.278722763061523, + "learning_rate": 3.877225149137642e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8490404456853866, + "num_tokens": 165219648.0, + "step": 137340 + }, + { + "entropy": 1.7946316838264464, + "epoch": 0.4257729825575673, + "grad_norm": 8.48944091796875, + "learning_rate": 3.877084001560149e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8692017003893853, + "num_tokens": 165232876.0, + "step": 137350 + }, + { + "entropy": 1.9322822287678718, + "epoch": 0.425803981682617, + "grad_norm": 7.394003391265869, + "learning_rate": 3.87694286939666e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.85526312738657, + "num_tokens": 165244530.0, + "step": 137360 + }, + { + "entropy": 1.8217709109187126, + "epoch": 0.4258349808076667, + "grad_norm": 8.290609359741211, + "learning_rate": 3.876801752644371e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8580706313252449, + "num_tokens": 165257526.0, + "step": 137370 + }, + { + "entropy": 1.8738437041640281, + "epoch": 0.4258659799327164, + "grad_norm": 8.288384437561035, + "learning_rate": 3.876660651300476e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8446152061223984, + "num_tokens": 165269243.0, + "step": 137380 + }, + { + "entropy": 1.8645242124795913, + "epoch": 0.4258969790577661, + "grad_norm": 7.856457710266113, + "learning_rate": 3.876519565362171e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8648913115262985, + "num_tokens": 165281328.0, + "step": 137390 + }, + { + "entropy": 1.8050186708569527, + "epoch": 0.4259279781828158, + "grad_norm": 6.527402400970459, + "learning_rate": 3.876378494826653e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8636491522192955, + "num_tokens": 165294695.0, + "step": 137400 + }, + { + "entropy": 1.88536017537117, + "epoch": 0.42595897730786547, + "grad_norm": 2.9641103744506836, + "learning_rate": 3.8762374396911215e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8582195937633514, + "num_tokens": 165307479.0, + "step": 137410 + }, + { + "entropy": 1.846780201792717, + "epoch": 0.4259899764329152, + "grad_norm": 5.190714359283447, + "learning_rate": 3.876096399952772e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8630301207304001, + "num_tokens": 165320970.0, + "step": 137420 + }, + { + "entropy": 1.9380200251936912, + "epoch": 0.42602097555796486, + "grad_norm": 7.597511291503906, + "learning_rate": 3.875955375608804e-06, + "loss": 0.5248, + "mean_token_accuracy": 0.8328088164329529, + "num_tokens": 165333314.0, + "step": 137430 + }, + { + "entropy": 1.9373356685042382, + "epoch": 0.4260519746830146, + "grad_norm": 7.61322021484375, + "learning_rate": 3.875814366656419e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8467313498258591, + "num_tokens": 165345441.0, + "step": 137440 + }, + { + "entropy": 1.849155631661415, + "epoch": 0.42608297380806426, + "grad_norm": 8.47956371307373, + "learning_rate": 3.875673373092818e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8476941645145416, + "num_tokens": 165358430.0, + "step": 137450 + }, + { + "entropy": 1.9534918010234832, + "epoch": 0.426113972933114, + "grad_norm": 7.988271236419678, + "learning_rate": 3.875532394915199e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8520032778382302, + "num_tokens": 165369269.0, + "step": 137460 + }, + { + "entropy": 1.9056456133723259, + "epoch": 0.42614497205816365, + "grad_norm": 9.272027969360352, + "learning_rate": 3.875391432120765e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8481805965304374, + "num_tokens": 165380985.0, + "step": 137470 + }, + { + "entropy": 1.939954537153244, + "epoch": 0.4261759711832134, + "grad_norm": 9.503613471984863, + "learning_rate": 3.87525048470672e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.8481825277209282, + "num_tokens": 165392220.0, + "step": 137480 + }, + { + "entropy": 1.9774063229560852, + "epoch": 0.42620697030826304, + "grad_norm": 8.539331436157227, + "learning_rate": 3.875109552670266e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8497935310006142, + "num_tokens": 165403622.0, + "step": 137490 + }, + { + "entropy": 1.9746006906032563, + "epoch": 0.42623796943331277, + "grad_norm": 7.517263412475586, + "learning_rate": 3.874968636008607e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8602147474884987, + "num_tokens": 165414792.0, + "step": 137500 + }, + { + "entropy": 1.9843586146831513, + "epoch": 0.42626896855836244, + "grad_norm": 9.263333320617676, + "learning_rate": 3.874827734718949e-06, + "loss": 0.5149, + "mean_token_accuracy": 0.8430792987346649, + "num_tokens": 165425288.0, + "step": 137510 + }, + { + "entropy": 1.8610298484563828, + "epoch": 0.42629996768341216, + "grad_norm": 8.659163475036621, + "learning_rate": 3.8746868487984955e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8577761441469193, + "num_tokens": 165437939.0, + "step": 137520 + }, + { + "entropy": 1.9919404417276383, + "epoch": 0.42633096680846183, + "grad_norm": 7.622340202331543, + "learning_rate": 3.874545978244454e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8462273955345154, + "num_tokens": 165449331.0, + "step": 137530 + }, + { + "entropy": 1.883239607512951, + "epoch": 0.4263619659335115, + "grad_norm": 8.540725708007812, + "learning_rate": 3.87440512305403e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8450183764100074, + "num_tokens": 165462052.0, + "step": 137540 + }, + { + "entropy": 1.761160995066166, + "epoch": 0.4263929650585612, + "grad_norm": 3.7234740257263184, + "learning_rate": 3.874264283224433e-06, + "loss": 0.341, + "mean_token_accuracy": 0.877360138297081, + "num_tokens": 165476174.0, + "step": 137550 + }, + { + "entropy": 1.8949896410107612, + "epoch": 0.4264239641836109, + "grad_norm": 4.375694274902344, + "learning_rate": 3.874123458752871e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8546592324972153, + "num_tokens": 165488036.0, + "step": 137560 + }, + { + "entropy": 1.970250654220581, + "epoch": 0.4264549633086606, + "grad_norm": 7.3062357902526855, + "learning_rate": 3.873982649636551e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.8475343957543373, + "num_tokens": 165498774.0, + "step": 137570 + }, + { + "entropy": 1.931829509139061, + "epoch": 0.4264859624337103, + "grad_norm": 8.701257705688477, + "learning_rate": 3.8738418558726845e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8460123270750046, + "num_tokens": 165510856.0, + "step": 137580 + }, + { + "entropy": 1.9778954088687897, + "epoch": 0.42651696155876, + "grad_norm": 10.640881538391113, + "learning_rate": 3.873701077458481e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8416032001376152, + "num_tokens": 165521684.0, + "step": 137590 + }, + { + "entropy": 1.871251691877842, + "epoch": 0.4265479606838097, + "grad_norm": 7.633983612060547, + "learning_rate": 3.8735603143911525e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8568643853068352, + "num_tokens": 165533887.0, + "step": 137600 + }, + { + "entropy": 1.9317373171448708, + "epoch": 0.4265789598088594, + "grad_norm": 7.2197465896606445, + "learning_rate": 3.873419566667911e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.845657354593277, + "num_tokens": 165545740.0, + "step": 137610 + }, + { + "entropy": 1.8812014043331147, + "epoch": 0.4266099589339091, + "grad_norm": 8.381542205810547, + "learning_rate": 3.873278834285967e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8589608117938041, + "num_tokens": 165557441.0, + "step": 137620 + }, + { + "entropy": 1.9113554194569589, + "epoch": 0.4266409580589588, + "grad_norm": 8.038044929504395, + "learning_rate": 3.8731381172425355e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8493774205446243, + "num_tokens": 165568993.0, + "step": 137630 + }, + { + "entropy": 1.891987682878971, + "epoch": 0.42667195718400847, + "grad_norm": 8.052876472473145, + "learning_rate": 3.87299741553483e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8627904951572418, + "num_tokens": 165580905.0, + "step": 137640 + }, + { + "entropy": 1.9532914757728577, + "epoch": 0.4267029563090582, + "grad_norm": 8.188140869140625, + "learning_rate": 3.872856729160065e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8446842655539513, + "num_tokens": 165591744.0, + "step": 137650 + }, + { + "entropy": 1.9147072404623031, + "epoch": 0.42673395543410786, + "grad_norm": 7.08876895904541, + "learning_rate": 3.872716058115456e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8530980065464974, + "num_tokens": 165603429.0, + "step": 137660 + }, + { + "entropy": 1.950710503757, + "epoch": 0.4267649545591576, + "grad_norm": 9.763349533081055, + "learning_rate": 3.87257540239822e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8458059638738632, + "num_tokens": 165614886.0, + "step": 137670 + }, + { + "entropy": 1.8922182023525238, + "epoch": 0.42679595368420725, + "grad_norm": 7.75042724609375, + "learning_rate": 3.872434762005572e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8595366209745408, + "num_tokens": 165626421.0, + "step": 137680 + }, + { + "entropy": 1.9189585834741592, + "epoch": 0.426826952809257, + "grad_norm": 5.677242755889893, + "learning_rate": 3.872294136934731e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8460530653595925, + "num_tokens": 165638484.0, + "step": 137690 + }, + { + "entropy": 1.8581395775079728, + "epoch": 0.42685795193430665, + "grad_norm": 8.89156723022461, + "learning_rate": 3.872153527182914e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.851922070980072, + "num_tokens": 165652365.0, + "step": 137700 + }, + { + "entropy": 1.9807763159275056, + "epoch": 0.42688895105935637, + "grad_norm": 9.592456817626953, + "learning_rate": 3.87201293274734e-06, + "loss": 0.5234, + "mean_token_accuracy": 0.8444359451532364, + "num_tokens": 165663039.0, + "step": 137710 + }, + { + "entropy": 1.9604480162262916, + "epoch": 0.42691995018440604, + "grad_norm": 6.907299518585205, + "learning_rate": 3.87187235362523e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8401239201426506, + "num_tokens": 165674987.0, + "step": 137720 + }, + { + "entropy": 1.8310967803001403, + "epoch": 0.42695094930945576, + "grad_norm": 7.537128925323486, + "learning_rate": 3.871731789813803e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8688787788152694, + "num_tokens": 165688212.0, + "step": 137730 + }, + { + "entropy": 1.8531201511621476, + "epoch": 0.42698194843450543, + "grad_norm": 3.874274253845215, + "learning_rate": 3.87159124131028e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8694923147559166, + "num_tokens": 165700648.0, + "step": 137740 + }, + { + "entropy": 1.8400910899043084, + "epoch": 0.42701294755955516, + "grad_norm": 8.50154972076416, + "learning_rate": 3.871450708111883e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8610082730650902, + "num_tokens": 165713968.0, + "step": 137750 + }, + { + "entropy": 1.9427133277058601, + "epoch": 0.4270439466846048, + "grad_norm": 9.070932388305664, + "learning_rate": 3.8713101902158354e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8499548882246017, + "num_tokens": 165725879.0, + "step": 137760 + }, + { + "entropy": 1.913887146115303, + "epoch": 0.42707494580965455, + "grad_norm": 7.201402187347412, + "learning_rate": 3.871169687619359e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8556391328573227, + "num_tokens": 165737902.0, + "step": 137770 + }, + { + "entropy": 1.8545256823301315, + "epoch": 0.4271059449347042, + "grad_norm": 7.537273406982422, + "learning_rate": 3.871029200319679e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8587723672389984, + "num_tokens": 165750054.0, + "step": 137780 + }, + { + "entropy": 1.938425388932228, + "epoch": 0.4271369440597539, + "grad_norm": 8.340550422668457, + "learning_rate": 3.870888728314018e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8487496972084045, + "num_tokens": 165760805.0, + "step": 137790 + }, + { + "entropy": 1.7333957627415657, + "epoch": 0.4271679431848036, + "grad_norm": 8.770150184631348, + "learning_rate": 3.870748271599602e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8617977604269982, + "num_tokens": 165774883.0, + "step": 137800 + }, + { + "entropy": 1.924593235552311, + "epoch": 0.4271989423098533, + "grad_norm": 4.3831400871276855, + "learning_rate": 3.870607830173659e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8520621985197068, + "num_tokens": 165786764.0, + "step": 137810 + }, + { + "entropy": 1.856243497133255, + "epoch": 0.427229941434903, + "grad_norm": 8.254294395446777, + "learning_rate": 3.8704674040334124e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8571957781910896, + "num_tokens": 165799501.0, + "step": 137820 + }, + { + "entropy": 1.9511059790849685, + "epoch": 0.4272609405599527, + "grad_norm": 7.887767314910889, + "learning_rate": 3.8703269931760905e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8668577119708061, + "num_tokens": 165810071.0, + "step": 137830 + }, + { + "entropy": 1.854556131362915, + "epoch": 0.4272919396850024, + "grad_norm": 7.153961658477783, + "learning_rate": 3.870186597598924e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8615784257650375, + "num_tokens": 165822533.0, + "step": 137840 + }, + { + "entropy": 1.8858709022402764, + "epoch": 0.42732293881005207, + "grad_norm": 3.6662893295288086, + "learning_rate": 3.870046217299139e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8626757502555847, + "num_tokens": 165834055.0, + "step": 137850 + }, + { + "entropy": 1.977617959678173, + "epoch": 0.4273539379351018, + "grad_norm": 10.866979598999023, + "learning_rate": 3.869905852273965e-06, + "loss": 0.5671, + "mean_token_accuracy": 0.8238250657916069, + "num_tokens": 165846081.0, + "step": 137860 + }, + { + "entropy": 1.808312650024891, + "epoch": 0.42738493706015146, + "grad_norm": 3.0479211807250977, + "learning_rate": 3.869765502520633e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8619560331106186, + "num_tokens": 165858762.0, + "step": 137870 + }, + { + "entropy": 1.966909298300743, + "epoch": 0.4274159361852012, + "grad_norm": 8.616495132446289, + "learning_rate": 3.869625168036374e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8432893455028534, + "num_tokens": 165869711.0, + "step": 137880 + }, + { + "entropy": 1.8776897192001343, + "epoch": 0.42744693531025085, + "grad_norm": 5.014652252197266, + "learning_rate": 3.8694848488184185e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8473319381475448, + "num_tokens": 165882325.0, + "step": 137890 + }, + { + "entropy": 1.9359253600239754, + "epoch": 0.4274779344353006, + "grad_norm": 9.926865577697754, + "learning_rate": 3.869344544864e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8564401522278786, + "num_tokens": 165893416.0, + "step": 137900 + }, + { + "entropy": 1.9416421443223952, + "epoch": 0.42750893356035025, + "grad_norm": 4.446148872375488, + "learning_rate": 3.869204256170351e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.8427053913474083, + "num_tokens": 165904568.0, + "step": 137910 + }, + { + "entropy": 1.8732975110411645, + "epoch": 0.42753993268539997, + "grad_norm": 7.7177019119262695, + "learning_rate": 3.869063982734706e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.860724626481533, + "num_tokens": 165916369.0, + "step": 137920 + }, + { + "entropy": 1.8843568712472916, + "epoch": 0.42757093181044964, + "grad_norm": 8.525490760803223, + "learning_rate": 3.868923724554298e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8500266253948212, + "num_tokens": 165928150.0, + "step": 137930 + }, + { + "entropy": 1.9407880812883378, + "epoch": 0.42760193093549936, + "grad_norm": 8.2413969039917, + "learning_rate": 3.868783481626363e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.850470757484436, + "num_tokens": 165938815.0, + "step": 137940 + }, + { + "entropy": 1.9635769337415696, + "epoch": 0.42763293006054903, + "grad_norm": 8.470434188842773, + "learning_rate": 3.868643253948136e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.853185373544693, + "num_tokens": 165949479.0, + "step": 137950 + }, + { + "entropy": 1.778677401691675, + "epoch": 0.42766392918559876, + "grad_norm": 9.556220054626465, + "learning_rate": 3.868503041516854e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8522502809762955, + "num_tokens": 165962742.0, + "step": 137960 + }, + { + "entropy": 1.9046275839209557, + "epoch": 0.4276949283106484, + "grad_norm": 8.775141716003418, + "learning_rate": 3.8683628443297554e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8469727069139481, + "num_tokens": 165974044.0, + "step": 137970 + }, + { + "entropy": 1.9093778878450394, + "epoch": 0.42772592743569815, + "grad_norm": 4.199886798858643, + "learning_rate": 3.868222662384076e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8507871389389038, + "num_tokens": 165985295.0, + "step": 137980 + }, + { + "entropy": 1.858989103138447, + "epoch": 0.4277569265607478, + "grad_norm": 8.536810874938965, + "learning_rate": 3.868082495677056e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8496241062879563, + "num_tokens": 165998258.0, + "step": 137990 + }, + { + "entropy": 1.8789437100291253, + "epoch": 0.42778792568579754, + "grad_norm": 8.451990127563477, + "learning_rate": 3.867942344205934e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8546844542026519, + "num_tokens": 166009620.0, + "step": 138000 + }, + { + "entropy": 1.8833610072731972, + "epoch": 0.4278189248108472, + "grad_norm": 7.35070276260376, + "learning_rate": 3.867802207967949e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8602321013808251, + "num_tokens": 166021208.0, + "step": 138010 + }, + { + "entropy": 1.8281296089291572, + "epoch": 0.42784992393589694, + "grad_norm": 7.972667694091797, + "learning_rate": 3.867662086960344e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8570877268910408, + "num_tokens": 166033804.0, + "step": 138020 + }, + { + "entropy": 1.7907851234078407, + "epoch": 0.4278809230609466, + "grad_norm": 8.16515827178955, + "learning_rate": 3.8675219811803586e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8523140147328376, + "num_tokens": 166047092.0, + "step": 138030 + }, + { + "entropy": 1.7590288013219832, + "epoch": 0.4279119221859963, + "grad_norm": 3.805769443511963, + "learning_rate": 3.8673818906252354e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8651872128248215, + "num_tokens": 166060694.0, + "step": 138040 + }, + { + "entropy": 1.8908022806048392, + "epoch": 0.427942921311046, + "grad_norm": 3.611936092376709, + "learning_rate": 3.867241815292218e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.851640222966671, + "num_tokens": 166072485.0, + "step": 138050 + }, + { + "entropy": 1.9297591403126717, + "epoch": 0.42797392043609567, + "grad_norm": 7.440158843994141, + "learning_rate": 3.867101755178548e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8443147346377373, + "num_tokens": 166083693.0, + "step": 138060 + }, + { + "entropy": 1.9145255535840988, + "epoch": 0.4280049195611454, + "grad_norm": 7.726010799407959, + "learning_rate": 3.866961710281471e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8665438875555992, + "num_tokens": 166095128.0, + "step": 138070 + }, + { + "entropy": 1.8834168136119842, + "epoch": 0.42803591868619506, + "grad_norm": 8.636183738708496, + "learning_rate": 3.866821680598232e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8588782355189324, + "num_tokens": 166107020.0, + "step": 138080 + }, + { + "entropy": 1.890410417318344, + "epoch": 0.4280669178112448, + "grad_norm": 9.407487869262695, + "learning_rate": 3.8666816661260766e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8558942392468453, + "num_tokens": 166118533.0, + "step": 138090 + }, + { + "entropy": 1.895900882035494, + "epoch": 0.42809791693629445, + "grad_norm": 9.499615669250488, + "learning_rate": 3.86654166686225e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8502351224422455, + "num_tokens": 166130282.0, + "step": 138100 + }, + { + "entropy": 1.899457646906376, + "epoch": 0.4281289160613442, + "grad_norm": 8.837594032287598, + "learning_rate": 3.866401682804001e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8482611507177353, + "num_tokens": 166142276.0, + "step": 138110 + }, + { + "entropy": 1.9217953234910965, + "epoch": 0.42815991518639385, + "grad_norm": 8.12231731414795, + "learning_rate": 3.866261713948576e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.847544614970684, + "num_tokens": 166154311.0, + "step": 138120 + }, + { + "entropy": 1.975175791978836, + "epoch": 0.42819091431144357, + "grad_norm": 7.083240509033203, + "learning_rate": 3.866121760293223e-06, + "loss": 0.5289, + "mean_token_accuracy": 0.8364548414945603, + "num_tokens": 166165093.0, + "step": 138130 + }, + { + "entropy": 1.9826736554503441, + "epoch": 0.42822191343649324, + "grad_norm": 9.184717178344727, + "learning_rate": 3.865981821835192e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8572745114564896, + "num_tokens": 166176186.0, + "step": 138140 + }, + { + "entropy": 1.8134810969233512, + "epoch": 0.42825291256154296, + "grad_norm": 9.861612319946289, + "learning_rate": 3.8658418985717325e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8668722435832024, + "num_tokens": 166188879.0, + "step": 138150 + }, + { + "entropy": 1.884695628285408, + "epoch": 0.42828391168659263, + "grad_norm": 7.422934532165527, + "learning_rate": 3.865701990500095e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8617542907595634, + "num_tokens": 166201719.0, + "step": 138160 + }, + { + "entropy": 1.9591299802064897, + "epoch": 0.42831491081164236, + "grad_norm": 7.2439165115356445, + "learning_rate": 3.865562097617531e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8428146690130234, + "num_tokens": 166213159.0, + "step": 138170 + }, + { + "entropy": 1.9952568769454957, + "epoch": 0.428345909936692, + "grad_norm": 8.97078800201416, + "learning_rate": 3.865422219921291e-06, + "loss": 0.51, + "mean_token_accuracy": 0.8497517094016075, + "num_tokens": 166224006.0, + "step": 138180 + }, + { + "entropy": 1.924196371436119, + "epoch": 0.42837690906174175, + "grad_norm": 7.9839186668396, + "learning_rate": 3.865282357408629e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.84932641685009, + "num_tokens": 166235138.0, + "step": 138190 + }, + { + "entropy": 1.8705265656113625, + "epoch": 0.4284079081867914, + "grad_norm": 8.166479110717773, + "learning_rate": 3.8651425100767995e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8603829473257065, + "num_tokens": 166246952.0, + "step": 138200 + }, + { + "entropy": 1.8786292031407357, + "epoch": 0.42843890731184114, + "grad_norm": 3.4489612579345703, + "learning_rate": 3.865002677923053e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8639238491654396, + "num_tokens": 166259522.0, + "step": 138210 + }, + { + "entropy": 1.924619671702385, + "epoch": 0.4284699064368908, + "grad_norm": 7.831303596496582, + "learning_rate": 3.864862860944646e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8498168155550957, + "num_tokens": 166270568.0, + "step": 138220 + }, + { + "entropy": 1.8376672357320785, + "epoch": 0.42850090556194054, + "grad_norm": 8.016936302185059, + "learning_rate": 3.864723059138835e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8555368795990944, + "num_tokens": 166284006.0, + "step": 138230 + }, + { + "entropy": 1.8461035266518593, + "epoch": 0.4285319046869902, + "grad_norm": 7.379615783691406, + "learning_rate": 3.864583272502873e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8619070097804069, + "num_tokens": 166296817.0, + "step": 138240 + }, + { + "entropy": 1.8854956120252608, + "epoch": 0.42856290381203993, + "grad_norm": 3.884890079498291, + "learning_rate": 3.86444350103402e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8470316350460052, + "num_tokens": 166309348.0, + "step": 138250 + }, + { + "entropy": 1.7654275074601173, + "epoch": 0.4285939029370896, + "grad_norm": 9.349808692932129, + "learning_rate": 3.864303744729532e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8671312019228935, + "num_tokens": 166322832.0, + "step": 138260 + }, + { + "entropy": 1.8022211775183679, + "epoch": 0.4286249020621393, + "grad_norm": 9.219120979309082, + "learning_rate": 3.864164003586666e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8593027204275131, + "num_tokens": 166335905.0, + "step": 138270 + }, + { + "entropy": 1.8638395503163339, + "epoch": 0.428655901187189, + "grad_norm": 7.9525628089904785, + "learning_rate": 3.864024277602683e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8568304657936097, + "num_tokens": 166348632.0, + "step": 138280 + }, + { + "entropy": 1.954586958885193, + "epoch": 0.42868690031223866, + "grad_norm": 4.062179088592529, + "learning_rate": 3.863884566774842e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8409469664096832, + "num_tokens": 166360316.0, + "step": 138290 + }, + { + "entropy": 1.9421510837972165, + "epoch": 0.4287178994372884, + "grad_norm": 2.5580577850341797, + "learning_rate": 3.863744871100402e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8459225296974182, + "num_tokens": 166372255.0, + "step": 138300 + }, + { + "entropy": 1.9440967753529548, + "epoch": 0.42874889856233805, + "grad_norm": 8.030793190002441, + "learning_rate": 3.863605190576625e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8478009730577469, + "num_tokens": 166383633.0, + "step": 138310 + }, + { + "entropy": 1.9596870571374894, + "epoch": 0.4287798976873878, + "grad_norm": 8.272663116455078, + "learning_rate": 3.863465525200771e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.855411772429943, + "num_tokens": 166393996.0, + "step": 138320 + }, + { + "entropy": 1.9576293110847474, + "epoch": 0.42881089681243745, + "grad_norm": 8.87380313873291, + "learning_rate": 3.863325874970105e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8526660829782486, + "num_tokens": 166405246.0, + "step": 138330 + }, + { + "entropy": 1.857802005112171, + "epoch": 0.42884189593748717, + "grad_norm": 7.251513957977295, + "learning_rate": 3.863186239881888e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8620542094111443, + "num_tokens": 166416790.0, + "step": 138340 + }, + { + "entropy": 1.888051538169384, + "epoch": 0.42887289506253684, + "grad_norm": 5.157383441925049, + "learning_rate": 3.863046619933384e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8537735804915428, + "num_tokens": 166429648.0, + "step": 138350 + }, + { + "entropy": 1.8693744093179703, + "epoch": 0.42890389418758657, + "grad_norm": 3.8320672512054443, + "learning_rate": 3.862907015121856e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8648984044790268, + "num_tokens": 166441105.0, + "step": 138360 + }, + { + "entropy": 1.993468850851059, + "epoch": 0.42893489331263623, + "grad_norm": 7.150916576385498, + "learning_rate": 3.8627674254445724e-06, + "loss": 0.5345, + "mean_token_accuracy": 0.8378593668341636, + "num_tokens": 166452361.0, + "step": 138370 + }, + { + "entropy": 1.9181072056293487, + "epoch": 0.42896589243768596, + "grad_norm": 7.089375972747803, + "learning_rate": 3.862627850898797e-06, + "loss": 0.5196, + "mean_token_accuracy": 0.8403525799512863, + "num_tokens": 166463855.0, + "step": 138380 + }, + { + "entropy": 1.967288474738598, + "epoch": 0.4289968915627356, + "grad_norm": 7.978488445281982, + "learning_rate": 3.8624882914817964e-06, + "loss": 0.5225, + "mean_token_accuracy": 0.838663375377655, + "num_tokens": 166475665.0, + "step": 138390 + }, + { + "entropy": 1.8494392395019532, + "epoch": 0.42902789068778535, + "grad_norm": 8.566108703613281, + "learning_rate": 3.8623487471908375e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.844138278067112, + "num_tokens": 166488310.0, + "step": 138400 + }, + { + "entropy": 1.8647007137537002, + "epoch": 0.429058889812835, + "grad_norm": 8.269538879394531, + "learning_rate": 3.862209218023188e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8606134533882142, + "num_tokens": 166500577.0, + "step": 138410 + }, + { + "entropy": 1.8854542568325996, + "epoch": 0.42908988893788474, + "grad_norm": 8.041147232055664, + "learning_rate": 3.862069703976116e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8451815471053123, + "num_tokens": 166512317.0, + "step": 138420 + }, + { + "entropy": 1.885179491341114, + "epoch": 0.4291208880629344, + "grad_norm": 7.563552379608154, + "learning_rate": 3.861930205046893e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8753857642412186, + "num_tokens": 166524119.0, + "step": 138430 + }, + { + "entropy": 1.8435839340090752, + "epoch": 0.42915188718798414, + "grad_norm": 3.8470942974090576, + "learning_rate": 3.861790721232786e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.865519005060196, + "num_tokens": 166536053.0, + "step": 138440 + }, + { + "entropy": 1.943047122657299, + "epoch": 0.4291828863130338, + "grad_norm": 7.736998558044434, + "learning_rate": 3.861651252531068e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8467626482248306, + "num_tokens": 166547902.0, + "step": 138450 + }, + { + "entropy": 1.9400530129671096, + "epoch": 0.42921388543808353, + "grad_norm": 7.892675399780273, + "learning_rate": 3.861511798939009e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8521466955542565, + "num_tokens": 166559133.0, + "step": 138460 + }, + { + "entropy": 1.8419177174568175, + "epoch": 0.4292448845631332, + "grad_norm": 8.258288383483887, + "learning_rate": 3.8613723604538814e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8595252349972725, + "num_tokens": 166571712.0, + "step": 138470 + }, + { + "entropy": 1.918247513473034, + "epoch": 0.4292758836881829, + "grad_norm": 8.431722640991211, + "learning_rate": 3.8612329370729566e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8501754224300384, + "num_tokens": 166583492.0, + "step": 138480 + }, + { + "entropy": 1.922251921892166, + "epoch": 0.4293068828132326, + "grad_norm": 7.291534423828125, + "learning_rate": 3.861093528793509e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8490281432867051, + "num_tokens": 166595760.0, + "step": 138490 + }, + { + "entropy": 1.9234865739941598, + "epoch": 0.4293378819382823, + "grad_norm": 8.550054550170898, + "learning_rate": 3.860954135612814e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8617073327302933, + "num_tokens": 166607418.0, + "step": 138500 + }, + { + "entropy": 1.9898644000291825, + "epoch": 0.429368881063332, + "grad_norm": 7.687295913696289, + "learning_rate": 3.860814757528143e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8465102508664131, + "num_tokens": 166618579.0, + "step": 138510 + }, + { + "entropy": 1.9373308807611465, + "epoch": 0.42939988018838166, + "grad_norm": 8.360345840454102, + "learning_rate": 3.8606753945367746e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.849955253303051, + "num_tokens": 166629584.0, + "step": 138520 + }, + { + "entropy": 1.8768581017851829, + "epoch": 0.4294308793134314, + "grad_norm": 8.298843383789062, + "learning_rate": 3.860536046635983e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8533497661352157, + "num_tokens": 166641699.0, + "step": 138530 + }, + { + "entropy": 1.8640162199735641, + "epoch": 0.42946187843848105, + "grad_norm": 8.439657211303711, + "learning_rate": 3.8603967138230456e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8546614408493042, + "num_tokens": 166654557.0, + "step": 138540 + }, + { + "entropy": 1.7630941659212112, + "epoch": 0.4294928775635308, + "grad_norm": 4.3905744552612305, + "learning_rate": 3.8602573960952405e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8672227919101715, + "num_tokens": 166669057.0, + "step": 138550 + }, + { + "entropy": 1.883773235976696, + "epoch": 0.42952387668858044, + "grad_norm": 8.244091987609863, + "learning_rate": 3.860118093449845e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8554739341139793, + "num_tokens": 166680914.0, + "step": 138560 + }, + { + "entropy": 1.9457030475139618, + "epoch": 0.42955487581363017, + "grad_norm": 9.337789535522461, + "learning_rate": 3.8599788058841385e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8439801335334778, + "num_tokens": 166692421.0, + "step": 138570 + }, + { + "entropy": 1.8535506889224052, + "epoch": 0.42958587493867983, + "grad_norm": 10.059792518615723, + "learning_rate": 3.859839533395399e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8560854658484459, + "num_tokens": 166704599.0, + "step": 138580 + }, + { + "entropy": 1.9137897729873656, + "epoch": 0.42961687406372956, + "grad_norm": 8.756423950195312, + "learning_rate": 3.859700275980909e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8534789383411407, + "num_tokens": 166716671.0, + "step": 138590 + }, + { + "entropy": 1.8182189181447028, + "epoch": 0.42964787318877923, + "grad_norm": 4.194288730621338, + "learning_rate": 3.859561033637948e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8575180351734162, + "num_tokens": 166729546.0, + "step": 138600 + }, + { + "entropy": 1.9256513655185699, + "epoch": 0.42967887231382895, + "grad_norm": 8.066496849060059, + "learning_rate": 3.859421806363798e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8577398300170899, + "num_tokens": 166741327.0, + "step": 138610 + }, + { + "entropy": 1.9436537489295005, + "epoch": 0.4297098714388786, + "grad_norm": 8.44263744354248, + "learning_rate": 3.859282594155741e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8500909224152565, + "num_tokens": 166752956.0, + "step": 138620 + }, + { + "entropy": 1.8246969357132912, + "epoch": 0.42974087056392835, + "grad_norm": 7.600648880004883, + "learning_rate": 3.859143397011061e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8686912745237351, + "num_tokens": 166766150.0, + "step": 138630 + }, + { + "entropy": 1.8569831773638725, + "epoch": 0.429771869688978, + "grad_norm": 3.839282989501953, + "learning_rate": 3.85900421492704e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8597505047917366, + "num_tokens": 166779296.0, + "step": 138640 + }, + { + "entropy": 1.9191448912024498, + "epoch": 0.42980286881402774, + "grad_norm": 8.564552307128906, + "learning_rate": 3.858865047900964e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8487473547458648, + "num_tokens": 166790635.0, + "step": 138650 + }, + { + "entropy": 1.9488032266497612, + "epoch": 0.4298338679390774, + "grad_norm": 7.2074360847473145, + "learning_rate": 3.858725895930116e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8469724044203758, + "num_tokens": 166802382.0, + "step": 138660 + }, + { + "entropy": 1.9374908640980721, + "epoch": 0.42986486706412713, + "grad_norm": 8.300436973571777, + "learning_rate": 3.8585867590117845e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8558620125055313, + "num_tokens": 166813964.0, + "step": 138670 + }, + { + "entropy": 1.9663563668727875, + "epoch": 0.4298958661891768, + "grad_norm": 9.31306266784668, + "learning_rate": 3.858447637143254e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8509336993098259, + "num_tokens": 166825330.0, + "step": 138680 + }, + { + "entropy": 1.944647277891636, + "epoch": 0.4299268653142265, + "grad_norm": 9.174337387084961, + "learning_rate": 3.858308530321811e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8518739119172096, + "num_tokens": 166836729.0, + "step": 138690 + }, + { + "entropy": 1.9004219979047776, + "epoch": 0.4299578644392762, + "grad_norm": 9.097691535949707, + "learning_rate": 3.858169438544745e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8654677256941795, + "num_tokens": 166848221.0, + "step": 138700 + }, + { + "entropy": 1.905510926246643, + "epoch": 0.4299888635643259, + "grad_norm": 7.829078674316406, + "learning_rate": 3.858030361809342e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8583436101675034, + "num_tokens": 166860834.0, + "step": 138710 + }, + { + "entropy": 2.0032723784446715, + "epoch": 0.4300198626893756, + "grad_norm": 9.5745267868042, + "learning_rate": 3.857891300112894e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.845015361905098, + "num_tokens": 166871860.0, + "step": 138720 + }, + { + "entropy": 1.9505044639110565, + "epoch": 0.4300508618144253, + "grad_norm": 8.139888763427734, + "learning_rate": 3.85775225345269e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8441401451826096, + "num_tokens": 166883775.0, + "step": 138730 + }, + { + "entropy": 1.996666456758976, + "epoch": 0.430081860939475, + "grad_norm": 8.10516357421875, + "learning_rate": 3.857613221826021e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8445516094565392, + "num_tokens": 166894824.0, + "step": 138740 + }, + { + "entropy": 1.9581088334321977, + "epoch": 0.4301128600645247, + "grad_norm": 8.987552642822266, + "learning_rate": 3.8574742052301755e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8530200630426407, + "num_tokens": 166906284.0, + "step": 138750 + }, + { + "entropy": 1.8697850778698921, + "epoch": 0.4301438591895744, + "grad_norm": 7.841629981994629, + "learning_rate": 3.857335203662448e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8616936087608338, + "num_tokens": 166919062.0, + "step": 138760 + }, + { + "entropy": 1.8914014741778373, + "epoch": 0.43017485831462404, + "grad_norm": 3.872026205062866, + "learning_rate": 3.85719621712013e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8455238342285156, + "num_tokens": 166930561.0, + "step": 138770 + }, + { + "entropy": 1.8692807108163834, + "epoch": 0.43020585743967377, + "grad_norm": 2.8080806732177734, + "learning_rate": 3.857057245600515e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8588801845908165, + "num_tokens": 166944394.0, + "step": 138780 + }, + { + "entropy": 1.8834291696548462, + "epoch": 0.43023685656472344, + "grad_norm": 8.107061386108398, + "learning_rate": 3.856918289100897e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8372646197676659, + "num_tokens": 166957123.0, + "step": 138790 + }, + { + "entropy": 1.9186320677399635, + "epoch": 0.43026785568977316, + "grad_norm": 6.207789421081543, + "learning_rate": 3.856779347618569e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8523793607950211, + "num_tokens": 166969149.0, + "step": 138800 + }, + { + "entropy": 1.9312793225049973, + "epoch": 0.43029885481482283, + "grad_norm": 9.400903701782227, + "learning_rate": 3.85664042115083e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8667648211121559, + "num_tokens": 166981732.0, + "step": 138810 + }, + { + "entropy": 1.7980728089809417, + "epoch": 0.43032985393987255, + "grad_norm": 5.016563415527344, + "learning_rate": 3.8565015096949724e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8642199486494064, + "num_tokens": 166995822.0, + "step": 138820 + }, + { + "entropy": 1.8924931958317757, + "epoch": 0.4303608530649222, + "grad_norm": 8.750584602355957, + "learning_rate": 3.856362613248295e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8461324706673622, + "num_tokens": 167008749.0, + "step": 138830 + }, + { + "entropy": 1.840745933353901, + "epoch": 0.43039185218997195, + "grad_norm": 10.046074867248535, + "learning_rate": 3.856223731808094e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8469096630811691, + "num_tokens": 167022329.0, + "step": 138840 + }, + { + "entropy": 1.8810099840164185, + "epoch": 0.4304228513150216, + "grad_norm": 8.733558654785156, + "learning_rate": 3.856084865371667e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8611682400107383, + "num_tokens": 167035334.0, + "step": 138850 + }, + { + "entropy": 1.8504212602972985, + "epoch": 0.43045385044007134, + "grad_norm": 2.9879043102264404, + "learning_rate": 3.855946013936313e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8579250067472458, + "num_tokens": 167048594.0, + "step": 138860 + }, + { + "entropy": 1.9671182319521905, + "epoch": 0.430484849565121, + "grad_norm": 8.825550079345703, + "learning_rate": 3.855807177499334e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8505840554833413, + "num_tokens": 167060152.0, + "step": 138870 + }, + { + "entropy": 1.8691889211535453, + "epoch": 0.43051584869017073, + "grad_norm": 7.807781219482422, + "learning_rate": 3.855668356058026e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8700853690505028, + "num_tokens": 167072813.0, + "step": 138880 + }, + { + "entropy": 1.89640132188797, + "epoch": 0.4305468478152204, + "grad_norm": 7.267297267913818, + "learning_rate": 3.855529549609692e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8511320620775222, + "num_tokens": 167084361.0, + "step": 138890 + }, + { + "entropy": 1.8999717831611633, + "epoch": 0.4305778469402701, + "grad_norm": 8.101717948913574, + "learning_rate": 3.855390758151633e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8580278515815735, + "num_tokens": 167096540.0, + "step": 138900 + }, + { + "entropy": 1.9178320989012718, + "epoch": 0.4306088460653198, + "grad_norm": 6.982521057128906, + "learning_rate": 3.855251981681151e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8600656524300575, + "num_tokens": 167107872.0, + "step": 138910 + }, + { + "entropy": 1.9452101722359658, + "epoch": 0.4306398451903695, + "grad_norm": 10.741822242736816, + "learning_rate": 3.855113220195549e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8533175542950631, + "num_tokens": 167119118.0, + "step": 138920 + }, + { + "entropy": 1.893312358856201, + "epoch": 0.4306708443154192, + "grad_norm": 8.916707992553711, + "learning_rate": 3.8549744736921305e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8634658083319664, + "num_tokens": 167130595.0, + "step": 138930 + }, + { + "entropy": 1.8818313643336295, + "epoch": 0.4307018434404689, + "grad_norm": 9.812067985534668, + "learning_rate": 3.854835742168199e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8561764359474182, + "num_tokens": 167143490.0, + "step": 138940 + }, + { + "entropy": 1.8934076741337775, + "epoch": 0.4307328425655186, + "grad_norm": 6.764780521392822, + "learning_rate": 3.85469702562106e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8511760547757149, + "num_tokens": 167156082.0, + "step": 138950 + }, + { + "entropy": 1.983777368068695, + "epoch": 0.4307638416905683, + "grad_norm": 7.675331115722656, + "learning_rate": 3.854558324048018e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8494358032941818, + "num_tokens": 167166878.0, + "step": 138960 + }, + { + "entropy": 1.8882510006427764, + "epoch": 0.430794840815618, + "grad_norm": 3.8300962448120117, + "learning_rate": 3.85441963744638e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8561283707618713, + "num_tokens": 167178890.0, + "step": 138970 + }, + { + "entropy": 1.9575580522418021, + "epoch": 0.4308258399406677, + "grad_norm": 10.387866020202637, + "learning_rate": 3.854280965813453e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.843521349132061, + "num_tokens": 167190173.0, + "step": 138980 + }, + { + "entropy": 1.9261010527610778, + "epoch": 0.43085683906571737, + "grad_norm": 7.206926345825195, + "learning_rate": 3.8541423091465445e-06, + "loss": 0.461, + "mean_token_accuracy": 0.856898583471775, + "num_tokens": 167201201.0, + "step": 138990 + }, + { + "entropy": 1.871525762975216, + "epoch": 0.4308878381907671, + "grad_norm": 7.605156898498535, + "learning_rate": 3.854003667442962e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8476424932479858, + "num_tokens": 167213579.0, + "step": 139000 + }, + { + "entropy": 1.9758980587124824, + "epoch": 0.43091883731581676, + "grad_norm": 8.815040588378906, + "learning_rate": 3.853865040700016e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8458743005990982, + "num_tokens": 167224666.0, + "step": 139010 + }, + { + "entropy": 1.883438839018345, + "epoch": 0.43094983644086643, + "grad_norm": 7.926342010498047, + "learning_rate": 3.853726428915014e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8550115719437599, + "num_tokens": 167237308.0, + "step": 139020 + }, + { + "entropy": 1.984131459891796, + "epoch": 0.43098083556591615, + "grad_norm": 7.568789958953857, + "learning_rate": 3.853587832085266e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.8429438158869743, + "num_tokens": 167248949.0, + "step": 139030 + }, + { + "entropy": 1.8531333744525909, + "epoch": 0.4310118346909658, + "grad_norm": 4.8112359046936035, + "learning_rate": 3.8534492502080855e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8603510394692421, + "num_tokens": 167261627.0, + "step": 139040 + }, + { + "entropy": 1.923319575190544, + "epoch": 0.43104283381601555, + "grad_norm": 7.872719764709473, + "learning_rate": 3.853310683280783e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8496645122766495, + "num_tokens": 167272866.0, + "step": 139050 + }, + { + "entropy": 1.9188224956393243, + "epoch": 0.4310738329410652, + "grad_norm": 8.1365327835083, + "learning_rate": 3.853172131300669e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8515357092022896, + "num_tokens": 167284947.0, + "step": 139060 + }, + { + "entropy": 1.8909332856535912, + "epoch": 0.43110483206611494, + "grad_norm": 9.189377784729004, + "learning_rate": 3.8530335942650585e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.853959108889103, + "num_tokens": 167297259.0, + "step": 139070 + }, + { + "entropy": 1.8885620832443237, + "epoch": 0.4311358311911646, + "grad_norm": 9.13309097290039, + "learning_rate": 3.852895072171265e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8578508168458938, + "num_tokens": 167308954.0, + "step": 139080 + }, + { + "entropy": 1.8412837103009223, + "epoch": 0.43116683031621433, + "grad_norm": 4.5541090965271, + "learning_rate": 3.8527565650166015e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8527804985642433, + "num_tokens": 167321410.0, + "step": 139090 + }, + { + "entropy": 1.9028522804379464, + "epoch": 0.431197829441264, + "grad_norm": 9.414676666259766, + "learning_rate": 3.852618072798383e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8578800424933434, + "num_tokens": 167333571.0, + "step": 139100 + }, + { + "entropy": 1.8518059805035592, + "epoch": 0.4312288285663137, + "grad_norm": 3.911571741104126, + "learning_rate": 3.852479595513928e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8502576932311058, + "num_tokens": 167346630.0, + "step": 139110 + }, + { + "entropy": 1.9005075827240945, + "epoch": 0.4312598276913634, + "grad_norm": 7.447115898132324, + "learning_rate": 3.852341133160549e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8436521142721176, + "num_tokens": 167359023.0, + "step": 139120 + }, + { + "entropy": 1.955150267481804, + "epoch": 0.4312908268164131, + "grad_norm": 8.531542778015137, + "learning_rate": 3.8522026857355656e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.8467582687735558, + "num_tokens": 167370225.0, + "step": 139130 + }, + { + "entropy": 1.912473227083683, + "epoch": 0.4313218259414628, + "grad_norm": 7.494203567504883, + "learning_rate": 3.852064253236293e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8552014127373695, + "num_tokens": 167382204.0, + "step": 139140 + }, + { + "entropy": 1.929689645767212, + "epoch": 0.4313528250665125, + "grad_norm": 8.015472412109375, + "learning_rate": 3.851925835660052e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8602582827210427, + "num_tokens": 167393606.0, + "step": 139150 + }, + { + "entropy": 1.93514022231102, + "epoch": 0.4313838241915622, + "grad_norm": 6.807281017303467, + "learning_rate": 3.851787433004161e-06, + "loss": 0.456, + "mean_token_accuracy": 0.855001138150692, + "num_tokens": 167404837.0, + "step": 139160 + }, + { + "entropy": 1.8539175868034363, + "epoch": 0.4314148233166119, + "grad_norm": 10.743212699890137, + "learning_rate": 3.851649045265939e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8622266680002213, + "num_tokens": 167416637.0, + "step": 139170 + }, + { + "entropy": 1.9064085155725479, + "epoch": 0.4314458224416616, + "grad_norm": 8.57366943359375, + "learning_rate": 3.851510672442708e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8500051259994507, + "num_tokens": 167428580.0, + "step": 139180 + }, + { + "entropy": 1.8125137344002724, + "epoch": 0.4314768215667113, + "grad_norm": 8.313339233398438, + "learning_rate": 3.851372314531787e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8584845051169395, + "num_tokens": 167441286.0, + "step": 139190 + }, + { + "entropy": 1.9030530989170074, + "epoch": 0.43150782069176097, + "grad_norm": 4.577101707458496, + "learning_rate": 3.851233971530498e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8593366608023644, + "num_tokens": 167453514.0, + "step": 139200 + }, + { + "entropy": 1.9610344976186753, + "epoch": 0.4315388198168107, + "grad_norm": 8.109823226928711, + "learning_rate": 3.851095643436164e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.8386234149336815, + "num_tokens": 167464843.0, + "step": 139210 + }, + { + "entropy": 1.9184462070465087, + "epoch": 0.43156981894186036, + "grad_norm": 4.524685859680176, + "learning_rate": 3.850957330246109e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8548565566539764, + "num_tokens": 167476607.0, + "step": 139220 + }, + { + "entropy": 1.8219572558999062, + "epoch": 0.4316008180669101, + "grad_norm": 7.470212936401367, + "learning_rate": 3.850819031957655e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8599913343787193, + "num_tokens": 167489375.0, + "step": 139230 + }, + { + "entropy": 1.878198716044426, + "epoch": 0.43163181719195975, + "grad_norm": 7.494040489196777, + "learning_rate": 3.850680748568127e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8682243391871453, + "num_tokens": 167501096.0, + "step": 139240 + }, + { + "entropy": 1.882466796040535, + "epoch": 0.4316628163170095, + "grad_norm": 3.8041422367095947, + "learning_rate": 3.85054248007485e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8378754198551178, + "num_tokens": 167513496.0, + "step": 139250 + }, + { + "entropy": 1.9623596340417861, + "epoch": 0.43169381544205915, + "grad_norm": 9.18985366821289, + "learning_rate": 3.850404226475151e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8428544372320175, + "num_tokens": 167523826.0, + "step": 139260 + }, + { + "entropy": 1.8917414352297783, + "epoch": 0.4317248145671088, + "grad_norm": 9.102961540222168, + "learning_rate": 3.850265987766354e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8560737207531929, + "num_tokens": 167535560.0, + "step": 139270 + }, + { + "entropy": 1.9341025441884994, + "epoch": 0.43175581369215854, + "grad_norm": 9.072357177734375, + "learning_rate": 3.850127763945788e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8465018764138221, + "num_tokens": 167546927.0, + "step": 139280 + }, + { + "entropy": 1.7552316695451737, + "epoch": 0.4317868128172082, + "grad_norm": 6.688268661499023, + "learning_rate": 3.849989555010781e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8682321265339852, + "num_tokens": 167560862.0, + "step": 139290 + }, + { + "entropy": 1.9611527785658835, + "epoch": 0.43181781194225793, + "grad_norm": 3.5315842628479004, + "learning_rate": 3.84985136095866e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8544865190982819, + "num_tokens": 167572247.0, + "step": 139300 + }, + { + "entropy": 1.9224155962467193, + "epoch": 0.4318488110673076, + "grad_norm": 8.508109092712402, + "learning_rate": 3.849713181786755e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8514292508363723, + "num_tokens": 167583263.0, + "step": 139310 + }, + { + "entropy": 1.910231950879097, + "epoch": 0.4318798101923573, + "grad_norm": 9.595215797424316, + "learning_rate": 3.849575017492395e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8502141878008842, + "num_tokens": 167595159.0, + "step": 139320 + }, + { + "entropy": 1.9836928397417068, + "epoch": 0.431910809317407, + "grad_norm": 8.529363632202148, + "learning_rate": 3.849436868072912e-06, + "loss": 0.5398, + "mean_token_accuracy": 0.8343594878911972, + "num_tokens": 167606457.0, + "step": 139330 + }, + { + "entropy": 1.8075464725494386, + "epoch": 0.4319418084424567, + "grad_norm": 7.387181282043457, + "learning_rate": 3.849298733525636e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8693550243973732, + "num_tokens": 167619065.0, + "step": 139340 + }, + { + "entropy": 1.8776528060436248, + "epoch": 0.4319728075675064, + "grad_norm": 7.418018341064453, + "learning_rate": 3.849160613847898e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8570902019739151, + "num_tokens": 167631388.0, + "step": 139350 + }, + { + "entropy": 1.8456250533461571, + "epoch": 0.4320038066925561, + "grad_norm": 6.901108741760254, + "learning_rate": 3.849022509037032e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8538346603512764, + "num_tokens": 167643724.0, + "step": 139360 + }, + { + "entropy": 1.8757693514227867, + "epoch": 0.4320348058176058, + "grad_norm": 7.9772210121154785, + "learning_rate": 3.848884419090371e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8559473112225533, + "num_tokens": 167657019.0, + "step": 139370 + }, + { + "entropy": 1.8769934855401516, + "epoch": 0.4320658049426555, + "grad_norm": 7.6656413078308105, + "learning_rate": 3.848746344005249e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8610425606369972, + "num_tokens": 167669569.0, + "step": 139380 + }, + { + "entropy": 1.8971701174974442, + "epoch": 0.4320968040677052, + "grad_norm": 8.26727294921875, + "learning_rate": 3.848608283778998e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8484355866909027, + "num_tokens": 167681405.0, + "step": 139390 + }, + { + "entropy": 1.9876887768507003, + "epoch": 0.4321278031927549, + "grad_norm": 7.842116832733154, + "learning_rate": 3.848470238408956e-06, + "loss": 0.5748, + "mean_token_accuracy": 0.8318600550293922, + "num_tokens": 167692558.0, + "step": 139400 + }, + { + "entropy": 1.8590367823839187, + "epoch": 0.43215880231780457, + "grad_norm": 8.572957038879395, + "learning_rate": 3.848332207892458e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8467065960168838, + "num_tokens": 167705077.0, + "step": 139410 + }, + { + "entropy": 1.9348689168691635, + "epoch": 0.4321898014428543, + "grad_norm": 7.435234069824219, + "learning_rate": 3.84819419222684e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8602284207940102, + "num_tokens": 167716206.0, + "step": 139420 + }, + { + "entropy": 1.9003080978989602, + "epoch": 0.43222080056790396, + "grad_norm": 3.873739242553711, + "learning_rate": 3.848056191409439e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8429651230573654, + "num_tokens": 167728103.0, + "step": 139430 + }, + { + "entropy": 1.9454911321401596, + "epoch": 0.4322517996929537, + "grad_norm": 7.4276018142700195, + "learning_rate": 3.847918205437593e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.844630342721939, + "num_tokens": 167738632.0, + "step": 139440 + }, + { + "entropy": 1.8453918382525445, + "epoch": 0.43228279881800336, + "grad_norm": 8.798940658569336, + "learning_rate": 3.847780234308641e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8564523667097091, + "num_tokens": 167750964.0, + "step": 139450 + }, + { + "entropy": 1.8049393191933631, + "epoch": 0.4323137979430531, + "grad_norm": 9.362977027893066, + "learning_rate": 3.847642278019923e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8607902884483337, + "num_tokens": 167763833.0, + "step": 139460 + }, + { + "entropy": 1.8966955333948134, + "epoch": 0.43234479706810275, + "grad_norm": 7.66609001159668, + "learning_rate": 3.8475043365687755e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8480794101953506, + "num_tokens": 167776065.0, + "step": 139470 + }, + { + "entropy": 1.8854218780994416, + "epoch": 0.4323757961931525, + "grad_norm": 8.28154468536377, + "learning_rate": 3.847366409952543e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8489232257008552, + "num_tokens": 167788059.0, + "step": 139480 + }, + { + "entropy": 1.8077181428670883, + "epoch": 0.43240679531820214, + "grad_norm": 3.7770919799804688, + "learning_rate": 3.847228498168565e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8708108901977539, + "num_tokens": 167801348.0, + "step": 139490 + }, + { + "entropy": 1.8897404298186302, + "epoch": 0.43243779444325187, + "grad_norm": 7.708387851715088, + "learning_rate": 3.8470906012141825e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8538424223661423, + "num_tokens": 167814120.0, + "step": 139500 + }, + { + "entropy": 1.834130634367466, + "epoch": 0.43246879356830153, + "grad_norm": 6.732454299926758, + "learning_rate": 3.846952719086739e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8624187022447586, + "num_tokens": 167826074.0, + "step": 139510 + }, + { + "entropy": 1.8972522467374802, + "epoch": 0.4324997926933512, + "grad_norm": 9.73922061920166, + "learning_rate": 3.8468148517835766e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8530626803636551, + "num_tokens": 167836968.0, + "step": 139520 + }, + { + "entropy": 1.8679583430290223, + "epoch": 0.43253079181840093, + "grad_norm": 6.562689304351807, + "learning_rate": 3.84667699930204e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8574771344661712, + "num_tokens": 167849043.0, + "step": 139530 + }, + { + "entropy": 1.8196332067251206, + "epoch": 0.4325617909434506, + "grad_norm": 8.335590362548828, + "learning_rate": 3.846539161639474e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.873447559773922, + "num_tokens": 167861647.0, + "step": 139540 + }, + { + "entropy": 1.8070330306887628, + "epoch": 0.4325927900685003, + "grad_norm": 8.824051856994629, + "learning_rate": 3.846401338793222e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8618940994143486, + "num_tokens": 167874455.0, + "step": 139550 + }, + { + "entropy": 1.9073014467954637, + "epoch": 0.43262378919355, + "grad_norm": 8.493125915527344, + "learning_rate": 3.846263530760633e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8542511522769928, + "num_tokens": 167885644.0, + "step": 139560 + }, + { + "entropy": 1.8156562566757202, + "epoch": 0.4326547883185997, + "grad_norm": 9.763708114624023, + "learning_rate": 3.84612573753905e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8623699083924293, + "num_tokens": 167898195.0, + "step": 139570 + }, + { + "entropy": 1.9067368656396866, + "epoch": 0.4326857874436494, + "grad_norm": 8.543749809265137, + "learning_rate": 3.845987959125823e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.855238850414753, + "num_tokens": 167908792.0, + "step": 139580 + }, + { + "entropy": 1.8284807071089744, + "epoch": 0.4327167865686991, + "grad_norm": 7.208462238311768, + "learning_rate": 3.8458501955182975e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8515859737992286, + "num_tokens": 167921383.0, + "step": 139590 + }, + { + "entropy": 1.872288428246975, + "epoch": 0.4327477856937488, + "grad_norm": 7.469978332519531, + "learning_rate": 3.845712446713823e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8519970834255218, + "num_tokens": 167932974.0, + "step": 139600 + }, + { + "entropy": 1.918510441482067, + "epoch": 0.4327787848187985, + "grad_norm": 10.325370788574219, + "learning_rate": 3.845574712709749e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8455549374222755, + "num_tokens": 167945139.0, + "step": 139610 + }, + { + "entropy": 1.7871095269918442, + "epoch": 0.43280978394384817, + "grad_norm": 8.072989463806152, + "learning_rate": 3.845436993503426e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8632336914539337, + "num_tokens": 167959774.0, + "step": 139620 + }, + { + "entropy": 1.6974161878228187, + "epoch": 0.4328407830688979, + "grad_norm": 1.9402376413345337, + "learning_rate": 3.845299289092203e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8687549382448196, + "num_tokens": 167974761.0, + "step": 139630 + }, + { + "entropy": 1.8783841490745545, + "epoch": 0.43287178219394756, + "grad_norm": 8.304696083068848, + "learning_rate": 3.845161599473431e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8487045183777809, + "num_tokens": 167987541.0, + "step": 139640 + }, + { + "entropy": 1.8873986780643464, + "epoch": 0.4329027813189973, + "grad_norm": 7.96927547454834, + "learning_rate": 3.845023924644462e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8622677832841873, + "num_tokens": 167999815.0, + "step": 139650 + }, + { + "entropy": 1.8961790382862092, + "epoch": 0.43293378044404696, + "grad_norm": 9.11035442352295, + "learning_rate": 3.84488626460265e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8553287327289582, + "num_tokens": 168011278.0, + "step": 139660 + }, + { + "entropy": 1.8724280893802643, + "epoch": 0.4329647795690967, + "grad_norm": 7.960391521453857, + "learning_rate": 3.8447486193453464e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8539118707180023, + "num_tokens": 168023250.0, + "step": 139670 + }, + { + "entropy": 1.9397452518343925, + "epoch": 0.43299577869414635, + "grad_norm": 6.704464912414551, + "learning_rate": 3.8446109888699054e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.844795499742031, + "num_tokens": 168034708.0, + "step": 139680 + }, + { + "entropy": 1.9342885583639144, + "epoch": 0.4330267778191961, + "grad_norm": 8.719551086425781, + "learning_rate": 3.8444733731736825e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8504753604531288, + "num_tokens": 168046323.0, + "step": 139690 + }, + { + "entropy": 1.9050405994057655, + "epoch": 0.43305777694424574, + "grad_norm": 7.164178371429443, + "learning_rate": 3.8443357722540315e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.87413921803236, + "num_tokens": 168058199.0, + "step": 139700 + }, + { + "entropy": 1.9514371365308762, + "epoch": 0.43308877606929547, + "grad_norm": 8.19334602355957, + "learning_rate": 3.844198186108308e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8353783071041108, + "num_tokens": 168069154.0, + "step": 139710 + }, + { + "entropy": 1.8015297777950763, + "epoch": 0.43311977519434514, + "grad_norm": 8.755090713500977, + "learning_rate": 3.84406061473387e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8574639052152634, + "num_tokens": 168082496.0, + "step": 139720 + }, + { + "entropy": 1.8433510437607765, + "epoch": 0.43315077431939486, + "grad_norm": 7.901381492614746, + "learning_rate": 3.843923058128073e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8601280748844147, + "num_tokens": 168094337.0, + "step": 139730 + }, + { + "entropy": 1.909453672170639, + "epoch": 0.43318177344444453, + "grad_norm": 4.069192409515381, + "learning_rate": 3.8437855162882766e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8465544447302819, + "num_tokens": 168106420.0, + "step": 139740 + }, + { + "entropy": 1.9447955161333084, + "epoch": 0.43321277256949425, + "grad_norm": 9.183345794677734, + "learning_rate": 3.843647989211837e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.8410720884799957, + "num_tokens": 168117779.0, + "step": 139750 + }, + { + "entropy": 1.9353636160492897, + "epoch": 0.4332437716945439, + "grad_norm": 6.792473793029785, + "learning_rate": 3.843510476896116e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8577348798513412, + "num_tokens": 168129340.0, + "step": 139760 + }, + { + "entropy": 1.8856726735830307, + "epoch": 0.4332747708195936, + "grad_norm": 8.366604804992676, + "learning_rate": 3.843372979338471e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8540061011910438, + "num_tokens": 168141683.0, + "step": 139770 + }, + { + "entropy": 1.96417216360569, + "epoch": 0.4333057699446433, + "grad_norm": 8.365377426147461, + "learning_rate": 3.843235496536262e-06, + "loss": 0.513, + "mean_token_accuracy": 0.8412682712078094, + "num_tokens": 168152620.0, + "step": 139780 + }, + { + "entropy": 1.9343325823545456, + "epoch": 0.433336769069693, + "grad_norm": 6.398437023162842, + "learning_rate": 3.843098028486852e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8587636038661003, + "num_tokens": 168164184.0, + "step": 139790 + }, + { + "entropy": 1.8950095444917678, + "epoch": 0.4333677681947427, + "grad_norm": 8.576680183410645, + "learning_rate": 3.842960575187603e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8526839464902878, + "num_tokens": 168175843.0, + "step": 139800 + }, + { + "entropy": 1.8716197207570076, + "epoch": 0.4333987673197924, + "grad_norm": 7.882635593414307, + "learning_rate": 3.842823136635875e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8643146887421608, + "num_tokens": 168187498.0, + "step": 139810 + }, + { + "entropy": 1.8388502165675162, + "epoch": 0.4334297664448421, + "grad_norm": 7.46567964553833, + "learning_rate": 3.842685712829033e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8461194902658462, + "num_tokens": 168200282.0, + "step": 139820 + }, + { + "entropy": 1.8559609532356263, + "epoch": 0.43346076556989177, + "grad_norm": 5.5117387771606445, + "learning_rate": 3.84254830376444e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8383535325527192, + "num_tokens": 168213312.0, + "step": 139830 + }, + { + "entropy": 1.8750181749463082, + "epoch": 0.4334917646949415, + "grad_norm": 2.167787790298462, + "learning_rate": 3.8424109094394605e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8538003221154213, + "num_tokens": 168226085.0, + "step": 139840 + }, + { + "entropy": 1.887896014750004, + "epoch": 0.43352276381999116, + "grad_norm": 3.5215682983398438, + "learning_rate": 3.842273529851461e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8547932624816894, + "num_tokens": 168237911.0, + "step": 139850 + }, + { + "entropy": 1.847142294049263, + "epoch": 0.4335537629450409, + "grad_norm": 7.590489864349365, + "learning_rate": 3.842136164997804e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8609548598527909, + "num_tokens": 168250763.0, + "step": 139860 + }, + { + "entropy": 1.7902900233864785, + "epoch": 0.43358476207009056, + "grad_norm": 3.1853187084198, + "learning_rate": 3.841998814875858e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8613974347710609, + "num_tokens": 168263670.0, + "step": 139870 + }, + { + "entropy": 1.8125322625041007, + "epoch": 0.4336157611951403, + "grad_norm": 8.757499694824219, + "learning_rate": 3.84186147948299e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8673945814371109, + "num_tokens": 168277183.0, + "step": 139880 + }, + { + "entropy": 1.9186308607459068, + "epoch": 0.43364676032018995, + "grad_norm": 8.361207962036133, + "learning_rate": 3.8417241588165675e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8502631932497025, + "num_tokens": 168288847.0, + "step": 139890 + }, + { + "entropy": 1.7792581617832184, + "epoch": 0.4336777594452397, + "grad_norm": 3.6714017391204834, + "learning_rate": 3.841586852873958e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8592787533998489, + "num_tokens": 168301911.0, + "step": 139900 + }, + { + "entropy": 1.9583450973033905, + "epoch": 0.43370875857028934, + "grad_norm": 7.83007287979126, + "learning_rate": 3.841449561652531e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8499109253287316, + "num_tokens": 168312703.0, + "step": 139910 + }, + { + "entropy": 1.83574049025774, + "epoch": 0.43373975769533907, + "grad_norm": 8.328694343566895, + "learning_rate": 3.841312285149657e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8661501377820968, + "num_tokens": 168325343.0, + "step": 139920 + }, + { + "entropy": 1.9416332960128784, + "epoch": 0.43377075682038874, + "grad_norm": 7.570043563842773, + "learning_rate": 3.841175023362706e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8532981321215629, + "num_tokens": 168337277.0, + "step": 139930 + }, + { + "entropy": 1.9153632640838623, + "epoch": 0.43380175594543846, + "grad_norm": 6.462372303009033, + "learning_rate": 3.841037776289048e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8486651003360748, + "num_tokens": 168348776.0, + "step": 139940 + }, + { + "entropy": 1.928571656346321, + "epoch": 0.43383275507048813, + "grad_norm": 8.484127044677734, + "learning_rate": 3.840900543926055e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8461811229586601, + "num_tokens": 168360653.0, + "step": 139950 + }, + { + "entropy": 1.9500855028629303, + "epoch": 0.43386375419553785, + "grad_norm": 7.157684803009033, + "learning_rate": 3.8407633262710995e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8550040423870087, + "num_tokens": 168371201.0, + "step": 139960 + }, + { + "entropy": 1.9280799493193626, + "epoch": 0.4338947533205875, + "grad_norm": 6.877387046813965, + "learning_rate": 3.840626123321555e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8493270486593246, + "num_tokens": 168382043.0, + "step": 139970 + }, + { + "entropy": 1.8996494933962822, + "epoch": 0.43392575244563725, + "grad_norm": 7.173794746398926, + "learning_rate": 3.840488935074794e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8530132621526718, + "num_tokens": 168394684.0, + "step": 139980 + }, + { + "entropy": 2.0039018139243128, + "epoch": 0.4339567515706869, + "grad_norm": 7.884592056274414, + "learning_rate": 3.840351761528191e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.8355337530374527, + "num_tokens": 168406211.0, + "step": 139990 + }, + { + "entropy": 1.8956574127078056, + "epoch": 0.43398775069573664, + "grad_norm": 5.687414646148682, + "learning_rate": 3.840214602679122e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.858708880841732, + "num_tokens": 168418288.0, + "step": 140000 + }, + { + "entropy": 1.913138222694397, + "epoch": 0.4340187498207863, + "grad_norm": 3.5383858680725098, + "learning_rate": 3.840077458524961e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8377836272120476, + "num_tokens": 168430136.0, + "step": 140010 + }, + { + "entropy": 1.957027330994606, + "epoch": 0.434049748945836, + "grad_norm": 7.953402996063232, + "learning_rate": 3.839940329063085e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8612416058778762, + "num_tokens": 168441409.0, + "step": 140020 + }, + { + "entropy": 1.8934217691421509, + "epoch": 0.4340807480708857, + "grad_norm": 3.684905529022217, + "learning_rate": 3.839803214290871e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8587592497467995, + "num_tokens": 168453595.0, + "step": 140030 + }, + { + "entropy": 1.9658619672060014, + "epoch": 0.43411174719593537, + "grad_norm": 7.9534525871276855, + "learning_rate": 3.839666114205696e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8584305748343468, + "num_tokens": 168464254.0, + "step": 140040 + }, + { + "entropy": 1.8513989642262458, + "epoch": 0.4341427463209851, + "grad_norm": 7.037330150604248, + "learning_rate": 3.839529028804939e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8667973428964615, + "num_tokens": 168476419.0, + "step": 140050 + }, + { + "entropy": 1.921988594532013, + "epoch": 0.43417374544603476, + "grad_norm": 8.005762100219727, + "learning_rate": 3.839391958085978e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8441802576184273, + "num_tokens": 168488005.0, + "step": 140060 + }, + { + "entropy": 1.874787649512291, + "epoch": 0.4342047445710845, + "grad_norm": 9.830438613891602, + "learning_rate": 3.839254902046193e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8617915287613869, + "num_tokens": 168500064.0, + "step": 140070 + }, + { + "entropy": 1.9190911263227464, + "epoch": 0.43423574369613416, + "grad_norm": 9.500381469726562, + "learning_rate": 3.8391178606829646e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8493470534682274, + "num_tokens": 168511882.0, + "step": 140080 + }, + { + "entropy": 1.8970647498965263, + "epoch": 0.4342667428211839, + "grad_norm": 6.222733974456787, + "learning_rate": 3.838980833993672e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.845898899435997, + "num_tokens": 168525185.0, + "step": 140090 + }, + { + "entropy": 1.9249570727348329, + "epoch": 0.43429774194623355, + "grad_norm": 8.286949157714844, + "learning_rate": 3.838843821975697e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8432594612240791, + "num_tokens": 168536800.0, + "step": 140100 + }, + { + "entropy": 1.7830603495240211, + "epoch": 0.4343287410712833, + "grad_norm": 3.775357723236084, + "learning_rate": 3.8387068246264244e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8711403638124466, + "num_tokens": 168550628.0, + "step": 140110 + }, + { + "entropy": 1.913707821071148, + "epoch": 0.43435974019633294, + "grad_norm": 7.623897075653076, + "learning_rate": 3.838569841943234e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8579373374581337, + "num_tokens": 168562584.0, + "step": 140120 + }, + { + "entropy": 1.959540669620037, + "epoch": 0.43439073932138267, + "grad_norm": 8.677681922912598, + "learning_rate": 3.8384328739235095e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8414706364274025, + "num_tokens": 168574108.0, + "step": 140130 + }, + { + "entropy": 1.8708548903465272, + "epoch": 0.43442173844643234, + "grad_norm": 7.195833683013916, + "learning_rate": 3.838295920564638e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8590035811066628, + "num_tokens": 168586835.0, + "step": 140140 + }, + { + "entropy": 1.9178714960813523, + "epoch": 0.43445273757148206, + "grad_norm": 8.062506675720215, + "learning_rate": 3.838158981863999e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8575378447771073, + "num_tokens": 168599034.0, + "step": 140150 + }, + { + "entropy": 1.9800096511840821, + "epoch": 0.43448373669653173, + "grad_norm": 7.617713451385498, + "learning_rate": 3.838022057818983e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8560611560940743, + "num_tokens": 168610043.0, + "step": 140160 + }, + { + "entropy": 1.8751267299056054, + "epoch": 0.43451473582158145, + "grad_norm": 9.476043701171875, + "learning_rate": 3.837885148426972e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8532286942005157, + "num_tokens": 168622646.0, + "step": 140170 + }, + { + "entropy": 1.9622350841760636, + "epoch": 0.4345457349466311, + "grad_norm": 8.158764839172363, + "learning_rate": 3.837748253685356e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.852800378203392, + "num_tokens": 168633945.0, + "step": 140180 + }, + { + "entropy": 1.9728653207421303, + "epoch": 0.43457673407168085, + "grad_norm": 8.091950416564941, + "learning_rate": 3.8376113735915195e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8437511593103408, + "num_tokens": 168645091.0, + "step": 140190 + }, + { + "entropy": 1.8134228363633156, + "epoch": 0.4346077331967305, + "grad_norm": 3.810476541519165, + "learning_rate": 3.8374745081428525e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8733655110001564, + "num_tokens": 168658792.0, + "step": 140200 + }, + { + "entropy": 1.9151874303817749, + "epoch": 0.43463873232178024, + "grad_norm": 7.474352836608887, + "learning_rate": 3.837337657336742e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8463295727968216, + "num_tokens": 168670568.0, + "step": 140210 + }, + { + "entropy": 1.9047392055392265, + "epoch": 0.4346697314468299, + "grad_norm": 8.334789276123047, + "learning_rate": 3.8372008211705795e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8502994626760483, + "num_tokens": 168682642.0, + "step": 140220 + }, + { + "entropy": 1.9391117468476295, + "epoch": 0.43470073057187963, + "grad_norm": 9.910785675048828, + "learning_rate": 3.837063999641753e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8517929673194885, + "num_tokens": 168694378.0, + "step": 140230 + }, + { + "entropy": 1.9310035303235054, + "epoch": 0.4347317296969293, + "grad_norm": 8.19104290008545, + "learning_rate": 3.836927192747655e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8617075026035309, + "num_tokens": 168705541.0, + "step": 140240 + }, + { + "entropy": 1.8467669546604157, + "epoch": 0.43476272882197897, + "grad_norm": 9.081491470336914, + "learning_rate": 3.8367904004856745e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8642741695046425, + "num_tokens": 168718141.0, + "step": 140250 + }, + { + "entropy": 1.841063352674246, + "epoch": 0.4347937279470287, + "grad_norm": 7.375804424285889, + "learning_rate": 3.836653622853204e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8677456378936768, + "num_tokens": 168730815.0, + "step": 140260 + }, + { + "entropy": 1.8605894804000855, + "epoch": 0.43482472707207837, + "grad_norm": 6.377718448638916, + "learning_rate": 3.836516859847637e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.858807185292244, + "num_tokens": 168742824.0, + "step": 140270 + }, + { + "entropy": 1.824122267216444, + "epoch": 0.4348557261971281, + "grad_norm": 2.3955891132354736, + "learning_rate": 3.836380111466366e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8775916382670402, + "num_tokens": 168755819.0, + "step": 140280 + }, + { + "entropy": 1.9054122567176819, + "epoch": 0.43488672532217776, + "grad_norm": 10.010201454162598, + "learning_rate": 3.836243377706786e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8460003793239593, + "num_tokens": 168767525.0, + "step": 140290 + }, + { + "entropy": 1.9120841890573501, + "epoch": 0.4349177244472275, + "grad_norm": 8.370523452758789, + "learning_rate": 3.83610665856629e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8461458712816239, + "num_tokens": 168779091.0, + "step": 140300 + }, + { + "entropy": 1.9416150897741318, + "epoch": 0.43494872357227715, + "grad_norm": 8.070106506347656, + "learning_rate": 3.835969954042273e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8444668143987656, + "num_tokens": 168790525.0, + "step": 140310 + }, + { + "entropy": 1.873001678287983, + "epoch": 0.4349797226973269, + "grad_norm": 6.8141584396362305, + "learning_rate": 3.835833264132131e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8707495495676995, + "num_tokens": 168801983.0, + "step": 140320 + }, + { + "entropy": 1.9271577775478363, + "epoch": 0.43501072182237654, + "grad_norm": 6.567296504974365, + "learning_rate": 3.835696588833263e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8497977092862129, + "num_tokens": 168814329.0, + "step": 140330 + }, + { + "entropy": 1.913456965982914, + "epoch": 0.43504172094742627, + "grad_norm": 7.978017330169678, + "learning_rate": 3.8355599281430635e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8430664092302322, + "num_tokens": 168826198.0, + "step": 140340 + }, + { + "entropy": 1.882510894536972, + "epoch": 0.43507272007247594, + "grad_norm": 9.187688827514648, + "learning_rate": 3.83542328205893e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8393991827964783, + "num_tokens": 168838743.0, + "step": 140350 + }, + { + "entropy": 1.8549660280346871, + "epoch": 0.43510371919752566, + "grad_norm": 7.461533069610596, + "learning_rate": 3.835286650578262e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8586645647883415, + "num_tokens": 168851245.0, + "step": 140360 + }, + { + "entropy": 1.8594406992197037, + "epoch": 0.43513471832257533, + "grad_norm": 3.6771328449249268, + "learning_rate": 3.835150033698458e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8606583446264267, + "num_tokens": 168864222.0, + "step": 140370 + }, + { + "entropy": 1.9253443703055382, + "epoch": 0.43516571744762506, + "grad_norm": 7.7639241218566895, + "learning_rate": 3.835013431416919e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8500281646847725, + "num_tokens": 168876003.0, + "step": 140380 + }, + { + "entropy": 1.9680373221635818, + "epoch": 0.4351967165726747, + "grad_norm": 6.00819206237793, + "learning_rate": 3.834876843731043e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.848180590569973, + "num_tokens": 168886871.0, + "step": 140390 + }, + { + "entropy": 1.9534240260720253, + "epoch": 0.43522771569772445, + "grad_norm": 7.778793811798096, + "learning_rate": 3.834740270638232e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8536478534340859, + "num_tokens": 168898090.0, + "step": 140400 + }, + { + "entropy": 1.9209111735224724, + "epoch": 0.4352587148227741, + "grad_norm": 6.560374736785889, + "learning_rate": 3.834603712135889e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8524667829275131, + "num_tokens": 168909996.0, + "step": 140410 + }, + { + "entropy": 1.915994156897068, + "epoch": 0.43528971394782384, + "grad_norm": 4.213437080383301, + "learning_rate": 3.834467168221415e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8442320972681046, + "num_tokens": 168921960.0, + "step": 140420 + }, + { + "entropy": 1.9615544810891152, + "epoch": 0.4353207130728735, + "grad_norm": 8.39879035949707, + "learning_rate": 3.834330638892213e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8525176584720612, + "num_tokens": 168933811.0, + "step": 140430 + }, + { + "entropy": 1.8864429205656053, + "epoch": 0.43535171219792324, + "grad_norm": 7.037664413452148, + "learning_rate": 3.834194124145686e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8612302899360657, + "num_tokens": 168946667.0, + "step": 140440 + }, + { + "entropy": 2.0172328650951385, + "epoch": 0.4353827113229729, + "grad_norm": 4.6347975730896, + "learning_rate": 3.834057623979241e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.842670176923275, + "num_tokens": 168958082.0, + "step": 140450 + }, + { + "entropy": 2.007206231355667, + "epoch": 0.43541371044802263, + "grad_norm": 8.676667213439941, + "learning_rate": 3.833921138390279e-06, + "loss": 0.506, + "mean_token_accuracy": 0.8318960726261139, + "num_tokens": 168969385.0, + "step": 140460 + }, + { + "entropy": 1.9538621738553048, + "epoch": 0.4354447095730723, + "grad_norm": 7.149422645568848, + "learning_rate": 3.833784667376208e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8514281839132309, + "num_tokens": 168981528.0, + "step": 140470 + }, + { + "entropy": 1.9269013822078704, + "epoch": 0.435475708698122, + "grad_norm": 7.5539350509643555, + "learning_rate": 3.833648210934433e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8525858089327812, + "num_tokens": 168994111.0, + "step": 140480 + }, + { + "entropy": 2.015850293636322, + "epoch": 0.4355067078231717, + "grad_norm": 7.674806118011475, + "learning_rate": 3.8335117690623616e-06, + "loss": 0.5322, + "mean_token_accuracy": 0.8376188382506371, + "num_tokens": 169004802.0, + "step": 140490 + }, + { + "entropy": 1.868292573094368, + "epoch": 0.43553770694822136, + "grad_norm": 3.716049909591675, + "learning_rate": 3.8333753417574014e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8568149447441101, + "num_tokens": 169017161.0, + "step": 140500 + }, + { + "entropy": 1.999607202410698, + "epoch": 0.4355687060732711, + "grad_norm": 7.225944519042969, + "learning_rate": 3.833238929016959e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8523830324411392, + "num_tokens": 169027960.0, + "step": 140510 + }, + { + "entropy": 1.9995219513773919, + "epoch": 0.43559970519832075, + "grad_norm": 10.309820175170898, + "learning_rate": 3.833102530838446e-06, + "loss": 0.5616, + "mean_token_accuracy": 0.8337021216750145, + "num_tokens": 169039707.0, + "step": 140520 + }, + { + "entropy": 1.8859604820609093, + "epoch": 0.4356307043233705, + "grad_norm": 5.491618633270264, + "learning_rate": 3.8329661472192686e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8571705892682076, + "num_tokens": 169052242.0, + "step": 140530 + }, + { + "entropy": 1.8706535249948502, + "epoch": 0.43566170344842015, + "grad_norm": 4.212784767150879, + "learning_rate": 3.832829778156839e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8539310187101364, + "num_tokens": 169065057.0, + "step": 140540 + }, + { + "entropy": 1.9854949221014977, + "epoch": 0.43569270257346987, + "grad_norm": 7.0491461753845215, + "learning_rate": 3.8326934236485665e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.855704678595066, + "num_tokens": 169076702.0, + "step": 140550 + }, + { + "entropy": 1.8700939059257506, + "epoch": 0.43572370169851954, + "grad_norm": 7.940951824188232, + "learning_rate": 3.832557083691864e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8501318633556366, + "num_tokens": 169089351.0, + "step": 140560 + }, + { + "entropy": 1.975642454624176, + "epoch": 0.43575470082356926, + "grad_norm": 8.858991622924805, + "learning_rate": 3.832420758284142e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8454146534204483, + "num_tokens": 169100724.0, + "step": 140570 + }, + { + "entropy": 1.8851625487208366, + "epoch": 0.43578569994861893, + "grad_norm": 7.867511749267578, + "learning_rate": 3.832284447422814e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8477107077836991, + "num_tokens": 169112999.0, + "step": 140580 + }, + { + "entropy": 1.8516397640109061, + "epoch": 0.43581669907366866, + "grad_norm": 3.843761682510376, + "learning_rate": 3.832148151105293e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8512039586901665, + "num_tokens": 169126495.0, + "step": 140590 + }, + { + "entropy": 2.005415938794613, + "epoch": 0.4358476981987183, + "grad_norm": 7.267820835113525, + "learning_rate": 3.832011869328994e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8331288233399391, + "num_tokens": 169137715.0, + "step": 140600 + }, + { + "entropy": 1.9655996575951575, + "epoch": 0.43587869732376805, + "grad_norm": 7.318187236785889, + "learning_rate": 3.831875602091329e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8459896892309189, + "num_tokens": 169149332.0, + "step": 140610 + }, + { + "entropy": 1.9661901488900184, + "epoch": 0.4359096964488177, + "grad_norm": 7.033077716827393, + "learning_rate": 3.831739349389714e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8531352400779724, + "num_tokens": 169160520.0, + "step": 140620 + }, + { + "entropy": 1.859518001973629, + "epoch": 0.43594069557386744, + "grad_norm": 8.226398468017578, + "learning_rate": 3.8316031112215676e-06, + "loss": 0.371, + "mean_token_accuracy": 0.866145646572113, + "num_tokens": 169173970.0, + "step": 140630 + }, + { + "entropy": 1.967048665881157, + "epoch": 0.4359716946989171, + "grad_norm": 7.1692962646484375, + "learning_rate": 3.831466887584302e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8470461040735244, + "num_tokens": 169184890.0, + "step": 140640 + }, + { + "entropy": 1.901542803645134, + "epoch": 0.43600269382396684, + "grad_norm": 7.0217156410217285, + "learning_rate": 3.831330678475338e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8561606049537659, + "num_tokens": 169197071.0, + "step": 140650 + }, + { + "entropy": 1.7790475860238075, + "epoch": 0.4360336929490165, + "grad_norm": 10.4070405960083, + "learning_rate": 3.831194483892091e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8745766609907151, + "num_tokens": 169210648.0, + "step": 140660 + }, + { + "entropy": 1.8767062574625015, + "epoch": 0.43606469207406623, + "grad_norm": 6.854859352111816, + "learning_rate": 3.831058303831981e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8566436514258384, + "num_tokens": 169222860.0, + "step": 140670 + }, + { + "entropy": 1.956781129539013, + "epoch": 0.4360956911991159, + "grad_norm": 8.431173324584961, + "learning_rate": 3.830922138292426e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8472963184118271, + "num_tokens": 169234554.0, + "step": 140680 + }, + { + "entropy": 1.9454703062772751, + "epoch": 0.4361266903241656, + "grad_norm": 9.45147705078125, + "learning_rate": 3.8307859872708455e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.848207201063633, + "num_tokens": 169245918.0, + "step": 140690 + }, + { + "entropy": 1.9917195424437524, + "epoch": 0.4361576894492153, + "grad_norm": 8.706761360168457, + "learning_rate": 3.830649850764661e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8496665880084038, + "num_tokens": 169257011.0, + "step": 140700 + }, + { + "entropy": 1.9021431505680084, + "epoch": 0.436188688574265, + "grad_norm": 8.302827835083008, + "learning_rate": 3.830513728771293e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8685775399208069, + "num_tokens": 169268523.0, + "step": 140710 + }, + { + "entropy": 1.8747971653938293, + "epoch": 0.4362196876993147, + "grad_norm": 10.040637969970703, + "learning_rate": 3.830377621288163e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8577211305499077, + "num_tokens": 169281126.0, + "step": 140720 + }, + { + "entropy": 2.011331743001938, + "epoch": 0.4362506868243644, + "grad_norm": 9.337910652160645, + "learning_rate": 3.8302415283126924e-06, + "loss": 0.5219, + "mean_token_accuracy": 0.8415524423122406, + "num_tokens": 169291751.0, + "step": 140730 + }, + { + "entropy": 2.007808841764927, + "epoch": 0.4362816859494141, + "grad_norm": 7.370612144470215, + "learning_rate": 3.830105449842306e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.83756934851408, + "num_tokens": 169303260.0, + "step": 140740 + }, + { + "entropy": 1.9281100079417228, + "epoch": 0.43631268507446375, + "grad_norm": 7.993682861328125, + "learning_rate": 3.829969385874425e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8419538453221321, + "num_tokens": 169315175.0, + "step": 140750 + }, + { + "entropy": 1.8865912228822708, + "epoch": 0.43634368419951347, + "grad_norm": 5.286984443664551, + "learning_rate": 3.829833336406477e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8586981207132339, + "num_tokens": 169328237.0, + "step": 140760 + }, + { + "entropy": 1.9767191842198373, + "epoch": 0.43637468332456314, + "grad_norm": 7.798305034637451, + "learning_rate": 3.8296973014358825e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.8429146945476532, + "num_tokens": 169339917.0, + "step": 140770 + }, + { + "entropy": 1.928345987200737, + "epoch": 0.43640568244961286, + "grad_norm": 3.7796151638031006, + "learning_rate": 3.829561280960071e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8631057515740395, + "num_tokens": 169351691.0, + "step": 140780 + }, + { + "entropy": 1.9100732266902924, + "epoch": 0.43643668157466253, + "grad_norm": 8.390362739562988, + "learning_rate": 3.829425274976465e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8546838641166687, + "num_tokens": 169364207.0, + "step": 140790 + }, + { + "entropy": 1.8982257694005966, + "epoch": 0.43646768069971226, + "grad_norm": 7.438647747039795, + "learning_rate": 3.829289283482494e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8580241903662682, + "num_tokens": 169376638.0, + "step": 140800 + }, + { + "entropy": 1.951183719933033, + "epoch": 0.4364986798247619, + "grad_norm": 3.837440252304077, + "learning_rate": 3.829153306475584e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8538202360272408, + "num_tokens": 169388468.0, + "step": 140810 + }, + { + "entropy": 1.9201230362057686, + "epoch": 0.43652967894981165, + "grad_norm": 8.309687614440918, + "learning_rate": 3.829017343953164e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8367157101631164, + "num_tokens": 169401878.0, + "step": 140820 + }, + { + "entropy": 1.9850788608193397, + "epoch": 0.4365606780748613, + "grad_norm": 8.711220741271973, + "learning_rate": 3.828881395912661e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8478342741727829, + "num_tokens": 169413486.0, + "step": 140830 + }, + { + "entropy": 1.9063683211803437, + "epoch": 0.43659167719991104, + "grad_norm": 10.316767692565918, + "learning_rate": 3.828745462351506e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8501601129770279, + "num_tokens": 169426451.0, + "step": 140840 + }, + { + "entropy": 2.031948208808899, + "epoch": 0.4366226763249607, + "grad_norm": 8.402220726013184, + "learning_rate": 3.828609543267129e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.8418742671608925, + "num_tokens": 169437295.0, + "step": 140850 + }, + { + "entropy": 1.7779492631554603, + "epoch": 0.43665367545001044, + "grad_norm": 8.303640365600586, + "learning_rate": 3.828473638656959e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8546249598264695, + "num_tokens": 169451864.0, + "step": 140860 + }, + { + "entropy": 1.8658212095499038, + "epoch": 0.4366846745750601, + "grad_norm": 8.145844459533691, + "learning_rate": 3.828337748518429e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8555705919861794, + "num_tokens": 169465061.0, + "step": 140870 + }, + { + "entropy": 1.9951273322105407, + "epoch": 0.43671567370010983, + "grad_norm": 7.6853461265563965, + "learning_rate": 3.8282018728489685e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8458278581500054, + "num_tokens": 169476203.0, + "step": 140880 + }, + { + "entropy": 1.907281294465065, + "epoch": 0.4367466728251595, + "grad_norm": 4.954738616943359, + "learning_rate": 3.828066011646013e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.84654021859169, + "num_tokens": 169488896.0, + "step": 140890 + }, + { + "entropy": 2.035962425172329, + "epoch": 0.4367776719502092, + "grad_norm": 9.046393394470215, + "learning_rate": 3.8279301649069935e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8471018671989441, + "num_tokens": 169500116.0, + "step": 140900 + }, + { + "entropy": 1.9954482674598695, + "epoch": 0.4368086710752589, + "grad_norm": 8.976217269897461, + "learning_rate": 3.827794332629344e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8554398879408837, + "num_tokens": 169510982.0, + "step": 140910 + }, + { + "entropy": 2.049514207243919, + "epoch": 0.4368396702003086, + "grad_norm": 9.787312507629395, + "learning_rate": 3.8276585148105e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8407163426280022, + "num_tokens": 169521572.0, + "step": 140920 + }, + { + "entropy": 1.923102656006813, + "epoch": 0.4368706693253583, + "grad_norm": 8.028412818908691, + "learning_rate": 3.827522711447895e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8506998717784882, + "num_tokens": 169533699.0, + "step": 140930 + }, + { + "entropy": 2.0024877056479453, + "epoch": 0.436901668450408, + "grad_norm": 4.008644104003906, + "learning_rate": 3.827386922538967e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8583690106868744, + "num_tokens": 169544829.0, + "step": 140940 + }, + { + "entropy": 1.84952729716897, + "epoch": 0.4369326675754577, + "grad_norm": 9.895330429077148, + "learning_rate": 3.827251148081149e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8501643449068069, + "num_tokens": 169558584.0, + "step": 140950 + }, + { + "entropy": 1.9892782807350158, + "epoch": 0.4369636667005074, + "grad_norm": 7.615428447723389, + "learning_rate": 3.827115388071881e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.844859579205513, + "num_tokens": 169569716.0, + "step": 140960 + }, + { + "entropy": 2.0157076716423035, + "epoch": 0.43699466582555707, + "grad_norm": 8.59665298461914, + "learning_rate": 3.8269796425086e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.8417162656784057, + "num_tokens": 169580522.0, + "step": 140970 + }, + { + "entropy": 1.925472079217434, + "epoch": 0.4370256649506068, + "grad_norm": 3.7771337032318115, + "learning_rate": 3.826843911388742e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8646034061908722, + "num_tokens": 169592175.0, + "step": 140980 + }, + { + "entropy": 1.8540797725319862, + "epoch": 0.43705666407565646, + "grad_norm": 7.4315056800842285, + "learning_rate": 3.826708194709748e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8597232013940811, + "num_tokens": 169604782.0, + "step": 140990 + }, + { + "entropy": 1.9015352100133895, + "epoch": 0.43708766320070613, + "grad_norm": 4.213851451873779, + "learning_rate": 3.826572492469057e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8474423259496688, + "num_tokens": 169617522.0, + "step": 141000 + }, + { + "entropy": 2.016425573825836, + "epoch": 0.43711866232575586, + "grad_norm": 8.818294525146484, + "learning_rate": 3.826436804664109e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8433761283755302, + "num_tokens": 169628341.0, + "step": 141010 + }, + { + "entropy": 1.9073638312518597, + "epoch": 0.4371496614508055, + "grad_norm": 7.4958014488220215, + "learning_rate": 3.826301131292346e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8657842293381691, + "num_tokens": 169641206.0, + "step": 141020 + }, + { + "entropy": 1.8773157000541687, + "epoch": 0.43718066057585525, + "grad_norm": 9.210996627807617, + "learning_rate": 3.826165472351208e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8486377000808716, + "num_tokens": 169652953.0, + "step": 141030 + }, + { + "entropy": 1.8629372149705887, + "epoch": 0.4372116597009049, + "grad_norm": 8.852280616760254, + "learning_rate": 3.826029827838136e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8569909855723381, + "num_tokens": 169665732.0, + "step": 141040 + }, + { + "entropy": 1.8929132372140884, + "epoch": 0.43724265882595464, + "grad_norm": 7.813331127166748, + "learning_rate": 3.825894197750575e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8591722935438156, + "num_tokens": 169677613.0, + "step": 141050 + }, + { + "entropy": 1.9381250068545341, + "epoch": 0.4372736579510043, + "grad_norm": 8.72669792175293, + "learning_rate": 3.825758582085967e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8587992191314697, + "num_tokens": 169689244.0, + "step": 141060 + }, + { + "entropy": 1.7425117641687393, + "epoch": 0.43730465707605404, + "grad_norm": 8.140493392944336, + "learning_rate": 3.825622980841757e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8660785302519798, + "num_tokens": 169703991.0, + "step": 141070 + }, + { + "entropy": 1.8813914477825164, + "epoch": 0.4373356562011037, + "grad_norm": 9.09639835357666, + "learning_rate": 3.825487394015388e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8516086161136627, + "num_tokens": 169716782.0, + "step": 141080 + }, + { + "entropy": 1.8726382121443748, + "epoch": 0.43736665532615343, + "grad_norm": 4.414510250091553, + "learning_rate": 3.825351821604306e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8511142015457154, + "num_tokens": 169729151.0, + "step": 141090 + }, + { + "entropy": 1.905012820661068, + "epoch": 0.4373976544512031, + "grad_norm": 8.519266128540039, + "learning_rate": 3.825216263605957e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8418341457843781, + "num_tokens": 169741404.0, + "step": 141100 + }, + { + "entropy": 1.9148242220282554, + "epoch": 0.4374286535762528, + "grad_norm": 8.985989570617676, + "learning_rate": 3.825080720017787e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.842382799088955, + "num_tokens": 169753529.0, + "step": 141110 + }, + { + "entropy": 1.8772297248244285, + "epoch": 0.4374596527013025, + "grad_norm": 3.817615509033203, + "learning_rate": 3.824945190837244e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8559065282344818, + "num_tokens": 169765965.0, + "step": 141120 + }, + { + "entropy": 1.9383199408650398, + "epoch": 0.4374906518263522, + "grad_norm": 3.4553632736206055, + "learning_rate": 3.824809676061776e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8500086024403573, + "num_tokens": 169777683.0, + "step": 141130 + }, + { + "entropy": 1.9334457024931908, + "epoch": 0.4375216509514019, + "grad_norm": 4.0566182136535645, + "learning_rate": 3.82467417568883e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8465969949960709, + "num_tokens": 169789469.0, + "step": 141140 + }, + { + "entropy": 1.9533229991793633, + "epoch": 0.4375526500764516, + "grad_norm": 7.992478847503662, + "learning_rate": 3.824538689715855e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8505326002836228, + "num_tokens": 169800995.0, + "step": 141150 + }, + { + "entropy": 1.9198670163750648, + "epoch": 0.4375836492015013, + "grad_norm": 4.070978164672852, + "learning_rate": 3.8244032181403015e-06, + "loss": 0.5319, + "mean_token_accuracy": 0.8470968812704086, + "num_tokens": 169813330.0, + "step": 141160 + }, + { + "entropy": 1.8759153246879579, + "epoch": 0.437614648326551, + "grad_norm": 10.018072128295898, + "learning_rate": 3.8242677609596205e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8579384982585907, + "num_tokens": 169826267.0, + "step": 141170 + }, + { + "entropy": 1.839848317205906, + "epoch": 0.4376456474516007, + "grad_norm": 8.282307624816895, + "learning_rate": 3.824132318171262e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8613649949431419, + "num_tokens": 169840238.0, + "step": 141180 + }, + { + "entropy": 1.9160371258854867, + "epoch": 0.4376766465766504, + "grad_norm": 8.569047927856445, + "learning_rate": 3.8239968897726755e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8511138498783112, + "num_tokens": 169852089.0, + "step": 141190 + }, + { + "entropy": 1.9733612596988679, + "epoch": 0.43770764570170007, + "grad_norm": 7.161515712738037, + "learning_rate": 3.823861475761317e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8522224485874176, + "num_tokens": 169863089.0, + "step": 141200 + }, + { + "entropy": 1.9556746795773505, + "epoch": 0.4377386448267498, + "grad_norm": 8.775317192077637, + "learning_rate": 3.823726076134636e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8549538642168045, + "num_tokens": 169875115.0, + "step": 141210 + }, + { + "entropy": 1.9240303069353104, + "epoch": 0.43776964395179946, + "grad_norm": 8.23504638671875, + "learning_rate": 3.823590690890089e-06, + "loss": 0.469, + "mean_token_accuracy": 0.853808656334877, + "num_tokens": 169887042.0, + "step": 141220 + }, + { + "entropy": 1.8046733349561692, + "epoch": 0.4378006430768492, + "grad_norm": 3.5888330936431885, + "learning_rate": 3.823455320025128e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8697149589657783, + "num_tokens": 169900449.0, + "step": 141230 + }, + { + "entropy": 1.8609862461686135, + "epoch": 0.43783164220189885, + "grad_norm": 3.756772041320801, + "learning_rate": 3.823319963537208e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8570352002978325, + "num_tokens": 169912988.0, + "step": 141240 + }, + { + "entropy": 1.8794613614678384, + "epoch": 0.4378626413269485, + "grad_norm": 3.9449973106384277, + "learning_rate": 3.823184621423784e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8526434049010276, + "num_tokens": 169926018.0, + "step": 141250 + }, + { + "entropy": 1.7661522284150124, + "epoch": 0.43789364045199825, + "grad_norm": 9.034821510314941, + "learning_rate": 3.8230492936823135e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8720221489667892, + "num_tokens": 169939851.0, + "step": 141260 + }, + { + "entropy": 1.978477604687214, + "epoch": 0.4379246395770479, + "grad_norm": 7.994080543518066, + "learning_rate": 3.822913980310252e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8487569943070412, + "num_tokens": 169951672.0, + "step": 141270 + }, + { + "entropy": 1.9159216687083245, + "epoch": 0.43795563870209764, + "grad_norm": 7.902944564819336, + "learning_rate": 3.822778681305056e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8606354176998139, + "num_tokens": 169963211.0, + "step": 141280 + }, + { + "entropy": 1.8818385779857636, + "epoch": 0.4379866378271473, + "grad_norm": 4.299989700317383, + "learning_rate": 3.822643396664184e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.8490740895271301, + "num_tokens": 169975835.0, + "step": 141290 + }, + { + "entropy": 1.9505405515432357, + "epoch": 0.43801763695219703, + "grad_norm": 7.912982940673828, + "learning_rate": 3.822508126385095e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8519005611538887, + "num_tokens": 169987046.0, + "step": 141300 + }, + { + "entropy": 1.8943722754716874, + "epoch": 0.4380486360772467, + "grad_norm": 9.575815200805664, + "learning_rate": 3.822372870465247e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8552343189716339, + "num_tokens": 169999313.0, + "step": 141310 + }, + { + "entropy": 2.0350208193063737, + "epoch": 0.4380796352022964, + "grad_norm": 9.12436580657959, + "learning_rate": 3.822237628902101e-06, + "loss": 0.5847, + "mean_token_accuracy": 0.8212827384471894, + "num_tokens": 170010308.0, + "step": 141320 + }, + { + "entropy": 1.9424827635288238, + "epoch": 0.4381106343273461, + "grad_norm": 7.913768291473389, + "learning_rate": 3.822102401693118e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8558102324604988, + "num_tokens": 170021453.0, + "step": 141330 + }, + { + "entropy": 1.9400149762630463, + "epoch": 0.4381416334523958, + "grad_norm": 7.435611724853516, + "learning_rate": 3.821967188835756e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8472892969846726, + "num_tokens": 170033154.0, + "step": 141340 + }, + { + "entropy": 1.9589600950479507, + "epoch": 0.4381726325774455, + "grad_norm": 4.710630416870117, + "learning_rate": 3.82183199032748e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8444600135087967, + "num_tokens": 170044448.0, + "step": 141350 + }, + { + "entropy": 2.002592481672764, + "epoch": 0.4382036317024952, + "grad_norm": 3.6701736450195312, + "learning_rate": 3.821696806165749e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8414113700389863, + "num_tokens": 170055393.0, + "step": 141360 + }, + { + "entropy": 1.9110810875892639, + "epoch": 0.4382346308275449, + "grad_norm": 8.964020729064941, + "learning_rate": 3.821561636348028e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8573357686400414, + "num_tokens": 170067640.0, + "step": 141370 + }, + { + "entropy": 1.9764687582850455, + "epoch": 0.4382656299525946, + "grad_norm": 8.130205154418945, + "learning_rate": 3.8214264808717814e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.8418732464313508, + "num_tokens": 170079057.0, + "step": 141380 + }, + { + "entropy": 1.8818054497241974, + "epoch": 0.4382966290776443, + "grad_norm": 4.381757736206055, + "learning_rate": 3.82129133973447e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8557385444641114, + "num_tokens": 170091217.0, + "step": 141390 + }, + { + "entropy": 1.999481311440468, + "epoch": 0.438327628202694, + "grad_norm": 8.559086799621582, + "learning_rate": 3.821156212933562e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.8520580142736435, + "num_tokens": 170102107.0, + "step": 141400 + }, + { + "entropy": 1.877190275490284, + "epoch": 0.43835862732774367, + "grad_norm": 7.804668426513672, + "learning_rate": 3.8210211004665206e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8549025520682335, + "num_tokens": 170114007.0, + "step": 141410 + }, + { + "entropy": 1.7784633502364158, + "epoch": 0.4383896264527934, + "grad_norm": 7.331371784210205, + "learning_rate": 3.820886002330814e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8712356150150299, + "num_tokens": 170127600.0, + "step": 141420 + }, + { + "entropy": 1.948923571407795, + "epoch": 0.43842062557784306, + "grad_norm": 9.89287281036377, + "learning_rate": 3.820750918523906e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8478708654642105, + "num_tokens": 170138776.0, + "step": 141430 + }, + { + "entropy": 1.9679486066102982, + "epoch": 0.4384516247028928, + "grad_norm": 9.170439720153809, + "learning_rate": 3.820615849043266e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8551394686102867, + "num_tokens": 170150407.0, + "step": 141440 + }, + { + "entropy": 1.9057482168078423, + "epoch": 0.43848262382794245, + "grad_norm": 9.319578170776367, + "learning_rate": 3.820480793886361e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8556445702910424, + "num_tokens": 170162841.0, + "step": 141450 + }, + { + "entropy": 1.872725434601307, + "epoch": 0.4385136229529922, + "grad_norm": 8.340863227844238, + "learning_rate": 3.820345753050659e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8682135134935379, + "num_tokens": 170175371.0, + "step": 141460 + }, + { + "entropy": 1.8802336245775222, + "epoch": 0.43854462207804185, + "grad_norm": 8.725083351135254, + "learning_rate": 3.82021072653363e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8557636126875877, + "num_tokens": 170187618.0, + "step": 141470 + }, + { + "entropy": 1.8927005499601364, + "epoch": 0.43857562120309157, + "grad_norm": 3.3288369178771973, + "learning_rate": 3.820075714332744e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8532668456435204, + "num_tokens": 170199475.0, + "step": 141480 + }, + { + "entropy": 1.996702989935875, + "epoch": 0.43860662032814124, + "grad_norm": 7.832830429077148, + "learning_rate": 3.819940716445472e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.8426687330007553, + "num_tokens": 170210531.0, + "step": 141490 + }, + { + "entropy": 1.9969799607992171, + "epoch": 0.4386376194531909, + "grad_norm": 8.912454605102539, + "learning_rate": 3.819805732869283e-06, + "loss": 0.5241, + "mean_token_accuracy": 0.8406167760491371, + "num_tokens": 170221403.0, + "step": 141500 + }, + { + "entropy": 1.9903255164623261, + "epoch": 0.43866861857824063, + "grad_norm": 8.586922645568848, + "learning_rate": 3.81967076360165e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8494381055235862, + "num_tokens": 170232697.0, + "step": 141510 + }, + { + "entropy": 1.893722152709961, + "epoch": 0.4386996177032903, + "grad_norm": 7.579135894775391, + "learning_rate": 3.819535808640045e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8595759615302085, + "num_tokens": 170244794.0, + "step": 141520 + }, + { + "entropy": 1.9859010726213455, + "epoch": 0.43873061682834, + "grad_norm": 8.621550559997559, + "learning_rate": 3.819400867981941e-06, + "loss": 0.5332, + "mean_token_accuracy": 0.8447252556681633, + "num_tokens": 170255961.0, + "step": 141530 + }, + { + "entropy": 1.945717729628086, + "epoch": 0.4387616159533897, + "grad_norm": 8.728808403015137, + "learning_rate": 3.819265941624811e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8427330657839776, + "num_tokens": 170268226.0, + "step": 141540 + }, + { + "entropy": 2.0041126042604445, + "epoch": 0.4387926150784394, + "grad_norm": 10.106131553649902, + "learning_rate": 3.819131029566131e-06, + "loss": 0.5313, + "mean_token_accuracy": 0.8371750578284264, + "num_tokens": 170278720.0, + "step": 141550 + }, + { + "entropy": 1.9148709684610368, + "epoch": 0.4388236142034891, + "grad_norm": 9.913314819335938, + "learning_rate": 3.818996131803373e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.840086530148983, + "num_tokens": 170291028.0, + "step": 141560 + }, + { + "entropy": 1.9021180346608162, + "epoch": 0.4388546133285388, + "grad_norm": 7.955324649810791, + "learning_rate": 3.818861248334014e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8594782516360283, + "num_tokens": 170303129.0, + "step": 141570 + }, + { + "entropy": 1.8553086191415786, + "epoch": 0.4388856124535885, + "grad_norm": 4.323651313781738, + "learning_rate": 3.81872637915553e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.854140630364418, + "num_tokens": 170316049.0, + "step": 141580 + }, + { + "entropy": 1.9587162017822266, + "epoch": 0.4389166115786382, + "grad_norm": 8.201522827148438, + "learning_rate": 3.818591524265398e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8508410602807999, + "num_tokens": 170327674.0, + "step": 141590 + }, + { + "entropy": 1.986707004904747, + "epoch": 0.4389476107036879, + "grad_norm": 7.647431373596191, + "learning_rate": 3.8184566836610944e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8480009466409684, + "num_tokens": 170339108.0, + "step": 141600 + }, + { + "entropy": 1.948198501765728, + "epoch": 0.4389786098287376, + "grad_norm": 7.071403503417969, + "learning_rate": 3.818321857340097e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8380307227373123, + "num_tokens": 170350766.0, + "step": 141610 + }, + { + "entropy": 1.9600090399384498, + "epoch": 0.43900960895378727, + "grad_norm": 9.295860290527344, + "learning_rate": 3.818187045299886e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8459015518426896, + "num_tokens": 170362080.0, + "step": 141620 + }, + { + "entropy": 1.9097954377532005, + "epoch": 0.439040608078837, + "grad_norm": 4.016225337982178, + "learning_rate": 3.818052247537938e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8569407507777214, + "num_tokens": 170374022.0, + "step": 141630 + }, + { + "entropy": 1.8951156988739968, + "epoch": 0.43907160720388666, + "grad_norm": 7.259555339813232, + "learning_rate": 3.817917464051734e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8556249186396598, + "num_tokens": 170386951.0, + "step": 141640 + }, + { + "entropy": 1.9594153746962548, + "epoch": 0.4391026063289364, + "grad_norm": 7.583054065704346, + "learning_rate": 3.817782694838756e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.856129738688469, + "num_tokens": 170398778.0, + "step": 141650 + }, + { + "entropy": 1.8969098508358002, + "epoch": 0.43913360545398605, + "grad_norm": 9.536150932312012, + "learning_rate": 3.817647939896483e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8510769203305244, + "num_tokens": 170411665.0, + "step": 141660 + }, + { + "entropy": 1.9632080033421517, + "epoch": 0.4391646045790358, + "grad_norm": 7.881152153015137, + "learning_rate": 3.8175131992223965e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8432640329003334, + "num_tokens": 170423508.0, + "step": 141670 + }, + { + "entropy": 1.8790015771985054, + "epoch": 0.43919560370408545, + "grad_norm": 9.358724594116211, + "learning_rate": 3.81737847281398e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8637870714068413, + "num_tokens": 170435638.0, + "step": 141680 + }, + { + "entropy": 1.8561220914125443, + "epoch": 0.43922660282913517, + "grad_norm": 8.335770606994629, + "learning_rate": 3.8172437606687156e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8470362722873688, + "num_tokens": 170449929.0, + "step": 141690 + }, + { + "entropy": 1.9746687322854997, + "epoch": 0.43925760195418484, + "grad_norm": 7.2637248039245605, + "learning_rate": 3.817109062784087e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.842483501136303, + "num_tokens": 170461051.0, + "step": 141700 + }, + { + "entropy": 1.9206561237573623, + "epoch": 0.43928860107923456, + "grad_norm": 9.621170997619629, + "learning_rate": 3.816974379157578e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8608662933111191, + "num_tokens": 170472540.0, + "step": 141710 + }, + { + "entropy": 1.9312569439411162, + "epoch": 0.43931960020428423, + "grad_norm": 9.245631217956543, + "learning_rate": 3.816839709786675e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8509629473090172, + "num_tokens": 170484584.0, + "step": 141720 + }, + { + "entropy": 1.9452128455042839, + "epoch": 0.4393505993293339, + "grad_norm": 8.906986236572266, + "learning_rate": 3.81670505466886e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8519314289093017, + "num_tokens": 170496707.0, + "step": 141730 + }, + { + "entropy": 1.957577820122242, + "epoch": 0.4393815984543836, + "grad_norm": 7.901788234710693, + "learning_rate": 3.816570413801622e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8558757767081261, + "num_tokens": 170507715.0, + "step": 141740 + }, + { + "entropy": 1.9821037575602531, + "epoch": 0.4394125975794333, + "grad_norm": 7.910948276519775, + "learning_rate": 3.8164357871824466e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8592849254608155, + "num_tokens": 170518782.0, + "step": 141750 + }, + { + "entropy": 1.9237025648355484, + "epoch": 0.439443596704483, + "grad_norm": 7.395272731781006, + "learning_rate": 3.816301174808821e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8577270656824112, + "num_tokens": 170531656.0, + "step": 141760 + }, + { + "entropy": 1.9360353201627731, + "epoch": 0.4394745958295327, + "grad_norm": 4.2066426277160645, + "learning_rate": 3.8161665766782335e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8494734823703766, + "num_tokens": 170543484.0, + "step": 141770 + }, + { + "entropy": 1.969169418513775, + "epoch": 0.4395055949545824, + "grad_norm": 4.006796360015869, + "learning_rate": 3.816031992788171e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8522426009178161, + "num_tokens": 170555226.0, + "step": 141780 + }, + { + "entropy": 1.7786556363105774, + "epoch": 0.4395365940796321, + "grad_norm": 8.278726577758789, + "learning_rate": 3.815897423136125e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8771900832653046, + "num_tokens": 170568645.0, + "step": 141790 + }, + { + "entropy": 1.9889621213078499, + "epoch": 0.4395675932046818, + "grad_norm": 8.236480712890625, + "learning_rate": 3.815762867719584e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8558431312441825, + "num_tokens": 170580292.0, + "step": 141800 + }, + { + "entropy": 1.8943669281899929, + "epoch": 0.4395985923297315, + "grad_norm": 8.767983436584473, + "learning_rate": 3.815628326536037e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8443626776337624, + "num_tokens": 170593285.0, + "step": 141810 + }, + { + "entropy": 1.9128671914339066, + "epoch": 0.4396295914547812, + "grad_norm": 3.5375518798828125, + "learning_rate": 3.815493799582977e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8669633626937866, + "num_tokens": 170605706.0, + "step": 141820 + }, + { + "entropy": 1.985709111392498, + "epoch": 0.43966059057983087, + "grad_norm": 7.396274566650391, + "learning_rate": 3.815359286857895e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8488274723291397, + "num_tokens": 170617185.0, + "step": 141830 + }, + { + "entropy": 1.9252109676599503, + "epoch": 0.4396915897048806, + "grad_norm": 5.221693515777588, + "learning_rate": 3.815224788358284e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8559987634420395, + "num_tokens": 170629091.0, + "step": 141840 + }, + { + "entropy": 1.8718436300754546, + "epoch": 0.43972258882993026, + "grad_norm": 4.875570297241211, + "learning_rate": 3.815090304081635e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8549114823341369, + "num_tokens": 170642049.0, + "step": 141850 + }, + { + "entropy": 1.8797766268253326, + "epoch": 0.43975358795498, + "grad_norm": 7.365478515625, + "learning_rate": 3.814955834025442e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8585930600762367, + "num_tokens": 170654668.0, + "step": 141860 + }, + { + "entropy": 1.9316943183541297, + "epoch": 0.43978458708002965, + "grad_norm": 4.897908687591553, + "learning_rate": 3.814821378187199e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8544142752885818, + "num_tokens": 170666437.0, + "step": 141870 + }, + { + "entropy": 1.799608789384365, + "epoch": 0.4398155862050794, + "grad_norm": 4.87161922454834, + "learning_rate": 3.814686936564401e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8723533883690834, + "num_tokens": 170680355.0, + "step": 141880 + }, + { + "entropy": 1.9116061985492707, + "epoch": 0.43984658533012905, + "grad_norm": 7.900229454040527, + "learning_rate": 3.814552509154544e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8542070344090462, + "num_tokens": 170691960.0, + "step": 141890 + }, + { + "entropy": 1.9224073961377144, + "epoch": 0.43987758445517877, + "grad_norm": 7.187701225280762, + "learning_rate": 3.8144180959551223e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8590036764740944, + "num_tokens": 170703505.0, + "step": 141900 + }, + { + "entropy": 1.8964632406830788, + "epoch": 0.43990858358022844, + "grad_norm": 7.443815231323242, + "learning_rate": 3.8142836969636336e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8544583544135094, + "num_tokens": 170715715.0, + "step": 141910 + }, + { + "entropy": 1.8948586672544478, + "epoch": 0.43993958270527816, + "grad_norm": 8.835453987121582, + "learning_rate": 3.8141493121775747e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8569506093859672, + "num_tokens": 170728395.0, + "step": 141920 + }, + { + "entropy": 1.925899577140808, + "epoch": 0.43997058183032783, + "grad_norm": 7.583258628845215, + "learning_rate": 3.814014941594443e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.850526462495327, + "num_tokens": 170740412.0, + "step": 141930 + }, + { + "entropy": 1.9305981874465943, + "epoch": 0.44000158095537756, + "grad_norm": 3.2110064029693604, + "learning_rate": 3.8138805852117377e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8527848124504089, + "num_tokens": 170752268.0, + "step": 141940 + }, + { + "entropy": 1.8623799979686737, + "epoch": 0.4400325800804272, + "grad_norm": 7.726592540740967, + "learning_rate": 3.8137462430269565e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8616868555545807, + "num_tokens": 170764922.0, + "step": 141950 + }, + { + "entropy": 1.9484892964363099, + "epoch": 0.44006357920547695, + "grad_norm": 8.359121322631836, + "learning_rate": 3.8136119150376006e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8625785291194916, + "num_tokens": 170776660.0, + "step": 141960 + }, + { + "entropy": 1.898753324151039, + "epoch": 0.4400945783305266, + "grad_norm": 3.5382606983184814, + "learning_rate": 3.8134776012411693e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8586586147546769, + "num_tokens": 170789396.0, + "step": 141970 + }, + { + "entropy": 1.8218749940395356, + "epoch": 0.4401255774555763, + "grad_norm": 7.84831428527832, + "learning_rate": 3.8133433016351636e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8637544587254524, + "num_tokens": 170802890.0, + "step": 141980 + }, + { + "entropy": 1.9783765748143196, + "epoch": 0.440156576580626, + "grad_norm": 7.985630512237549, + "learning_rate": 3.813209016217084e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8482358425855636, + "num_tokens": 170814592.0, + "step": 141990 + }, + { + "entropy": 1.8626375451683999, + "epoch": 0.4401875757056757, + "grad_norm": 4.535806179046631, + "learning_rate": 3.8130747449844342e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8576488435268402, + "num_tokens": 170828197.0, + "step": 142000 + }, + { + "entropy": 1.9288464725017547, + "epoch": 0.4402185748307254, + "grad_norm": 4.06549596786499, + "learning_rate": 3.812940487934716e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8626020058989525, + "num_tokens": 170840003.0, + "step": 142010 + }, + { + "entropy": 1.9120705425739288, + "epoch": 0.4402495739557751, + "grad_norm": 7.126809597015381, + "learning_rate": 3.8128062450654323e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8442542180418968, + "num_tokens": 170852921.0, + "step": 142020 + }, + { + "entropy": 1.9289035245776176, + "epoch": 0.4402805730808248, + "grad_norm": 7.8247246742248535, + "learning_rate": 3.8126720163740884e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.846360231935978, + "num_tokens": 170866003.0, + "step": 142030 + }, + { + "entropy": 1.9721984177827836, + "epoch": 0.44031157220587447, + "grad_norm": 8.725114822387695, + "learning_rate": 3.8125378018581864e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.85105609446764, + "num_tokens": 170877690.0, + "step": 142040 + }, + { + "entropy": 1.8866790056228637, + "epoch": 0.4403425713309242, + "grad_norm": 9.154146194458008, + "learning_rate": 3.8124036015152344e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8527222231030465, + "num_tokens": 170890766.0, + "step": 142050 + }, + { + "entropy": 1.9038648203015327, + "epoch": 0.44037357045597386, + "grad_norm": 9.711874008178711, + "learning_rate": 3.8122694153427362e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8540343955159188, + "num_tokens": 170902831.0, + "step": 142060 + }, + { + "entropy": 1.946737252175808, + "epoch": 0.4404045695810236, + "grad_norm": 6.8612380027771, + "learning_rate": 3.8121352433381986e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8524143502116204, + "num_tokens": 170914196.0, + "step": 142070 + }, + { + "entropy": 1.9659889578819274, + "epoch": 0.44043556870607325, + "grad_norm": 7.955692768096924, + "learning_rate": 3.812001085499129e-06, + "loss": 0.556, + "mean_token_accuracy": 0.8456687927246094, + "num_tokens": 170925736.0, + "step": 142080 + }, + { + "entropy": 1.918195514380932, + "epoch": 0.440466567831123, + "grad_norm": 4.864497184753418, + "learning_rate": 3.811866941823035e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8531820371747016, + "num_tokens": 170937320.0, + "step": 142090 + }, + { + "entropy": 1.924280734360218, + "epoch": 0.44049756695617265, + "grad_norm": 8.45577335357666, + "learning_rate": 3.8117328123074237e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8453326195478439, + "num_tokens": 170949120.0, + "step": 142100 + }, + { + "entropy": 1.8904883772134782, + "epoch": 0.4405285660812224, + "grad_norm": 3.6915700435638428, + "learning_rate": 3.8115986969498047e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8524854764342308, + "num_tokens": 170960950.0, + "step": 142110 + }, + { + "entropy": 1.787447765469551, + "epoch": 0.44055956520627204, + "grad_norm": 2.5302090644836426, + "learning_rate": 3.811464595747688e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8622151434421539, + "num_tokens": 170974946.0, + "step": 142120 + }, + { + "entropy": 1.7317695260047912, + "epoch": 0.44059056433132177, + "grad_norm": 4.011857509613037, + "learning_rate": 3.811330508698583e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8648561984300613, + "num_tokens": 170989419.0, + "step": 142130 + }, + { + "entropy": 1.9078683421015739, + "epoch": 0.44062156345637143, + "grad_norm": 4.096642971038818, + "learning_rate": 3.8111964358000005e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8534543007612229, + "num_tokens": 171001053.0, + "step": 142140 + }, + { + "entropy": 1.938607893884182, + "epoch": 0.44065256258142116, + "grad_norm": 8.272358894348145, + "learning_rate": 3.8110623770494515e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8414226740598678, + "num_tokens": 171012985.0, + "step": 142150 + }, + { + "entropy": 1.9575382590293884, + "epoch": 0.4406835617064708, + "grad_norm": 8.397919654846191, + "learning_rate": 3.8109283324444484e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8492600947618485, + "num_tokens": 171023779.0, + "step": 142160 + }, + { + "entropy": 1.935109880566597, + "epoch": 0.44071456083152055, + "grad_norm": 6.095836162567139, + "learning_rate": 3.8107943019825027e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8479932472109795, + "num_tokens": 171035289.0, + "step": 142170 + }, + { + "entropy": 1.952261172235012, + "epoch": 0.4407455599565702, + "grad_norm": 7.735157489776611, + "learning_rate": 3.8106602856611296e-06, + "loss": 0.579, + "mean_token_accuracy": 0.8471168518066406, + "num_tokens": 171047459.0, + "step": 142180 + }, + { + "entropy": 1.929404976963997, + "epoch": 0.44077655908161995, + "grad_norm": 3.8500773906707764, + "learning_rate": 3.8105262834778404e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8393190920352935, + "num_tokens": 171059462.0, + "step": 142190 + }, + { + "entropy": 1.8857171356678009, + "epoch": 0.4408075582066696, + "grad_norm": 8.527490615844727, + "learning_rate": 3.8103922954301514e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8652898028492928, + "num_tokens": 171071937.0, + "step": 142200 + }, + { + "entropy": 1.9904403433203697, + "epoch": 0.44083855733171934, + "grad_norm": 9.216706275939941, + "learning_rate": 3.810258321515576e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8476386234164238, + "num_tokens": 171083344.0, + "step": 142210 + }, + { + "entropy": 1.8266815394163132, + "epoch": 0.440869556456769, + "grad_norm": 9.037412643432617, + "learning_rate": 3.81012436173163e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.855592280626297, + "num_tokens": 171097604.0, + "step": 142220 + }, + { + "entropy": 1.963519898056984, + "epoch": 0.4409005555818187, + "grad_norm": 7.543654441833496, + "learning_rate": 3.809990416075832e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8579617440700531, + "num_tokens": 171108370.0, + "step": 142230 + }, + { + "entropy": 1.9259283766150475, + "epoch": 0.4409315547068684, + "grad_norm": 7.956765174865723, + "learning_rate": 3.809856484545695e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8434132248163223, + "num_tokens": 171120766.0, + "step": 142240 + }, + { + "entropy": 1.9885295629501343, + "epoch": 0.44096255383191807, + "grad_norm": 8.011740684509277, + "learning_rate": 3.8097225671387384e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8473825648427009, + "num_tokens": 171131762.0, + "step": 142250 + }, + { + "entropy": 1.993890830874443, + "epoch": 0.4409935529569678, + "grad_norm": 8.500561714172363, + "learning_rate": 3.80958866385248e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8485628053545952, + "num_tokens": 171142751.0, + "step": 142260 + }, + { + "entropy": 1.937794804573059, + "epoch": 0.44102455208201746, + "grad_norm": 4.054864883422852, + "learning_rate": 3.8094547746844392e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8569764137268067, + "num_tokens": 171155116.0, + "step": 142270 + }, + { + "entropy": 1.9017036750912666, + "epoch": 0.4410555512070672, + "grad_norm": 7.951387882232666, + "learning_rate": 3.809320899632134e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8537882640957832, + "num_tokens": 171167623.0, + "step": 142280 + }, + { + "entropy": 1.9774706676602363, + "epoch": 0.44108655033211686, + "grad_norm": 9.138680458068848, + "learning_rate": 3.809187038693085e-06, + "loss": 0.5366, + "mean_token_accuracy": 0.8372975870966911, + "num_tokens": 171179230.0, + "step": 142290 + }, + { + "entropy": 1.9862707808613778, + "epoch": 0.4411175494571666, + "grad_norm": 7.265309810638428, + "learning_rate": 3.809053191864811e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8446347713470459, + "num_tokens": 171190091.0, + "step": 142300 + }, + { + "entropy": 1.9375395745038986, + "epoch": 0.44114854858221625, + "grad_norm": 8.636645317077637, + "learning_rate": 3.808919359144836e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8591432020068168, + "num_tokens": 171201834.0, + "step": 142310 + }, + { + "entropy": 1.905143465101719, + "epoch": 0.441179547707266, + "grad_norm": 4.498809814453125, + "learning_rate": 3.8087855405306796e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8690479651093483, + "num_tokens": 171213960.0, + "step": 142320 + }, + { + "entropy": 1.9409716859459878, + "epoch": 0.44121054683231564, + "grad_norm": 8.317008972167969, + "learning_rate": 3.8086517360198645e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8425351768732071, + "num_tokens": 171225664.0, + "step": 142330 + }, + { + "entropy": 1.9071755453944206, + "epoch": 0.44124154595736537, + "grad_norm": 7.693809986114502, + "learning_rate": 3.8085179456099135e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8560735136270523, + "num_tokens": 171238142.0, + "step": 142340 + }, + { + "entropy": 1.8364718183875084, + "epoch": 0.44127254508241504, + "grad_norm": 5.97056245803833, + "learning_rate": 3.80838416929835e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8731970608234405, + "num_tokens": 171251102.0, + "step": 142350 + }, + { + "entropy": 1.8636333227157593, + "epoch": 0.44130354420746476, + "grad_norm": 4.0348920822143555, + "learning_rate": 3.8082504070826986e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.863330303132534, + "num_tokens": 171263656.0, + "step": 142360 + }, + { + "entropy": 1.8688743382692337, + "epoch": 0.44133454333251443, + "grad_norm": 8.717557907104492, + "learning_rate": 3.8081166589604846e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8605389028787613, + "num_tokens": 171276463.0, + "step": 142370 + }, + { + "entropy": 1.851096446812153, + "epoch": 0.44136554245756415, + "grad_norm": 7.275476455688477, + "learning_rate": 3.8079829249292312e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8582789734005928, + "num_tokens": 171289605.0, + "step": 142380 + }, + { + "entropy": 1.966257530450821, + "epoch": 0.4413965415826138, + "grad_norm": 9.199238777160645, + "learning_rate": 3.8078492049864664e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8538633704185485, + "num_tokens": 171300168.0, + "step": 142390 + }, + { + "entropy": 1.7878931298851968, + "epoch": 0.44142754070766355, + "grad_norm": 3.798326253890991, + "learning_rate": 3.8077154991297155e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.863753379881382, + "num_tokens": 171314188.0, + "step": 142400 + }, + { + "entropy": 1.9457097873091698, + "epoch": 0.4414585398327132, + "grad_norm": 7.9168853759765625, + "learning_rate": 3.8075818073565064e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8465751528739929, + "num_tokens": 171326103.0, + "step": 142410 + }, + { + "entropy": 1.9705891758203506, + "epoch": 0.44148953895776294, + "grad_norm": 9.16375732421875, + "learning_rate": 3.8074481296643662e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.860154564678669, + "num_tokens": 171337413.0, + "step": 142420 + }, + { + "entropy": 1.942754976451397, + "epoch": 0.4415205380828126, + "grad_norm": 8.25278091430664, + "learning_rate": 3.807314466050824e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8554434284567833, + "num_tokens": 171349655.0, + "step": 142430 + }, + { + "entropy": 1.9804495930671693, + "epoch": 0.44155153720786233, + "grad_norm": 7.4566144943237305, + "learning_rate": 3.8071808165134087e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.848778210580349, + "num_tokens": 171360943.0, + "step": 142440 + }, + { + "entropy": 1.8187919482588768, + "epoch": 0.441582536332912, + "grad_norm": 7.4357686042785645, + "learning_rate": 3.807047181049649e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8577719181776047, + "num_tokens": 171374928.0, + "step": 142450 + }, + { + "entropy": 1.9486502900719642, + "epoch": 0.4416135354579617, + "grad_norm": 8.6198148727417, + "learning_rate": 3.8069135596570757e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8480415970087052, + "num_tokens": 171387021.0, + "step": 142460 + }, + { + "entropy": 1.7527381375432014, + "epoch": 0.4416445345830114, + "grad_norm": 7.5862135887146, + "learning_rate": 3.806779952333219e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8725428983569146, + "num_tokens": 171401592.0, + "step": 142470 + }, + { + "entropy": 1.9407009825110435, + "epoch": 0.44167553370806106, + "grad_norm": 7.653254985809326, + "learning_rate": 3.806646359075612e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8616171032190323, + "num_tokens": 171413307.0, + "step": 142480 + }, + { + "entropy": 1.8926226265728474, + "epoch": 0.4417065328331108, + "grad_norm": 7.93331241607666, + "learning_rate": 3.806512779881785e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8604073897004128, + "num_tokens": 171427027.0, + "step": 142490 + }, + { + "entropy": 1.9086723767220974, + "epoch": 0.44173753195816046, + "grad_norm": 8.239239692687988, + "learning_rate": 3.806379214749271e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8534449085593223, + "num_tokens": 171440127.0, + "step": 142500 + }, + { + "entropy": 1.888797688484192, + "epoch": 0.4417685310832102, + "grad_norm": 8.310606956481934, + "learning_rate": 3.8062456636756035e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.871643802523613, + "num_tokens": 171452073.0, + "step": 142510 + }, + { + "entropy": 1.8726611629128456, + "epoch": 0.44179953020825985, + "grad_norm": 8.1859769821167, + "learning_rate": 3.8061121266583157e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8519933164119721, + "num_tokens": 171465272.0, + "step": 142520 + }, + { + "entropy": 1.9565669789910316, + "epoch": 0.4418305293333096, + "grad_norm": 7.003886699676514, + "learning_rate": 3.805978603694943e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8577442169189453, + "num_tokens": 171476604.0, + "step": 142530 + }, + { + "entropy": 1.969398957490921, + "epoch": 0.44186152845835924, + "grad_norm": 7.602597713470459, + "learning_rate": 3.80584509478302e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8602959275245666, + "num_tokens": 171487864.0, + "step": 142540 + }, + { + "entropy": 1.9313815072178842, + "epoch": 0.44189252758340897, + "grad_norm": 7.627027988433838, + "learning_rate": 3.805711599920083e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8534037470817566, + "num_tokens": 171500018.0, + "step": 142550 + }, + { + "entropy": 1.9397203966975212, + "epoch": 0.44192352670845864, + "grad_norm": 8.17835521697998, + "learning_rate": 3.805578119103666e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8493491873145104, + "num_tokens": 171511356.0, + "step": 142560 + }, + { + "entropy": 1.8530723094940185, + "epoch": 0.44195452583350836, + "grad_norm": 7.996511459350586, + "learning_rate": 3.8054446523313083e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8453282520174981, + "num_tokens": 171524204.0, + "step": 142570 + }, + { + "entropy": 1.839161404967308, + "epoch": 0.44198552495855803, + "grad_norm": 4.661229133605957, + "learning_rate": 3.8053111996005454e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.859870445728302, + "num_tokens": 171537124.0, + "step": 142580 + }, + { + "entropy": 1.8566135600209237, + "epoch": 0.44201652408360775, + "grad_norm": 5.89265775680542, + "learning_rate": 3.8051777609089174e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8573786094784737, + "num_tokens": 171550107.0, + "step": 142590 + }, + { + "entropy": 1.8885900154709816, + "epoch": 0.4420475232086574, + "grad_norm": 7.972777366638184, + "learning_rate": 3.805044336253962e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8563668861985206, + "num_tokens": 171562634.0, + "step": 142600 + }, + { + "entropy": 1.9973913908004761, + "epoch": 0.44207852233370715, + "grad_norm": 7.949423789978027, + "learning_rate": 3.8049109256332184e-06, + "loss": 0.5469, + "mean_token_accuracy": 0.8333679780364036, + "num_tokens": 171574739.0, + "step": 142610 + }, + { + "entropy": 1.9654783993959426, + "epoch": 0.4421095214587568, + "grad_norm": 8.60360050201416, + "learning_rate": 3.804777529044226e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8631428495049477, + "num_tokens": 171585950.0, + "step": 142620 + }, + { + "entropy": 1.9139345183968544, + "epoch": 0.44214052058380654, + "grad_norm": 8.448734283447266, + "learning_rate": 3.804644146484526e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.850536921620369, + "num_tokens": 171597859.0, + "step": 142630 + }, + { + "entropy": 1.8951680943369866, + "epoch": 0.4421715197088562, + "grad_norm": 7.874819278717041, + "learning_rate": 3.8045107779516586e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8422896340489388, + "num_tokens": 171610294.0, + "step": 142640 + }, + { + "entropy": 1.9419591963291167, + "epoch": 0.44220251883390593, + "grad_norm": 3.6570918560028076, + "learning_rate": 3.8043774234431667e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8482140868902206, + "num_tokens": 171621850.0, + "step": 142650 + }, + { + "entropy": 1.8761625073850154, + "epoch": 0.4422335179589556, + "grad_norm": 8.101895332336426, + "learning_rate": 3.804244082956592e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8565763145685196, + "num_tokens": 171634316.0, + "step": 142660 + }, + { + "entropy": 1.9388285502791405, + "epoch": 0.4422645170840053, + "grad_norm": 3.6656668186187744, + "learning_rate": 3.8041107564894777e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8403656214475632, + "num_tokens": 171646140.0, + "step": 142670 + }, + { + "entropy": 1.9169174507260323, + "epoch": 0.442295516209055, + "grad_norm": 7.208846092224121, + "learning_rate": 3.803977444039366e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8567602500319481, + "num_tokens": 171657641.0, + "step": 142680 + }, + { + "entropy": 1.7894317671656608, + "epoch": 0.4423265153341047, + "grad_norm": 4.454036712646484, + "learning_rate": 3.803844145603802e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8561735481023789, + "num_tokens": 171671211.0, + "step": 142690 + }, + { + "entropy": 1.924549500644207, + "epoch": 0.4423575144591544, + "grad_norm": 4.878185272216797, + "learning_rate": 3.803710861180331e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8529748469591141, + "num_tokens": 171683021.0, + "step": 142700 + }, + { + "entropy": 1.926971609890461, + "epoch": 0.4423885135842041, + "grad_norm": 8.438478469848633, + "learning_rate": 3.803577590766498e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8476522505283356, + "num_tokens": 171694922.0, + "step": 142710 + }, + { + "entropy": 1.869713193178177, + "epoch": 0.4424195127092538, + "grad_norm": 8.490039825439453, + "learning_rate": 3.8034443343598484e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8578268960118294, + "num_tokens": 171707523.0, + "step": 142720 + }, + { + "entropy": 1.89549450725317, + "epoch": 0.44245051183430345, + "grad_norm": 3.9622802734375, + "learning_rate": 3.8033110919579275e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8647204831242561, + "num_tokens": 171719274.0, + "step": 142730 + }, + { + "entropy": 1.9119060546159745, + "epoch": 0.4424815109593532, + "grad_norm": 10.171487808227539, + "learning_rate": 3.8031778635582846e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8421938866376877, + "num_tokens": 171731484.0, + "step": 142740 + }, + { + "entropy": 1.9421584740281106, + "epoch": 0.44251251008440284, + "grad_norm": 7.809762001037598, + "learning_rate": 3.803044649158467e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8524343490600585, + "num_tokens": 171742555.0, + "step": 142750 + }, + { + "entropy": 1.8323628097772597, + "epoch": 0.44254350920945257, + "grad_norm": 3.0407345294952393, + "learning_rate": 3.802911448756022e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8646909952163696, + "num_tokens": 171756753.0, + "step": 142760 + }, + { + "entropy": 1.949809417128563, + "epoch": 0.44257450833450224, + "grad_norm": 8.330705642700195, + "learning_rate": 3.8027782623484994e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8445712998509407, + "num_tokens": 171768227.0, + "step": 142770 + }, + { + "entropy": 1.8988068416714667, + "epoch": 0.44260550745955196, + "grad_norm": 7.558119297027588, + "learning_rate": 3.802645089933448e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8599656626582146, + "num_tokens": 171779928.0, + "step": 142780 + }, + { + "entropy": 1.8191672816872597, + "epoch": 0.44263650658460163, + "grad_norm": 8.800172805786133, + "learning_rate": 3.8025119315084186e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8557739660143853, + "num_tokens": 171793531.0, + "step": 142790 + }, + { + "entropy": 1.912230722606182, + "epoch": 0.44266750570965135, + "grad_norm": 8.029808044433594, + "learning_rate": 3.802378787070961e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8631566151976585, + "num_tokens": 171804841.0, + "step": 142800 + }, + { + "entropy": 1.9423863098025322, + "epoch": 0.442698504834701, + "grad_norm": 7.330270290374756, + "learning_rate": 3.8022456566186274e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8568160951137542, + "num_tokens": 171816045.0, + "step": 142810 + }, + { + "entropy": 1.909077961742878, + "epoch": 0.44272950395975075, + "grad_norm": 8.023503303527832, + "learning_rate": 3.8021125401489695e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8499825984239578, + "num_tokens": 171827873.0, + "step": 142820 + }, + { + "entropy": 1.914843738079071, + "epoch": 0.4427605030848004, + "grad_norm": 7.886246681213379, + "learning_rate": 3.8019794376595393e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.854096032679081, + "num_tokens": 171839212.0, + "step": 142830 + }, + { + "entropy": 1.747120851278305, + "epoch": 0.44279150220985014, + "grad_norm": 4.407447338104248, + "learning_rate": 3.8018463491478903e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8662238627672195, + "num_tokens": 171853351.0, + "step": 142840 + }, + { + "entropy": 1.9104099452495575, + "epoch": 0.4428225013348998, + "grad_norm": 7.366025924682617, + "learning_rate": 3.801713274611577e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8494726955890656, + "num_tokens": 171865251.0, + "step": 142850 + }, + { + "entropy": 1.9125598162412643, + "epoch": 0.44285350045994953, + "grad_norm": 8.276859283447266, + "learning_rate": 3.8015802140481523e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8597826391458512, + "num_tokens": 171877204.0, + "step": 142860 + }, + { + "entropy": 1.9104377076029777, + "epoch": 0.4428844995849992, + "grad_norm": 3.7604010105133057, + "learning_rate": 3.801447167455171e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8584300577640533, + "num_tokens": 171889490.0, + "step": 142870 + }, + { + "entropy": 1.7467719167470932, + "epoch": 0.4429154987100489, + "grad_norm": 2.625401258468628, + "learning_rate": 3.8013141348301903e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8759482517838478, + "num_tokens": 171904139.0, + "step": 142880 + }, + { + "entropy": 1.8149724557995797, + "epoch": 0.4429464978350986, + "grad_norm": 7.951505184173584, + "learning_rate": 3.801181116170765e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8639876395463943, + "num_tokens": 171918069.0, + "step": 142890 + }, + { + "entropy": 1.9812359169125557, + "epoch": 0.4429774969601483, + "grad_norm": 6.9604926109313965, + "learning_rate": 3.801048111474452e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8483646467328072, + "num_tokens": 171929606.0, + "step": 142900 + }, + { + "entropy": 1.9642615109682082, + "epoch": 0.443008496085198, + "grad_norm": 7.556371688842773, + "learning_rate": 3.800915120738809e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8475595816969872, + "num_tokens": 171941190.0, + "step": 142910 + }, + { + "entropy": 1.8470994770526885, + "epoch": 0.4430394952102477, + "grad_norm": 7.835377216339111, + "learning_rate": 3.8007821439613928e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8662957057356835, + "num_tokens": 171954065.0, + "step": 142920 + }, + { + "entropy": 1.8816244021058082, + "epoch": 0.4430704943352974, + "grad_norm": 3.198092222213745, + "learning_rate": 3.8006491811397643e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8502931743860245, + "num_tokens": 171966243.0, + "step": 142930 + }, + { + "entropy": 1.8951477900147438, + "epoch": 0.4431014934603471, + "grad_norm": 6.3186750411987305, + "learning_rate": 3.80051623227148e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8593979939818382, + "num_tokens": 171979052.0, + "step": 142940 + }, + { + "entropy": 1.939696778357029, + "epoch": 0.4431324925853968, + "grad_norm": 9.299203872680664, + "learning_rate": 3.8003832973541e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8494555294513703, + "num_tokens": 171990440.0, + "step": 142950 + }, + { + "entropy": 1.8888702988624573, + "epoch": 0.4431634917104465, + "grad_norm": 3.8916590213775635, + "learning_rate": 3.800250376385186e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8480198219418525, + "num_tokens": 172002952.0, + "step": 142960 + }, + { + "entropy": 1.9692298248410225, + "epoch": 0.44319449083549617, + "grad_norm": 7.702281475067139, + "learning_rate": 3.8001174693622976e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8490083158016205, + "num_tokens": 172013634.0, + "step": 142970 + }, + { + "entropy": 1.796003994345665, + "epoch": 0.44322548996054584, + "grad_norm": 2.3298850059509277, + "learning_rate": 3.7999845762829975e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8645777225494384, + "num_tokens": 172027808.0, + "step": 142980 + }, + { + "entropy": 1.8192441150546075, + "epoch": 0.44325648908559556, + "grad_norm": 8.177521705627441, + "learning_rate": 3.7998516971448463e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8649537086486816, + "num_tokens": 172041098.0, + "step": 142990 + }, + { + "entropy": 1.7670658022165298, + "epoch": 0.44328748821064523, + "grad_norm": 3.8939199447631836, + "learning_rate": 3.7997188319454082e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8773150816559792, + "num_tokens": 172054398.0, + "step": 143000 + }, + { + "entropy": 1.839756967127323, + "epoch": 0.44331848733569496, + "grad_norm": 8.327499389648438, + "learning_rate": 3.7995859806822448e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8523253813385964, + "num_tokens": 172067290.0, + "step": 143010 + }, + { + "entropy": 1.9034264922142028, + "epoch": 0.4433494864607446, + "grad_norm": 7.8952531814575195, + "learning_rate": 3.7994531433529215e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.852616871893406, + "num_tokens": 172079439.0, + "step": 143020 + }, + { + "entropy": 1.9509797424077988, + "epoch": 0.44338048558579435, + "grad_norm": 8.539716720581055, + "learning_rate": 3.7993203199550016e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8532881319522858, + "num_tokens": 172090400.0, + "step": 143030 + }, + { + "entropy": 1.8199230879545212, + "epoch": 0.443411484710844, + "grad_norm": 7.303118705749512, + "learning_rate": 3.7991875104860506e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8639900296926498, + "num_tokens": 172103631.0, + "step": 143040 + }, + { + "entropy": 1.9047269806265832, + "epoch": 0.44344248383589374, + "grad_norm": 3.573137044906616, + "learning_rate": 3.799054714943634e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8529732450842857, + "num_tokens": 172115897.0, + "step": 143050 + }, + { + "entropy": 1.927889946103096, + "epoch": 0.4434734829609434, + "grad_norm": 6.2143635749816895, + "learning_rate": 3.798921933325319e-06, + "loss": 0.5202, + "mean_token_accuracy": 0.845208078622818, + "num_tokens": 172127650.0, + "step": 143060 + }, + { + "entropy": 1.8747559115290642, + "epoch": 0.44350448208599313, + "grad_norm": 8.309805870056152, + "learning_rate": 3.798789165628671e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.849570831656456, + "num_tokens": 172139526.0, + "step": 143070 + }, + { + "entropy": 1.8134203180670738, + "epoch": 0.4435354812110428, + "grad_norm": 10.744686126708984, + "learning_rate": 3.7986564118512593e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8602747023105621, + "num_tokens": 172152170.0, + "step": 143080 + }, + { + "entropy": 1.920543058216572, + "epoch": 0.44356648033609253, + "grad_norm": 8.824508666992188, + "learning_rate": 3.79852367199065e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8488981783390045, + "num_tokens": 172163758.0, + "step": 143090 + }, + { + "entropy": 1.95814551115036, + "epoch": 0.4435974794611422, + "grad_norm": 5.424408912658691, + "learning_rate": 3.798390946044413e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8507681146264077, + "num_tokens": 172175098.0, + "step": 143100 + }, + { + "entropy": 1.9256742283701898, + "epoch": 0.4436284785861919, + "grad_norm": 8.446990966796875, + "learning_rate": 3.7982582340101166e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8471813514828682, + "num_tokens": 172186173.0, + "step": 143110 + }, + { + "entropy": 1.914665700495243, + "epoch": 0.4436594777112416, + "grad_norm": 4.053114414215088, + "learning_rate": 3.7981255358853308e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8550711467862129, + "num_tokens": 172198247.0, + "step": 143120 + }, + { + "entropy": 1.9240732803940772, + "epoch": 0.4436904768362913, + "grad_norm": 6.92840576171875, + "learning_rate": 3.7979928516676266e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8500184774398803, + "num_tokens": 172210111.0, + "step": 143130 + }, + { + "entropy": 1.9857992932200432, + "epoch": 0.443721475961341, + "grad_norm": 9.123396873474121, + "learning_rate": 3.7978601813545756e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.8403029024600983, + "num_tokens": 172221785.0, + "step": 143140 + }, + { + "entropy": 1.88204335719347, + "epoch": 0.4437524750863907, + "grad_norm": 4.223636627197266, + "learning_rate": 3.797727524943748e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8584297001361847, + "num_tokens": 172234848.0, + "step": 143150 + }, + { + "entropy": 1.8985393255949021, + "epoch": 0.4437834742114404, + "grad_norm": 10.910396575927734, + "learning_rate": 3.797594882432716e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.856754156947136, + "num_tokens": 172247385.0, + "step": 143160 + }, + { + "entropy": 1.9117623910307884, + "epoch": 0.4438144733364901, + "grad_norm": 7.727200508117676, + "learning_rate": 3.7974622538190527e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.845762537419796, + "num_tokens": 172260013.0, + "step": 143170 + }, + { + "entropy": 1.9631491884589196, + "epoch": 0.44384547246153977, + "grad_norm": 8.514778137207031, + "learning_rate": 3.7973296391003324e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8480650544166565, + "num_tokens": 172271771.0, + "step": 143180 + }, + { + "entropy": 1.8932307675480842, + "epoch": 0.4438764715865895, + "grad_norm": 8.17857551574707, + "learning_rate": 3.797197038274128e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8522116959095001, + "num_tokens": 172284879.0, + "step": 143190 + }, + { + "entropy": 2.053502270579338, + "epoch": 0.44390747071163916, + "grad_norm": 8.686943054199219, + "learning_rate": 3.7970644513380144e-06, + "loss": 0.5244, + "mean_token_accuracy": 0.8443288967013359, + "num_tokens": 172295803.0, + "step": 143200 + }, + { + "entropy": 2.0258347660303118, + "epoch": 0.4439384698366889, + "grad_norm": 8.006467819213867, + "learning_rate": 3.7969318782895674e-06, + "loss": 0.536, + "mean_token_accuracy": 0.8328389957547188, + "num_tokens": 172306930.0, + "step": 143210 + }, + { + "entropy": 1.9896105587482453, + "epoch": 0.44396946896173856, + "grad_norm": 3.218825340270996, + "learning_rate": 3.7967993191263625e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8439623191952705, + "num_tokens": 172318217.0, + "step": 143220 + }, + { + "entropy": 1.9521566152572631, + "epoch": 0.4440004680867882, + "grad_norm": 7.5543389320373535, + "learning_rate": 3.7966667738459746e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8609140679240227, + "num_tokens": 172330536.0, + "step": 143230 + }, + { + "entropy": 1.9294056311249732, + "epoch": 0.44403146721183795, + "grad_norm": 9.052746772766113, + "learning_rate": 3.7965342424459822e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8596820458769798, + "num_tokens": 172341999.0, + "step": 143240 + }, + { + "entropy": 1.869072140753269, + "epoch": 0.4440624663368876, + "grad_norm": 7.041997909545898, + "learning_rate": 3.7964017249239623e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8588750317692757, + "num_tokens": 172355292.0, + "step": 143250 + }, + { + "entropy": 1.9508719503879548, + "epoch": 0.44409346546193734, + "grad_norm": 4.241085529327393, + "learning_rate": 3.796269221277493e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8481484889984131, + "num_tokens": 172367036.0, + "step": 143260 + }, + { + "entropy": 1.9314745679497718, + "epoch": 0.444124464586987, + "grad_norm": 4.061893939971924, + "learning_rate": 3.7961367315041535e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8549939930438996, + "num_tokens": 172379192.0, + "step": 143270 + }, + { + "entropy": 1.9073988541960716, + "epoch": 0.44415546371203674, + "grad_norm": 7.23845100402832, + "learning_rate": 3.7960042556015226e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8475894257426262, + "num_tokens": 172391173.0, + "step": 143280 + }, + { + "entropy": 1.9345348447561264, + "epoch": 0.4441864628370864, + "grad_norm": 8.21992015838623, + "learning_rate": 3.79587179356718e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8488015040755272, + "num_tokens": 172403416.0, + "step": 143290 + }, + { + "entropy": 1.9360866978764535, + "epoch": 0.44421746196213613, + "grad_norm": 6.462368011474609, + "learning_rate": 3.7957393453987075e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8466382816433906, + "num_tokens": 172415684.0, + "step": 143300 + }, + { + "entropy": 1.9160262137651443, + "epoch": 0.4442484610871858, + "grad_norm": 6.9671454429626465, + "learning_rate": 3.795606911093684e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8536192387342453, + "num_tokens": 172427475.0, + "step": 143310 + }, + { + "entropy": 1.9292744249105453, + "epoch": 0.4442794602122355, + "grad_norm": 3.4620540142059326, + "learning_rate": 3.7954744906496932e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8594009026885032, + "num_tokens": 172439631.0, + "step": 143320 + }, + { + "entropy": 1.9033573985099792, + "epoch": 0.4443104593372852, + "grad_norm": 8.030845642089844, + "learning_rate": 3.795342084064316e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.86111601293087, + "num_tokens": 172451878.0, + "step": 143330 + }, + { + "entropy": 1.851330216228962, + "epoch": 0.4443414584623349, + "grad_norm": 8.43782901763916, + "learning_rate": 3.795209691335136e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8640111222863197, + "num_tokens": 172465066.0, + "step": 143340 + }, + { + "entropy": 1.991729763150215, + "epoch": 0.4443724575873846, + "grad_norm": 7.826473236083984, + "learning_rate": 3.7950773124597375e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8467871636152268, + "num_tokens": 172476252.0, + "step": 143350 + }, + { + "entropy": 1.9685329020023346, + "epoch": 0.4444034567124343, + "grad_norm": 8.118627548217773, + "learning_rate": 3.794944947435702e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8508369252085686, + "num_tokens": 172487279.0, + "step": 143360 + }, + { + "entropy": 2.045986759662628, + "epoch": 0.444434455837484, + "grad_norm": 8.419778823852539, + "learning_rate": 3.794812596260616e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8484291791915893, + "num_tokens": 172498181.0, + "step": 143370 + }, + { + "entropy": 1.9107683286070825, + "epoch": 0.4444654549625337, + "grad_norm": 8.121953964233398, + "learning_rate": 3.794680258932064e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8511162519454956, + "num_tokens": 172510612.0, + "step": 143380 + }, + { + "entropy": 1.9096698313951492, + "epoch": 0.44449645408758337, + "grad_norm": 9.873578071594238, + "learning_rate": 3.7945479354476334e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.867861407995224, + "num_tokens": 172522684.0, + "step": 143390 + }, + { + "entropy": 1.9415227890014648, + "epoch": 0.4445274532126331, + "grad_norm": 9.934027671813965, + "learning_rate": 3.794415625804908e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8525919511914253, + "num_tokens": 172533993.0, + "step": 143400 + }, + { + "entropy": 1.9405535489320755, + "epoch": 0.44455845233768276, + "grad_norm": 7.328090667724609, + "learning_rate": 3.794283330001477e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8587682485580445, + "num_tokens": 172545914.0, + "step": 143410 + }, + { + "entropy": 1.984012272953987, + "epoch": 0.4445894514627325, + "grad_norm": 8.643956184387207, + "learning_rate": 3.7941510480349265e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8469142064452171, + "num_tokens": 172556997.0, + "step": 143420 + }, + { + "entropy": 1.9300741687417031, + "epoch": 0.44462045058778216, + "grad_norm": 4.657692909240723, + "learning_rate": 3.794018779902845e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.860441392660141, + "num_tokens": 172568929.0, + "step": 143430 + }, + { + "entropy": 1.8880173966288567, + "epoch": 0.4446514497128319, + "grad_norm": 8.421919822692871, + "learning_rate": 3.793886525602822e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8604597702622414, + "num_tokens": 172580791.0, + "step": 143440 + }, + { + "entropy": 1.933186987042427, + "epoch": 0.44468244883788155, + "grad_norm": 3.5404715538024902, + "learning_rate": 3.793754285132446e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8485592529177666, + "num_tokens": 172593075.0, + "step": 143450 + }, + { + "entropy": 1.950406338274479, + "epoch": 0.4447134479629312, + "grad_norm": 8.130172729492188, + "learning_rate": 3.7936220584893074e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8546596810221672, + "num_tokens": 172604452.0, + "step": 143460 + }, + { + "entropy": 1.9698896750807762, + "epoch": 0.44474444708798094, + "grad_norm": 8.126727104187012, + "learning_rate": 3.7934898456709963e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8500417619943619, + "num_tokens": 172616103.0, + "step": 143470 + }, + { + "entropy": 1.8846001833677293, + "epoch": 0.4447754462130306, + "grad_norm": 7.103678226470947, + "learning_rate": 3.793357646675104e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8594605028629303, + "num_tokens": 172627939.0, + "step": 143480 + }, + { + "entropy": 1.9624834671616553, + "epoch": 0.44480644533808034, + "grad_norm": 7.368971824645996, + "learning_rate": 3.7932254614992225e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.853089140355587, + "num_tokens": 172639602.0, + "step": 143490 + }, + { + "entropy": 2.0040226548910143, + "epoch": 0.44483744446313, + "grad_norm": 8.010761260986328, + "learning_rate": 3.7930932901409443e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.8468669816851616, + "num_tokens": 172650585.0, + "step": 143500 + }, + { + "entropy": 1.9335386529564857, + "epoch": 0.44486844358817973, + "grad_norm": 8.675997734069824, + "learning_rate": 3.7929611325978612e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8535260155797004, + "num_tokens": 172662166.0, + "step": 143510 + }, + { + "entropy": 1.975813153386116, + "epoch": 0.4448994427132294, + "grad_norm": 9.182839393615723, + "learning_rate": 3.7928289888675668e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.855238126218319, + "num_tokens": 172673577.0, + "step": 143520 + }, + { + "entropy": 1.982006560266018, + "epoch": 0.4449304418382791, + "grad_norm": 8.914770126342773, + "learning_rate": 3.7926968589476558e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8514761239290237, + "num_tokens": 172685618.0, + "step": 143530 + }, + { + "entropy": 2.004455064237118, + "epoch": 0.4449614409633288, + "grad_norm": 8.590579986572266, + "learning_rate": 3.7925647428357226e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8540130868554116, + "num_tokens": 172697021.0, + "step": 143540 + }, + { + "entropy": 2.02477196007967, + "epoch": 0.4449924400883785, + "grad_norm": 8.24437427520752, + "learning_rate": 3.7924326405293627e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8468568831682205, + "num_tokens": 172708199.0, + "step": 143550 + }, + { + "entropy": 1.9295995250344276, + "epoch": 0.4450234392134282, + "grad_norm": 4.574173927307129, + "learning_rate": 3.7923005520261708e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8547417059540748, + "num_tokens": 172720296.0, + "step": 143560 + }, + { + "entropy": 1.7906879603862762, + "epoch": 0.4450544383384779, + "grad_norm": 8.181215286254883, + "learning_rate": 3.792168477323745e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8623343601822853, + "num_tokens": 172734098.0, + "step": 143570 + }, + { + "entropy": 1.7868395507335664, + "epoch": 0.4450854374635276, + "grad_norm": 4.160261631011963, + "learning_rate": 3.7920364164196805e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8650955274701119, + "num_tokens": 172748197.0, + "step": 143580 + }, + { + "entropy": 1.906904463469982, + "epoch": 0.4451164365885773, + "grad_norm": 7.085610389709473, + "learning_rate": 3.7919043693115757e-06, + "loss": 0.425, + "mean_token_accuracy": 0.857354860007763, + "num_tokens": 172760940.0, + "step": 143590 + }, + { + "entropy": 1.8628742694854736, + "epoch": 0.44514743571362697, + "grad_norm": 7.324219226837158, + "learning_rate": 3.7917723359970293e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8632989749312401, + "num_tokens": 172773595.0, + "step": 143600 + }, + { + "entropy": 1.9510155633091926, + "epoch": 0.4451784348386767, + "grad_norm": 7.612024784088135, + "learning_rate": 3.791640316473639e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8439678400754929, + "num_tokens": 172784596.0, + "step": 143610 + }, + { + "entropy": 1.9409842744469643, + "epoch": 0.44520943396372636, + "grad_norm": 7.95268440246582, + "learning_rate": 3.791508310739005e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8481158643960953, + "num_tokens": 172796663.0, + "step": 143620 + }, + { + "entropy": 1.8901426151394844, + "epoch": 0.4452404330887761, + "grad_norm": 7.622109889984131, + "learning_rate": 3.7913763187907265e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8628189340233803, + "num_tokens": 172809208.0, + "step": 143630 + }, + { + "entropy": 1.9054123714566231, + "epoch": 0.44527143221382576, + "grad_norm": 8.614324569702148, + "learning_rate": 3.791244340626404e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8581098973751068, + "num_tokens": 172821386.0, + "step": 143640 + }, + { + "entropy": 1.8443352833390236, + "epoch": 0.4453024313388755, + "grad_norm": 4.017133712768555, + "learning_rate": 3.7911123762436393e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8598392441868782, + "num_tokens": 172834753.0, + "step": 143650 + }, + { + "entropy": 1.9353301167488097, + "epoch": 0.44533343046392515, + "grad_norm": 4.68195104598999, + "learning_rate": 3.7909804256400327e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8590727686882019, + "num_tokens": 172846687.0, + "step": 143660 + }, + { + "entropy": 2.013177511096001, + "epoch": 0.4453644295889749, + "grad_norm": 8.711843490600586, + "learning_rate": 3.7908484888131875e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8534877151250839, + "num_tokens": 172857233.0, + "step": 143670 + }, + { + "entropy": 1.9266994565725326, + "epoch": 0.44539542871402454, + "grad_norm": 8.333199501037598, + "learning_rate": 3.7907165657607066e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8665392875671387, + "num_tokens": 172869433.0, + "step": 143680 + }, + { + "entropy": 1.9714792117476463, + "epoch": 0.44542642783907427, + "grad_norm": 8.7294340133667, + "learning_rate": 3.7905846564801923e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8482206985354424, + "num_tokens": 172880658.0, + "step": 143690 + }, + { + "entropy": 1.8752370357513428, + "epoch": 0.44545742696412394, + "grad_norm": 8.233158111572266, + "learning_rate": 3.7904527609692506e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8617415338754654, + "num_tokens": 172892973.0, + "step": 143700 + }, + { + "entropy": 2.0074197858572007, + "epoch": 0.4454884260891736, + "grad_norm": 7.99542760848999, + "learning_rate": 3.7903208792254844e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8521378964185715, + "num_tokens": 172904130.0, + "step": 143710 + }, + { + "entropy": 1.99031350761652, + "epoch": 0.44551942521422333, + "grad_norm": 8.73692798614502, + "learning_rate": 3.7901890112464982e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8600781843066215, + "num_tokens": 172915284.0, + "step": 143720 + }, + { + "entropy": 1.9789996802806855, + "epoch": 0.445550424339273, + "grad_norm": 9.458690643310547, + "learning_rate": 3.7900571570299e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8530536726117134, + "num_tokens": 172926970.0, + "step": 143730 + }, + { + "entropy": 1.834419848024845, + "epoch": 0.4455814234643227, + "grad_norm": 4.028777599334717, + "learning_rate": 3.789925316573294e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8613479465246201, + "num_tokens": 172939899.0, + "step": 143740 + }, + { + "entropy": 1.945051771402359, + "epoch": 0.4456124225893724, + "grad_norm": 4.0096306800842285, + "learning_rate": 3.7897934898742885e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8497995778918266, + "num_tokens": 172951805.0, + "step": 143750 + }, + { + "entropy": 1.8734006211161613, + "epoch": 0.4456434217144221, + "grad_norm": 3.48857045173645, + "learning_rate": 3.7896616769304905e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8659715816378594, + "num_tokens": 172964673.0, + "step": 143760 + }, + { + "entropy": 1.9327120184898376, + "epoch": 0.4456744208394718, + "grad_norm": 8.14692497253418, + "learning_rate": 3.7895298777395078e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8533953115344047, + "num_tokens": 172975682.0, + "step": 143770 + }, + { + "entropy": 1.9580781385302544, + "epoch": 0.4457054199645215, + "grad_norm": 8.688072204589844, + "learning_rate": 3.7893980922989496e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.839683572947979, + "num_tokens": 172987936.0, + "step": 143780 + }, + { + "entropy": 1.908944171667099, + "epoch": 0.4457364190895712, + "grad_norm": 3.971160888671875, + "learning_rate": 3.7892663206064244e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8549481183290482, + "num_tokens": 173000930.0, + "step": 143790 + }, + { + "entropy": 2.0028712913393973, + "epoch": 0.4457674182146209, + "grad_norm": 8.951005935668945, + "learning_rate": 3.789134562659543e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.845504654943943, + "num_tokens": 173012916.0, + "step": 143800 + }, + { + "entropy": 1.9837856590747833, + "epoch": 0.44579841733967057, + "grad_norm": 7.293043613433838, + "learning_rate": 3.7890028184559154e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8591191872954369, + "num_tokens": 173024193.0, + "step": 143810 + }, + { + "entropy": 1.9127214059233666, + "epoch": 0.4458294164647203, + "grad_norm": 3.8465218544006348, + "learning_rate": 3.7888710879931517e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8534463986754417, + "num_tokens": 173035733.0, + "step": 143820 + }, + { + "entropy": 1.949730084836483, + "epoch": 0.44586041558976997, + "grad_norm": 10.93399429321289, + "learning_rate": 3.788739371268865e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8493082284927368, + "num_tokens": 173047215.0, + "step": 143830 + }, + { + "entropy": 2.0025800704956054, + "epoch": 0.4458914147148197, + "grad_norm": 8.645318984985352, + "learning_rate": 3.788607668280666e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8457403063774109, + "num_tokens": 173058282.0, + "step": 143840 + }, + { + "entropy": 1.910311257839203, + "epoch": 0.44592241383986936, + "grad_norm": 3.202796220779419, + "learning_rate": 3.7884759790261683e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8490109801292419, + "num_tokens": 173070834.0, + "step": 143850 + }, + { + "entropy": 1.9238232627511025, + "epoch": 0.4459534129649191, + "grad_norm": 8.433602333068848, + "learning_rate": 3.7883443035029844e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8548846036195755, + "num_tokens": 173083400.0, + "step": 143860 + }, + { + "entropy": 1.961086443066597, + "epoch": 0.44598441208996875, + "grad_norm": 3.7970497608184814, + "learning_rate": 3.7882126417087294e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8421978339552879, + "num_tokens": 173094716.0, + "step": 143870 + }, + { + "entropy": 1.9548719599843025, + "epoch": 0.4460154112150185, + "grad_norm": 8.201971054077148, + "learning_rate": 3.788080993641017e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.8327627673745155, + "num_tokens": 173107004.0, + "step": 143880 + }, + { + "entropy": 1.8781091898679734, + "epoch": 0.44604641034006814, + "grad_norm": 3.5687875747680664, + "learning_rate": 3.7879493592974612e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8458937138319016, + "num_tokens": 173119649.0, + "step": 143890 + }, + { + "entropy": 1.9691094398498534, + "epoch": 0.44607740946511787, + "grad_norm": 10.030198097229004, + "learning_rate": 3.7878177386756796e-06, + "loss": 0.516, + "mean_token_accuracy": 0.8466410353779793, + "num_tokens": 173131103.0, + "step": 143900 + }, + { + "entropy": 2.0002141326665877, + "epoch": 0.44610840859016754, + "grad_norm": 8.030385971069336, + "learning_rate": 3.7876861317732877e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8476643145084382, + "num_tokens": 173142013.0, + "step": 143910 + }, + { + "entropy": 1.9814749881625175, + "epoch": 0.44613940771521726, + "grad_norm": 9.209390640258789, + "learning_rate": 3.787554538587902e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8513798832893371, + "num_tokens": 173153145.0, + "step": 143920 + }, + { + "entropy": 1.953411616384983, + "epoch": 0.44617040684026693, + "grad_norm": 10.11378288269043, + "learning_rate": 3.7874229591171395e-06, + "loss": 0.467, + "mean_token_accuracy": 0.845984798669815, + "num_tokens": 173165304.0, + "step": 143930 + }, + { + "entropy": 1.8993015423417092, + "epoch": 0.44620140596531666, + "grad_norm": 7.429069995880127, + "learning_rate": 3.787291393358619e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8644404307007789, + "num_tokens": 173177675.0, + "step": 143940 + }, + { + "entropy": 1.8486544996500016, + "epoch": 0.4462324050903663, + "grad_norm": 8.14201545715332, + "learning_rate": 3.7871598413099593e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8645230799913406, + "num_tokens": 173191122.0, + "step": 143950 + }, + { + "entropy": 2.023305447399616, + "epoch": 0.446263404215416, + "grad_norm": 10.556729316711426, + "learning_rate": 3.7870283029687777e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.847667895257473, + "num_tokens": 173202142.0, + "step": 143960 + }, + { + "entropy": 1.910950092971325, + "epoch": 0.4462944033404657, + "grad_norm": 7.93691349029541, + "learning_rate": 3.7868967783326956e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.856875829398632, + "num_tokens": 173214829.0, + "step": 143970 + }, + { + "entropy": 1.9166125014424324, + "epoch": 0.4463254024655154, + "grad_norm": 9.42168140411377, + "learning_rate": 3.7867652673993318e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8535972073674202, + "num_tokens": 173226844.0, + "step": 143980 + }, + { + "entropy": 1.8531441867351532, + "epoch": 0.4463564015905651, + "grad_norm": 4.800891399383545, + "learning_rate": 3.7866337701663088e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8521735593676567, + "num_tokens": 173239753.0, + "step": 143990 + }, + { + "entropy": 1.881640262901783, + "epoch": 0.4463874007156148, + "grad_norm": 9.612931251525879, + "learning_rate": 3.7865022866312468e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8631109833717346, + "num_tokens": 173252177.0, + "step": 144000 + }, + { + "entropy": 1.8542085379362105, + "epoch": 0.4464183998406645, + "grad_norm": 5.085047245025635, + "learning_rate": 3.7863708167917686e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8695595130324364, + "num_tokens": 173264694.0, + "step": 144010 + }, + { + "entropy": 2.0253135934472084, + "epoch": 0.4464493989657142, + "grad_norm": 9.341069221496582, + "learning_rate": 3.7862393606454958e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8394965454936028, + "num_tokens": 173275727.0, + "step": 144020 + }, + { + "entropy": 1.8591209024190902, + "epoch": 0.4464803980907639, + "grad_norm": 8.249305725097656, + "learning_rate": 3.786107918190052e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8542803794145584, + "num_tokens": 173288795.0, + "step": 144030 + }, + { + "entropy": 1.9186673119664193, + "epoch": 0.44651139721581357, + "grad_norm": 9.003576278686523, + "learning_rate": 3.785976489423061e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8566552370786666, + "num_tokens": 173300153.0, + "step": 144040 + }, + { + "entropy": 1.9807184368371964, + "epoch": 0.4465423963408633, + "grad_norm": 8.3126859664917, + "learning_rate": 3.785845074342148e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8607570424675941, + "num_tokens": 173311456.0, + "step": 144050 + }, + { + "entropy": 1.9973660945892333, + "epoch": 0.44657339546591296, + "grad_norm": 8.058760643005371, + "learning_rate": 3.7857136729449356e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8486983701586723, + "num_tokens": 173322684.0, + "step": 144060 + }, + { + "entropy": 1.9315310716629028, + "epoch": 0.4466043945909627, + "grad_norm": 9.327740669250488, + "learning_rate": 3.7855822852290523e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8483046740293503, + "num_tokens": 173334375.0, + "step": 144070 + }, + { + "entropy": 1.854925161600113, + "epoch": 0.44663539371601235, + "grad_norm": 8.95000171661377, + "learning_rate": 3.785450911192121e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8594622611999512, + "num_tokens": 173347587.0, + "step": 144080 + }, + { + "entropy": 1.9255126759409904, + "epoch": 0.4466663928410621, + "grad_norm": 9.044776916503906, + "learning_rate": 3.7853195508317707e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.8483678221702575, + "num_tokens": 173359548.0, + "step": 144090 + }, + { + "entropy": 1.9388061597943307, + "epoch": 0.44669739196611175, + "grad_norm": 8.007100105285645, + "learning_rate": 3.785188204145627e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8521351292729378, + "num_tokens": 173371042.0, + "step": 144100 + }, + { + "entropy": 1.9809336930513382, + "epoch": 0.44672839109116147, + "grad_norm": 8.529678344726562, + "learning_rate": 3.7850568711313192e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8555900052189827, + "num_tokens": 173382290.0, + "step": 144110 + }, + { + "entropy": 1.9656935811042786, + "epoch": 0.44675939021621114, + "grad_norm": 4.3543219566345215, + "learning_rate": 3.7849255517864743e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8531507521867752, + "num_tokens": 173393550.0, + "step": 144120 + }, + { + "entropy": 1.8287263602018355, + "epoch": 0.44679038934126086, + "grad_norm": 3.763613700866699, + "learning_rate": 3.784794246108721e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.859270915389061, + "num_tokens": 173406666.0, + "step": 144130 + }, + { + "entropy": 1.93349888920784, + "epoch": 0.44682138846631053, + "grad_norm": 8.177099227905273, + "learning_rate": 3.784662954095691e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8471749663352967, + "num_tokens": 173418993.0, + "step": 144140 + }, + { + "entropy": 1.9891784444451333, + "epoch": 0.44685238759136026, + "grad_norm": 9.054378509521484, + "learning_rate": 3.7845316757450114e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8486263379454613, + "num_tokens": 173430436.0, + "step": 144150 + }, + { + "entropy": 1.9303235545754434, + "epoch": 0.4468833867164099, + "grad_norm": 8.891922950744629, + "learning_rate": 3.7844004110543148e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8510625973343849, + "num_tokens": 173442415.0, + "step": 144160 + }, + { + "entropy": 1.94441519677639, + "epoch": 0.44691438584145965, + "grad_norm": 8.909342765808105, + "learning_rate": 3.7842691600212316e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8438147768378258, + "num_tokens": 173454807.0, + "step": 144170 + }, + { + "entropy": 2.0386078268289567, + "epoch": 0.4469453849665093, + "grad_norm": 10.605982780456543, + "learning_rate": 3.784137922643394e-06, + "loss": 0.5531, + "mean_token_accuracy": 0.8398728102445603, + "num_tokens": 173465671.0, + "step": 144180 + }, + { + "entropy": 1.9403854593634606, + "epoch": 0.44697638409155904, + "grad_norm": 7.102710723876953, + "learning_rate": 3.7840066989184345e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8576330199837685, + "num_tokens": 173478051.0, + "step": 144190 + }, + { + "entropy": 1.9223571211099624, + "epoch": 0.4470073832166087, + "grad_norm": 7.868150234222412, + "learning_rate": 3.7838754888439857e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8626696810126304, + "num_tokens": 173489382.0, + "step": 144200 + }, + { + "entropy": 1.9799730449914932, + "epoch": 0.4470383823416584, + "grad_norm": 6.588014602661133, + "learning_rate": 3.7837442924176804e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.8521138474345207, + "num_tokens": 173501335.0, + "step": 144210 + }, + { + "entropy": 1.9104075238108635, + "epoch": 0.4470693814667081, + "grad_norm": 8.533629417419434, + "learning_rate": 3.783613109637155e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8599554657936096, + "num_tokens": 173513423.0, + "step": 144220 + }, + { + "entropy": 1.884183020889759, + "epoch": 0.4471003805917578, + "grad_norm": 2.680689573287964, + "learning_rate": 3.7834819405000404e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8564264923334122, + "num_tokens": 173525703.0, + "step": 144230 + }, + { + "entropy": 1.998404061794281, + "epoch": 0.4471313797168075, + "grad_norm": 7.732211112976074, + "learning_rate": 3.7833507850039757e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8460897743701935, + "num_tokens": 173536387.0, + "step": 144240 + }, + { + "entropy": 1.888758347928524, + "epoch": 0.44716237884185717, + "grad_norm": 9.650225639343262, + "learning_rate": 3.783219643146595e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8614796027541161, + "num_tokens": 173548843.0, + "step": 144250 + }, + { + "entropy": 1.9674888044595717, + "epoch": 0.4471933779669069, + "grad_norm": 7.974256992340088, + "learning_rate": 3.7830885149255346e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8619727566838264, + "num_tokens": 173560036.0, + "step": 144260 + }, + { + "entropy": 1.854830276221037, + "epoch": 0.44722437709195656, + "grad_norm": 4.305238246917725, + "learning_rate": 3.7829574003384317e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8533999145030975, + "num_tokens": 173572981.0, + "step": 144270 + }, + { + "entropy": 1.8603426963090897, + "epoch": 0.4472553762170063, + "grad_norm": 8.774975776672363, + "learning_rate": 3.7828262993829235e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8556037560105324, + "num_tokens": 173586083.0, + "step": 144280 + }, + { + "entropy": 1.8292605131864548, + "epoch": 0.44728637534205595, + "grad_norm": 3.801150321960449, + "learning_rate": 3.7826952120566485e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8639505878090858, + "num_tokens": 173599424.0, + "step": 144290 + }, + { + "entropy": 1.8792633160948753, + "epoch": 0.4473173744671057, + "grad_norm": 4.3725409507751465, + "learning_rate": 3.7825641383572448e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8518676936626435, + "num_tokens": 173611821.0, + "step": 144300 + }, + { + "entropy": 1.9342765644192697, + "epoch": 0.44734837359215535, + "grad_norm": 9.642680168151855, + "learning_rate": 3.7824330782823524e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8533917725086212, + "num_tokens": 173623485.0, + "step": 144310 + }, + { + "entropy": 2.012094184756279, + "epoch": 0.44737937271720507, + "grad_norm": 7.291985988616943, + "learning_rate": 3.7823020318296104e-06, + "loss": 0.5282, + "mean_token_accuracy": 0.8319395646452904, + "num_tokens": 173634217.0, + "step": 144320 + }, + { + "entropy": 1.9318621635437012, + "epoch": 0.44741037184225474, + "grad_norm": 6.990114212036133, + "learning_rate": 3.7821709989966605e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8600582510232926, + "num_tokens": 173646108.0, + "step": 144330 + }, + { + "entropy": 1.9527502954006195, + "epoch": 0.44744137096730446, + "grad_norm": 7.637629508972168, + "learning_rate": 3.782039979781142e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8539629653096199, + "num_tokens": 173657100.0, + "step": 144340 + }, + { + "entropy": 1.969818153977394, + "epoch": 0.44747237009235413, + "grad_norm": 8.846341133117676, + "learning_rate": 3.7819089741806974e-06, + "loss": 0.5181, + "mean_token_accuracy": 0.8485133573412895, + "num_tokens": 173668376.0, + "step": 144350 + }, + { + "entropy": 1.903004801273346, + "epoch": 0.44750336921740386, + "grad_norm": 6.75057315826416, + "learning_rate": 3.781777982192969e-06, + "loss": 0.5438, + "mean_token_accuracy": 0.8346272155642509, + "num_tokens": 173681042.0, + "step": 144360 + }, + { + "entropy": 1.8782558515667915, + "epoch": 0.4475343683424535, + "grad_norm": 6.844717025756836, + "learning_rate": 3.781647003815598e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8552762180566787, + "num_tokens": 173693000.0, + "step": 144370 + }, + { + "entropy": 1.9393670424818992, + "epoch": 0.44756536746750325, + "grad_norm": 9.035807609558105, + "learning_rate": 3.7815160390462298e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8559998840093612, + "num_tokens": 173704983.0, + "step": 144380 + }, + { + "entropy": 1.9297925919294356, + "epoch": 0.4475963665925529, + "grad_norm": 6.4936137199401855, + "learning_rate": 3.7813850878825064e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8614618703722954, + "num_tokens": 173716355.0, + "step": 144390 + }, + { + "entropy": 1.8133862063288688, + "epoch": 0.44762736571760264, + "grad_norm": 4.11335563659668, + "learning_rate": 3.7812541503220733e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8672507256269455, + "num_tokens": 173730318.0, + "step": 144400 + }, + { + "entropy": 1.956637793779373, + "epoch": 0.4476583648426523, + "grad_norm": 6.994054794311523, + "learning_rate": 3.7811232263625753e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8508543521165848, + "num_tokens": 173741321.0, + "step": 144410 + }, + { + "entropy": 1.92156320810318, + "epoch": 0.44768936396770204, + "grad_norm": 7.512271404266357, + "learning_rate": 3.780992316001657e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.845100137591362, + "num_tokens": 173753512.0, + "step": 144420 + }, + { + "entropy": 1.8369450643658638, + "epoch": 0.4477203630927517, + "grad_norm": 8.064924240112305, + "learning_rate": 3.7808614192369664e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8641338348388672, + "num_tokens": 173767543.0, + "step": 144430 + }, + { + "entropy": 1.8546362400054932, + "epoch": 0.44775136221780143, + "grad_norm": 9.19071102142334, + "learning_rate": 3.7807305360661485e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8610058188438415, + "num_tokens": 173780631.0, + "step": 144440 + }, + { + "entropy": 1.9356670066714288, + "epoch": 0.4477823613428511, + "grad_norm": 11.3148832321167, + "learning_rate": 3.780599666486851e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.852401140332222, + "num_tokens": 173792247.0, + "step": 144450 + }, + { + "entropy": 1.8100459277629852, + "epoch": 0.44781336046790077, + "grad_norm": 3.687837839126587, + "learning_rate": 3.780468810496722e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8491495341062546, + "num_tokens": 173806105.0, + "step": 144460 + }, + { + "entropy": 1.9653054475784302, + "epoch": 0.4478443595929505, + "grad_norm": 8.540299415588379, + "learning_rate": 3.7803379680934092e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8488686487078667, + "num_tokens": 173818024.0, + "step": 144470 + }, + { + "entropy": 1.9154264122247695, + "epoch": 0.44787535871800016, + "grad_norm": 8.398823738098145, + "learning_rate": 3.7802071392745628e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8501715928316116, + "num_tokens": 173830403.0, + "step": 144480 + }, + { + "entropy": 1.9817367240786552, + "epoch": 0.4479063578430499, + "grad_norm": 8.067160606384277, + "learning_rate": 3.7800763240378307e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8624659746885299, + "num_tokens": 173841447.0, + "step": 144490 + }, + { + "entropy": 1.826142853498459, + "epoch": 0.44793735696809955, + "grad_norm": 8.450629234313965, + "learning_rate": 3.7799455223808647e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8624396830797195, + "num_tokens": 173854816.0, + "step": 144500 + }, + { + "entropy": 1.9071953654289246, + "epoch": 0.4479683560931493, + "grad_norm": 8.542773246765137, + "learning_rate": 3.7798147343013134e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8587081953883171, + "num_tokens": 173866813.0, + "step": 144510 + }, + { + "entropy": 1.9302286952733994, + "epoch": 0.44799935521819895, + "grad_norm": 8.922464370727539, + "learning_rate": 3.7796839597968305e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8490835353732109, + "num_tokens": 173878864.0, + "step": 144520 + }, + { + "entropy": 1.841838812828064, + "epoch": 0.44803035434324867, + "grad_norm": 10.363226890563965, + "learning_rate": 3.7795531988650663e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8650688961148262, + "num_tokens": 173892098.0, + "step": 144530 + }, + { + "entropy": 1.9519219264388084, + "epoch": 0.44806135346829834, + "grad_norm": 8.940476417541504, + "learning_rate": 3.7794224515036733e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8444177433848381, + "num_tokens": 173904195.0, + "step": 144540 + }, + { + "entropy": 1.998878961801529, + "epoch": 0.44809235259334806, + "grad_norm": 8.424077033996582, + "learning_rate": 3.7792917177103043e-06, + "loss": 0.5687, + "mean_token_accuracy": 0.8384242698550224, + "num_tokens": 173915400.0, + "step": 144550 + }, + { + "entropy": 1.9215024933218956, + "epoch": 0.44812335171839773, + "grad_norm": 8.416211128234863, + "learning_rate": 3.7791609974826136e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8600714936852455, + "num_tokens": 173927090.0, + "step": 144560 + }, + { + "entropy": 1.891715730726719, + "epoch": 0.44815435084344746, + "grad_norm": 9.589639663696289, + "learning_rate": 3.7790302908182543e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8505370214581489, + "num_tokens": 173938920.0, + "step": 144570 + }, + { + "entropy": 1.8214015498757363, + "epoch": 0.4481853499684971, + "grad_norm": 8.072266578674316, + "learning_rate": 3.7788995977148823e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8764893263578415, + "num_tokens": 173951952.0, + "step": 144580 + }, + { + "entropy": 1.9523023292422295, + "epoch": 0.44821634909354685, + "grad_norm": 8.023730278015137, + "learning_rate": 3.7787689181701514e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8551726579666138, + "num_tokens": 173963990.0, + "step": 144590 + }, + { + "entropy": 1.975418707728386, + "epoch": 0.4482473482185965, + "grad_norm": 8.128300666809082, + "learning_rate": 3.7786382521817178e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8471471086144448, + "num_tokens": 173974720.0, + "step": 144600 + }, + { + "entropy": 1.935040408372879, + "epoch": 0.44827834734364624, + "grad_norm": 8.031977653503418, + "learning_rate": 3.7785075997472385e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8642361879348754, + "num_tokens": 173986483.0, + "step": 144610 + }, + { + "entropy": 1.9660716861486436, + "epoch": 0.4483093464686959, + "grad_norm": 7.869020938873291, + "learning_rate": 3.7783769608643696e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8499971255660057, + "num_tokens": 173998365.0, + "step": 144620 + }, + { + "entropy": 1.998292076587677, + "epoch": 0.44834034559374564, + "grad_norm": 8.130206108093262, + "learning_rate": 3.778246335530769e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8486690282821655, + "num_tokens": 174009884.0, + "step": 144630 + }, + { + "entropy": 1.890485832095146, + "epoch": 0.4483713447187953, + "grad_norm": 7.7400946617126465, + "learning_rate": 3.778115723744095e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8541572690010071, + "num_tokens": 174022402.0, + "step": 144640 + }, + { + "entropy": 1.897654327750206, + "epoch": 0.44840234384384503, + "grad_norm": 4.250067710876465, + "learning_rate": 3.7779851255020057e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8561486795544624, + "num_tokens": 174034391.0, + "step": 144650 + }, + { + "entropy": 1.917440117895603, + "epoch": 0.4484333429688947, + "grad_norm": 4.559934139251709, + "learning_rate": 3.7778545408021607e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8511192619800567, + "num_tokens": 174046427.0, + "step": 144660 + }, + { + "entropy": 1.9716055259108543, + "epoch": 0.4484643420939444, + "grad_norm": 7.858667850494385, + "learning_rate": 3.77772396964222e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8432918503880501, + "num_tokens": 174057999.0, + "step": 144670 + }, + { + "entropy": 1.928077156841755, + "epoch": 0.4484953412189941, + "grad_norm": 9.245266914367676, + "learning_rate": 3.777593412019842e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8530062898993492, + "num_tokens": 174069219.0, + "step": 144680 + }, + { + "entropy": 2.0191250085830688, + "epoch": 0.4485263403440438, + "grad_norm": 8.808843612670898, + "learning_rate": 3.7774628679326895e-06, + "loss": 0.5707, + "mean_token_accuracy": 0.8358566120266915, + "num_tokens": 174079632.0, + "step": 144690 + }, + { + "entropy": 1.9445554435253143, + "epoch": 0.4485573394690935, + "grad_norm": 10.58156681060791, + "learning_rate": 3.7773323373784244e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8430225223302841, + "num_tokens": 174091815.0, + "step": 144700 + }, + { + "entropy": 1.9787197232246398, + "epoch": 0.44858833859414315, + "grad_norm": 8.029949188232422, + "learning_rate": 3.777201820354707e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8574790298938751, + "num_tokens": 174102183.0, + "step": 144710 + }, + { + "entropy": 1.8836344301700592, + "epoch": 0.4486193377191929, + "grad_norm": 4.910955429077148, + "learning_rate": 3.777071316859201e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8546113297343254, + "num_tokens": 174114589.0, + "step": 144720 + }, + { + "entropy": 1.8803539738059043, + "epoch": 0.44865033684424255, + "grad_norm": 7.87885046005249, + "learning_rate": 3.776940826889569e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8489265456795693, + "num_tokens": 174126765.0, + "step": 144730 + }, + { + "entropy": 1.9403909876942635, + "epoch": 0.44868133596929227, + "grad_norm": 9.444135665893555, + "learning_rate": 3.776810350443475e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8454124689102173, + "num_tokens": 174138010.0, + "step": 144740 + }, + { + "entropy": 1.869876691699028, + "epoch": 0.44871233509434194, + "grad_norm": 10.500418663024902, + "learning_rate": 3.776679887518583e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8416117459535599, + "num_tokens": 174151284.0, + "step": 144750 + }, + { + "entropy": 1.8063198134303093, + "epoch": 0.44874333421939167, + "grad_norm": 7.03587532043457, + "learning_rate": 3.776549438112559e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8664273172616959, + "num_tokens": 174164715.0, + "step": 144760 + }, + { + "entropy": 1.9691897720098495, + "epoch": 0.44877433334444133, + "grad_norm": 8.313720703125, + "learning_rate": 3.7764190022230658e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8534282594919205, + "num_tokens": 174175915.0, + "step": 144770 + }, + { + "entropy": 1.8201162710785865, + "epoch": 0.44880533246949106, + "grad_norm": 3.464089870452881, + "learning_rate": 3.7762885798477715e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8693333268165588, + "num_tokens": 174188460.0, + "step": 144780 + }, + { + "entropy": 1.968066155910492, + "epoch": 0.4488363315945407, + "grad_norm": 9.63811206817627, + "learning_rate": 3.776158170984343e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8475179970264435, + "num_tokens": 174199357.0, + "step": 144790 + }, + { + "entropy": 1.8865119606256484, + "epoch": 0.44886733071959045, + "grad_norm": 8.431649208068848, + "learning_rate": 3.7760277756304458e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8538785234093667, + "num_tokens": 174211184.0, + "step": 144800 + }, + { + "entropy": 1.9904786467552185, + "epoch": 0.4488983298446401, + "grad_norm": 8.829815864562988, + "learning_rate": 3.775897393783749e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.849235288798809, + "num_tokens": 174222072.0, + "step": 144810 + }, + { + "entropy": 1.8386689230799675, + "epoch": 0.44892932896968984, + "grad_norm": 8.532641410827637, + "learning_rate": 3.775767025441919e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8608880117535591, + "num_tokens": 174235310.0, + "step": 144820 + }, + { + "entropy": 1.9428915694355964, + "epoch": 0.4489603280947395, + "grad_norm": 8.705090522766113, + "learning_rate": 3.7756366706026264e-06, + "loss": 0.5264, + "mean_token_accuracy": 0.8452722027897834, + "num_tokens": 174247069.0, + "step": 144830 + }, + { + "entropy": 1.9664770871400834, + "epoch": 0.44899132721978924, + "grad_norm": 9.778718948364258, + "learning_rate": 3.7755063292635396e-06, + "loss": 0.5638, + "mean_token_accuracy": 0.8389669686555863, + "num_tokens": 174257829.0, + "step": 144840 + }, + { + "entropy": 1.9240783050656318, + "epoch": 0.4490223263448389, + "grad_norm": 7.943138122558594, + "learning_rate": 3.775376001422329e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8448940262198448, + "num_tokens": 174269741.0, + "step": 144850 + }, + { + "entropy": 1.9432480692863465, + "epoch": 0.44905332546988863, + "grad_norm": 3.412449598312378, + "learning_rate": 3.7752456870766646e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8651240512728691, + "num_tokens": 174280859.0, + "step": 144860 + }, + { + "entropy": 1.8958119705319405, + "epoch": 0.4490843245949383, + "grad_norm": 4.008243083953857, + "learning_rate": 3.775115386224218e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8487641841173172, + "num_tokens": 174293162.0, + "step": 144870 + }, + { + "entropy": 1.9293140321969986, + "epoch": 0.449115323719988, + "grad_norm": 8.646997451782227, + "learning_rate": 3.7749850988626597e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8494552314281464, + "num_tokens": 174304631.0, + "step": 144880 + }, + { + "entropy": 1.8826806560158729, + "epoch": 0.4491463228450377, + "grad_norm": 7.910927772521973, + "learning_rate": 3.7748548249896632e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8651323825120926, + "num_tokens": 174316973.0, + "step": 144890 + }, + { + "entropy": 1.9384344890713692, + "epoch": 0.4491773219700874, + "grad_norm": 8.303495407104492, + "learning_rate": 3.7747245646029e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8473970159888268, + "num_tokens": 174328666.0, + "step": 144900 + }, + { + "entropy": 1.9702891185879707, + "epoch": 0.4492083210951371, + "grad_norm": 7.635629177093506, + "learning_rate": 3.7745943177000442e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.84871247112751, + "num_tokens": 174340268.0, + "step": 144910 + }, + { + "entropy": 1.9801568925380706, + "epoch": 0.4492393202201868, + "grad_norm": 7.393730640411377, + "learning_rate": 3.7744640842787706e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8527787283062935, + "num_tokens": 174350919.0, + "step": 144920 + }, + { + "entropy": 1.8521282449364662, + "epoch": 0.4492703193452365, + "grad_norm": 7.78226900100708, + "learning_rate": 3.774333864336751e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8685169115662574, + "num_tokens": 174363562.0, + "step": 144930 + }, + { + "entropy": 1.8877981454133987, + "epoch": 0.44930131847028615, + "grad_norm": 7.9879302978515625, + "learning_rate": 3.774203657871663e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8551206976175308, + "num_tokens": 174375369.0, + "step": 144940 + }, + { + "entropy": 1.8575106739997864, + "epoch": 0.4493323175953359, + "grad_norm": 7.526092529296875, + "learning_rate": 3.7740734648811805e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8627775356173515, + "num_tokens": 174387464.0, + "step": 144950 + }, + { + "entropy": 1.873704120516777, + "epoch": 0.44936331672038554, + "grad_norm": 8.647795677185059, + "learning_rate": 3.7739432853629796e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8583318084478379, + "num_tokens": 174399542.0, + "step": 144960 + }, + { + "entropy": 1.9029436275362968, + "epoch": 0.44939431584543527, + "grad_norm": 7.637712001800537, + "learning_rate": 3.773813119314738e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8516765266656876, + "num_tokens": 174410772.0, + "step": 144970 + }, + { + "entropy": 1.8933783307671548, + "epoch": 0.44942531497048493, + "grad_norm": 4.113735675811768, + "learning_rate": 3.7736829667341326e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.843044999241829, + "num_tokens": 174423995.0, + "step": 144980 + }, + { + "entropy": 1.9160526722669602, + "epoch": 0.44945631409553466, + "grad_norm": 9.520600318908691, + "learning_rate": 3.7735528276188404e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8556877493858337, + "num_tokens": 174435604.0, + "step": 144990 + }, + { + "entropy": 1.9604547709226607, + "epoch": 0.44948731322058433, + "grad_norm": 8.132418632507324, + "learning_rate": 3.773422701966541e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8487904667854309, + "num_tokens": 174446584.0, + "step": 145000 + }, + { + "entropy": 1.8689698219299316, + "epoch": 0.44951831234563405, + "grad_norm": 8.19633960723877, + "learning_rate": 3.773292589774912e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8521596968173981, + "num_tokens": 174458869.0, + "step": 145010 + }, + { + "entropy": 1.9306217849254608, + "epoch": 0.4495493114706837, + "grad_norm": 6.501941204071045, + "learning_rate": 3.773162491041633e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8544813707470894, + "num_tokens": 174470488.0, + "step": 145020 + }, + { + "entropy": 1.9626944810152054, + "epoch": 0.44958031059573345, + "grad_norm": 6.954222202301025, + "learning_rate": 3.7730324057643857e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8428614303469658, + "num_tokens": 174480946.0, + "step": 145030 + }, + { + "entropy": 1.9757573395967483, + "epoch": 0.4496113097207831, + "grad_norm": 7.623712539672852, + "learning_rate": 3.7729023339408476e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.8365162685513496, + "num_tokens": 174492039.0, + "step": 145040 + }, + { + "entropy": 1.8950822830200196, + "epoch": 0.44964230884583284, + "grad_norm": 4.679478645324707, + "learning_rate": 3.7727722755687034e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.84790218770504, + "num_tokens": 174504318.0, + "step": 145050 + }, + { + "entropy": 1.8425507709383964, + "epoch": 0.4496733079708825, + "grad_norm": 3.7281038761138916, + "learning_rate": 3.7726422306456324e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8493483513593674, + "num_tokens": 174516826.0, + "step": 145060 + }, + { + "entropy": 1.8533750414848327, + "epoch": 0.44970430709593223, + "grad_norm": 3.924395799636841, + "learning_rate": 3.7725121991693183e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8450315102934838, + "num_tokens": 174530110.0, + "step": 145070 + }, + { + "entropy": 1.9302792519330978, + "epoch": 0.4497353062209819, + "grad_norm": 12.66701889038086, + "learning_rate": 3.772382181137442e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.846200980246067, + "num_tokens": 174542159.0, + "step": 145080 + }, + { + "entropy": 1.9027525156736373, + "epoch": 0.4497663053460316, + "grad_norm": 7.88311243057251, + "learning_rate": 3.7722521765476877e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8578101888298988, + "num_tokens": 174552779.0, + "step": 145090 + }, + { + "entropy": 1.8144456431269647, + "epoch": 0.4497973044710813, + "grad_norm": 7.007596015930176, + "learning_rate": 3.77212218539774e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8665218591690064, + "num_tokens": 174566254.0, + "step": 145100 + }, + { + "entropy": 1.9618958830833435, + "epoch": 0.449828303596131, + "grad_norm": 7.736656665802002, + "learning_rate": 3.771992207685284e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.836806908249855, + "num_tokens": 174578147.0, + "step": 145110 + }, + { + "entropy": 1.8933872073888778, + "epoch": 0.4498593027211807, + "grad_norm": 7.00977087020874, + "learning_rate": 3.771862243408003e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8770283341407776, + "num_tokens": 174589833.0, + "step": 145120 + }, + { + "entropy": 1.9726043611764907, + "epoch": 0.4498903018462304, + "grad_norm": 8.317329406738281, + "learning_rate": 3.7717322925635836e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8538321673870086, + "num_tokens": 174600627.0, + "step": 145130 + }, + { + "entropy": 1.8826549246907234, + "epoch": 0.4499213009712801, + "grad_norm": 4.149418354034424, + "learning_rate": 3.7716023551497116e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8519825637340546, + "num_tokens": 174613125.0, + "step": 145140 + }, + { + "entropy": 2.011317655444145, + "epoch": 0.4499523000963298, + "grad_norm": 7.87576961517334, + "learning_rate": 3.7714724311640744e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8533690482378006, + "num_tokens": 174623674.0, + "step": 145150 + }, + { + "entropy": 1.9144355922937393, + "epoch": 0.4499832992213795, + "grad_norm": 7.214294910430908, + "learning_rate": 3.771342520604358e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.851697339117527, + "num_tokens": 174635535.0, + "step": 145160 + }, + { + "entropy": 1.9184280708432198, + "epoch": 0.4500142983464292, + "grad_norm": 4.311148166656494, + "learning_rate": 3.771212623468252e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8484656348824501, + "num_tokens": 174647262.0, + "step": 145170 + }, + { + "entropy": 1.9903872221708299, + "epoch": 0.45004529747147887, + "grad_norm": 8.605646133422852, + "learning_rate": 3.771082739753443e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8503182783722878, + "num_tokens": 174658244.0, + "step": 145180 + }, + { + "entropy": 1.9099340721964837, + "epoch": 0.45007629659652854, + "grad_norm": 9.464656829833984, + "learning_rate": 3.7709528694576214e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8493123814463616, + "num_tokens": 174670230.0, + "step": 145190 + }, + { + "entropy": 1.962883660197258, + "epoch": 0.45010729572157826, + "grad_norm": 8.372581481933594, + "learning_rate": 3.7708230125784757e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8490772858262062, + "num_tokens": 174681923.0, + "step": 145200 + }, + { + "entropy": 1.8974821627140046, + "epoch": 0.45013829484662793, + "grad_norm": 3.493914842605591, + "learning_rate": 3.7706931691136962e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.861706106364727, + "num_tokens": 174693772.0, + "step": 145210 + }, + { + "entropy": 1.9498993948101997, + "epoch": 0.45016929397167765, + "grad_norm": 7.030703067779541, + "learning_rate": 3.7705633390609737e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8613954395055771, + "num_tokens": 174705327.0, + "step": 145220 + }, + { + "entropy": 1.92320823520422, + "epoch": 0.4502002930967273, + "grad_norm": 11.262289047241211, + "learning_rate": 3.770433522418e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8487863168120384, + "num_tokens": 174717121.0, + "step": 145230 + }, + { + "entropy": 1.8713699102401733, + "epoch": 0.45023129222177705, + "grad_norm": 9.125288963317871, + "learning_rate": 3.770303719182465e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8564379021525383, + "num_tokens": 174729447.0, + "step": 145240 + }, + { + "entropy": 1.916731895506382, + "epoch": 0.4502622913468267, + "grad_norm": 8.092860221862793, + "learning_rate": 3.7701739293520634e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8585136890411377, + "num_tokens": 174741652.0, + "step": 145250 + }, + { + "entropy": 1.8323576033115387, + "epoch": 0.45029329047187644, + "grad_norm": 5.573619365692139, + "learning_rate": 3.7700441529244865e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.858129957318306, + "num_tokens": 174754464.0, + "step": 145260 + }, + { + "entropy": 1.8868038043379785, + "epoch": 0.4503242895969261, + "grad_norm": 7.117466926574707, + "learning_rate": 3.7699143898974273e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8598243370652199, + "num_tokens": 174767280.0, + "step": 145270 + }, + { + "entropy": 1.9327538147568704, + "epoch": 0.45035528872197583, + "grad_norm": 7.857034206390381, + "learning_rate": 3.769784640268581e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8374062940478325, + "num_tokens": 174779680.0, + "step": 145280 + }, + { + "entropy": 1.8759731560945512, + "epoch": 0.4503862878470255, + "grad_norm": 9.070082664489746, + "learning_rate": 3.769654904035642e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8461429521441459, + "num_tokens": 174792617.0, + "step": 145290 + }, + { + "entropy": 1.9318383768200875, + "epoch": 0.4504172869720752, + "grad_norm": 8.199836730957031, + "learning_rate": 3.7695251811963052e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8561635687947273, + "num_tokens": 174804624.0, + "step": 145300 + }, + { + "entropy": 1.9782269150018692, + "epoch": 0.4504482860971249, + "grad_norm": 3.9492764472961426, + "learning_rate": 3.7693954717482656e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.847953063249588, + "num_tokens": 174816707.0, + "step": 145310 + }, + { + "entropy": 1.8973633468151092, + "epoch": 0.4504792852221746, + "grad_norm": 8.541032791137695, + "learning_rate": 3.76926577568922e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8600034147500992, + "num_tokens": 174829220.0, + "step": 145320 + }, + { + "entropy": 1.868040455877781, + "epoch": 0.4505102843472243, + "grad_norm": 8.232982635498047, + "learning_rate": 3.769136093016865e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.860238878428936, + "num_tokens": 174841040.0, + "step": 145330 + }, + { + "entropy": 1.8973990380764008, + "epoch": 0.450541283472274, + "grad_norm": 7.8517165184021, + "learning_rate": 3.769006423728897e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8573246464133263, + "num_tokens": 174852510.0, + "step": 145340 + }, + { + "entropy": 1.88846056163311, + "epoch": 0.4505722825973237, + "grad_norm": 7.867887496948242, + "learning_rate": 3.7688767678230155e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8506530284881592, + "num_tokens": 174864689.0, + "step": 145350 + }, + { + "entropy": 1.9104204386472703, + "epoch": 0.4506032817223734, + "grad_norm": 9.259328842163086, + "learning_rate": 3.768747125296918e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8554364711046218, + "num_tokens": 174876233.0, + "step": 145360 + }, + { + "entropy": 1.9462157368659974, + "epoch": 0.4506342808474231, + "grad_norm": 8.897783279418945, + "learning_rate": 3.7686174961483033e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8470346301794052, + "num_tokens": 174886851.0, + "step": 145370 + }, + { + "entropy": 1.9113350063562393, + "epoch": 0.4506652799724728, + "grad_norm": 9.343399047851562, + "learning_rate": 3.768487880374872e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8516996219754219, + "num_tokens": 174898934.0, + "step": 145380 + }, + { + "entropy": 1.9105153515934945, + "epoch": 0.45069627909752247, + "grad_norm": 8.577310562133789, + "learning_rate": 3.768358277974323e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8571393460035324, + "num_tokens": 174910924.0, + "step": 145390 + }, + { + "entropy": 1.7790193900465965, + "epoch": 0.4507272782225722, + "grad_norm": 3.691340446472168, + "learning_rate": 3.7682286889443563e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8683654963970184, + "num_tokens": 174924748.0, + "step": 145400 + }, + { + "entropy": 1.853318177163601, + "epoch": 0.45075827734762186, + "grad_norm": 3.717491388320923, + "learning_rate": 3.768099113282675e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8503245025873184, + "num_tokens": 174937623.0, + "step": 145410 + }, + { + "entropy": 1.8263938404619693, + "epoch": 0.4507892764726716, + "grad_norm": 3.7551968097686768, + "learning_rate": 3.7679695509869798e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8548484072089195, + "num_tokens": 174951248.0, + "step": 145420 + }, + { + "entropy": 1.9366182684898376, + "epoch": 0.45082027559772125, + "grad_norm": 7.332760810852051, + "learning_rate": 3.7678400020549727e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8432886257767678, + "num_tokens": 174962653.0, + "step": 145430 + }, + { + "entropy": 1.9753572344779968, + "epoch": 0.4508512747227709, + "grad_norm": 8.431512832641602, + "learning_rate": 3.767710466484357e-06, + "loss": 0.5259, + "mean_token_accuracy": 0.839150819182396, + "num_tokens": 174973717.0, + "step": 145440 + }, + { + "entropy": 1.9610534459352493, + "epoch": 0.45088227384782065, + "grad_norm": 6.784049034118652, + "learning_rate": 3.767580944272836e-06, + "loss": 0.5492, + "mean_token_accuracy": 0.845959635078907, + "num_tokens": 174985280.0, + "step": 145450 + }, + { + "entropy": 1.8077823013067245, + "epoch": 0.4509132729728703, + "grad_norm": 9.173667907714844, + "learning_rate": 3.767451435418114e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8598848000168801, + "num_tokens": 174999259.0, + "step": 145460 + }, + { + "entropy": 1.8944556072354317, + "epoch": 0.45094427209792004, + "grad_norm": 8.268648147583008, + "learning_rate": 3.767321939917894e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.861379300057888, + "num_tokens": 175010366.0, + "step": 145470 + }, + { + "entropy": 1.8945633813738822, + "epoch": 0.4509752712229697, + "grad_norm": 3.7836310863494873, + "learning_rate": 3.7671924577698832e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8532457053661346, + "num_tokens": 175022036.0, + "step": 145480 + }, + { + "entropy": 1.9272257044911385, + "epoch": 0.45100627034801943, + "grad_norm": 9.02260971069336, + "learning_rate": 3.767062988971786e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8532265886664391, + "num_tokens": 175034153.0, + "step": 145490 + }, + { + "entropy": 1.9854293823242188, + "epoch": 0.4510372694730691, + "grad_norm": 7.62827730178833, + "learning_rate": 3.766933533521308e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8465104550123215, + "num_tokens": 175045152.0, + "step": 145500 + }, + { + "entropy": 1.888518001139164, + "epoch": 0.4510682685981188, + "grad_norm": 7.6335883140563965, + "learning_rate": 3.766804091416157e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8494478777050972, + "num_tokens": 175057796.0, + "step": 145510 + }, + { + "entropy": 1.916823922097683, + "epoch": 0.4510992677231685, + "grad_norm": 9.0162935256958, + "learning_rate": 3.76667466265404e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8522469475865364, + "num_tokens": 175069430.0, + "step": 145520 + }, + { + "entropy": 1.8178383886814118, + "epoch": 0.4511302668482182, + "grad_norm": 8.224920272827148, + "learning_rate": 3.766545247232664e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8601442322134971, + "num_tokens": 175082985.0, + "step": 145530 + }, + { + "entropy": 1.898731729388237, + "epoch": 0.4511612659732679, + "grad_norm": 3.8143157958984375, + "learning_rate": 3.7664158451497383e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8475686386227608, + "num_tokens": 175095045.0, + "step": 145540 + }, + { + "entropy": 1.9468153685331344, + "epoch": 0.4511922650983176, + "grad_norm": 6.286230564117432, + "learning_rate": 3.766286456402971e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8552075073122978, + "num_tokens": 175106619.0, + "step": 145550 + }, + { + "entropy": 1.8661633163690567, + "epoch": 0.4512232642233673, + "grad_norm": 3.8550450801849365, + "learning_rate": 3.766157080990073e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8487115263938904, + "num_tokens": 175119296.0, + "step": 145560 + }, + { + "entropy": 1.928366206586361, + "epoch": 0.451254263348417, + "grad_norm": 7.825737953186035, + "learning_rate": 3.7660277189087524e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8565060004591942, + "num_tokens": 175130424.0, + "step": 145570 + }, + { + "entropy": 1.9019853085279466, + "epoch": 0.4512852624734667, + "grad_norm": 4.290144920349121, + "learning_rate": 3.7658983701567215e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8472353518009186, + "num_tokens": 175143118.0, + "step": 145580 + }, + { + "entropy": 1.811234064400196, + "epoch": 0.4513162615985164, + "grad_norm": 8.767404556274414, + "learning_rate": 3.7657690347316896e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.859492601454258, + "num_tokens": 175155763.0, + "step": 145590 + }, + { + "entropy": 1.9169629096984864, + "epoch": 0.45134726072356607, + "grad_norm": 8.782833099365234, + "learning_rate": 3.7656397126313704e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8517961174249649, + "num_tokens": 175167606.0, + "step": 145600 + }, + { + "entropy": 1.9656015813350678, + "epoch": 0.4513782598486158, + "grad_norm": 7.532365322113037, + "learning_rate": 3.765510403853474e-06, + "loss": 0.5174, + "mean_token_accuracy": 0.8420636489987373, + "num_tokens": 175178779.0, + "step": 145610 + }, + { + "entropy": 1.89768455773592, + "epoch": 0.45140925897366546, + "grad_norm": 7.2777419090271, + "learning_rate": 3.765381108395715e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8631177753210068, + "num_tokens": 175190689.0, + "step": 145620 + }, + { + "entropy": 1.8941746070981025, + "epoch": 0.4514402580987152, + "grad_norm": 7.515166282653809, + "learning_rate": 3.7652518262558054e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8515808820724488, + "num_tokens": 175202846.0, + "step": 145630 + }, + { + "entropy": 1.8148242741823197, + "epoch": 0.45147125722376485, + "grad_norm": 3.4633376598358154, + "learning_rate": 3.7651225574314597e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8688716858625412, + "num_tokens": 175215871.0, + "step": 145640 + }, + { + "entropy": 1.897299911081791, + "epoch": 0.4515022563488146, + "grad_norm": 8.746367454528809, + "learning_rate": 3.764993301920392e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8508375376462937, + "num_tokens": 175227683.0, + "step": 145650 + }, + { + "entropy": 1.9884761601686478, + "epoch": 0.45153325547386425, + "grad_norm": 7.514569282531738, + "learning_rate": 3.764864059720317e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8484263613820076, + "num_tokens": 175238305.0, + "step": 145660 + }, + { + "entropy": 1.8774521455168725, + "epoch": 0.45156425459891397, + "grad_norm": 4.357245922088623, + "learning_rate": 3.7647348308289522e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8594071462750434, + "num_tokens": 175250175.0, + "step": 145670 + }, + { + "entropy": 1.9650772213935852, + "epoch": 0.45159525372396364, + "grad_norm": 9.48290729522705, + "learning_rate": 3.7646056152440104e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.8337369963526726, + "num_tokens": 175261255.0, + "step": 145680 + }, + { + "entropy": 1.8208548158407212, + "epoch": 0.4516262528490133, + "grad_norm": 9.698999404907227, + "learning_rate": 3.7644764129632104e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8611343666911125, + "num_tokens": 175274442.0, + "step": 145690 + }, + { + "entropy": 1.8542212471365929, + "epoch": 0.45165725197406303, + "grad_norm": 8.707727432250977, + "learning_rate": 3.7643472239842692e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8457312777638435, + "num_tokens": 175287615.0, + "step": 145700 + }, + { + "entropy": 1.947345346212387, + "epoch": 0.4516882510991127, + "grad_norm": 9.62697696685791, + "learning_rate": 3.7642180483049036e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.8280424475669861, + "num_tokens": 175299235.0, + "step": 145710 + }, + { + "entropy": 1.7479452803730964, + "epoch": 0.4517192502241624, + "grad_norm": 7.1266350746154785, + "learning_rate": 3.7640888859228326e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8756140992045403, + "num_tokens": 175312946.0, + "step": 145720 + }, + { + "entropy": 1.9307225465774536, + "epoch": 0.4517502493492121, + "grad_norm": 10.900723457336426, + "learning_rate": 3.7639597368357745e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8503560811281204, + "num_tokens": 175324628.0, + "step": 145730 + }, + { + "entropy": 1.9207808002829552, + "epoch": 0.4517812484742618, + "grad_norm": 8.007662773132324, + "learning_rate": 3.763830601041448e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8547277525067329, + "num_tokens": 175336268.0, + "step": 145740 + }, + { + "entropy": 1.9525502398610115, + "epoch": 0.4518122475993115, + "grad_norm": 7.941483974456787, + "learning_rate": 3.763701478537576e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8508265346288681, + "num_tokens": 175347435.0, + "step": 145750 + }, + { + "entropy": 1.8660466879606248, + "epoch": 0.4518432467243612, + "grad_norm": 4.241132736206055, + "learning_rate": 3.763572369321876e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8603238686919212, + "num_tokens": 175359501.0, + "step": 145760 + }, + { + "entropy": 1.9864515900611877, + "epoch": 0.4518742458494109, + "grad_norm": 8.52209758758545, + "learning_rate": 3.763443273392069e-06, + "loss": 0.539, + "mean_token_accuracy": 0.8401314124464989, + "num_tokens": 175370454.0, + "step": 145770 + }, + { + "entropy": 1.917511025071144, + "epoch": 0.4519052449744606, + "grad_norm": 4.007366180419922, + "learning_rate": 3.7633141907458774e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8430694580078125, + "num_tokens": 175382097.0, + "step": 145780 + }, + { + "entropy": 1.8464987218379973, + "epoch": 0.4519362440995103, + "grad_norm": 4.244717121124268, + "learning_rate": 3.7631851213810237e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8494131848216057, + "num_tokens": 175396376.0, + "step": 145790 + }, + { + "entropy": 1.8551280453801156, + "epoch": 0.45196724322456, + "grad_norm": 8.366869926452637, + "learning_rate": 3.76305606529523e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8622759610414505, + "num_tokens": 175408942.0, + "step": 145800 + }, + { + "entropy": 1.8460657626390458, + "epoch": 0.45199824234960967, + "grad_norm": 7.980034351348877, + "learning_rate": 3.7629270224862198e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8638658255338669, + "num_tokens": 175421526.0, + "step": 145810 + }, + { + "entropy": 1.8811449334025383, + "epoch": 0.4520292414746594, + "grad_norm": 8.846792221069336, + "learning_rate": 3.762797992951716e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8668080389499664, + "num_tokens": 175434163.0, + "step": 145820 + }, + { + "entropy": 1.8398293122649192, + "epoch": 0.45206024059970906, + "grad_norm": 8.336902618408203, + "learning_rate": 3.762668976689443e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8575309097766877, + "num_tokens": 175447165.0, + "step": 145830 + }, + { + "entropy": 1.8911280773580075, + "epoch": 0.4520912397247588, + "grad_norm": 3.6037964820861816, + "learning_rate": 3.7625399736971264e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8530662402510643, + "num_tokens": 175458732.0, + "step": 145840 + }, + { + "entropy": 1.937391071021557, + "epoch": 0.45212223884980846, + "grad_norm": 7.425451755523682, + "learning_rate": 3.7624109839724915e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8544071182608605, + "num_tokens": 175470385.0, + "step": 145850 + }, + { + "entropy": 1.9437742054462432, + "epoch": 0.4521532379748582, + "grad_norm": 7.55548095703125, + "learning_rate": 3.762282007513263e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8576256543397903, + "num_tokens": 175482257.0, + "step": 145860 + }, + { + "entropy": 1.879883836209774, + "epoch": 0.45218423709990785, + "grad_norm": 9.295843124389648, + "learning_rate": 3.7621530443171695e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8495224505662918, + "num_tokens": 175494991.0, + "step": 145870 + }, + { + "entropy": 1.8533268101513385, + "epoch": 0.4522152362249576, + "grad_norm": 2.5859084129333496, + "learning_rate": 3.7620240943819353e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8556357622146606, + "num_tokens": 175507885.0, + "step": 145880 + }, + { + "entropy": 1.8991135403513908, + "epoch": 0.45224623535000724, + "grad_norm": 8.07573127746582, + "learning_rate": 3.7618951577052897e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8474540561437607, + "num_tokens": 175519719.0, + "step": 145890 + }, + { + "entropy": 1.9330607324838638, + "epoch": 0.45227723447505697, + "grad_norm": 7.152818202972412, + "learning_rate": 3.7617662342849608e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8559790030121803, + "num_tokens": 175531157.0, + "step": 145900 + }, + { + "entropy": 1.875030305981636, + "epoch": 0.45230823360010664, + "grad_norm": 10.458932876586914, + "learning_rate": 3.7616373241186765e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.85587347894907, + "num_tokens": 175542756.0, + "step": 145910 + }, + { + "entropy": 1.9337833374738693, + "epoch": 0.45233923272515636, + "grad_norm": 9.064762115478516, + "learning_rate": 3.7615084272041664e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8471284449100495, + "num_tokens": 175555072.0, + "step": 145920 + }, + { + "entropy": 1.9829118058085442, + "epoch": 0.45237023185020603, + "grad_norm": 7.892855644226074, + "learning_rate": 3.7613795435391603e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8524794027209281, + "num_tokens": 175566512.0, + "step": 145930 + }, + { + "entropy": 1.8689216911792754, + "epoch": 0.4524012309752557, + "grad_norm": 7.025180816650391, + "learning_rate": 3.761250673121388e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8556947037577629, + "num_tokens": 175577952.0, + "step": 145940 + }, + { + "entropy": 1.8746563777327538, + "epoch": 0.4524322301003054, + "grad_norm": 8.948837280273438, + "learning_rate": 3.7611218159485807e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8540759429335594, + "num_tokens": 175589918.0, + "step": 145950 + }, + { + "entropy": 1.906599646806717, + "epoch": 0.4524632292253551, + "grad_norm": 8.048293113708496, + "learning_rate": 3.7609929720184695e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8491839274764061, + "num_tokens": 175602264.0, + "step": 145960 + }, + { + "entropy": 1.8783759981393815, + "epoch": 0.4524942283504048, + "grad_norm": 8.030713081359863, + "learning_rate": 3.7608641413287865e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8676090762019157, + "num_tokens": 175613895.0, + "step": 145970 + }, + { + "entropy": 1.8621134147047997, + "epoch": 0.4525252274754545, + "grad_norm": 9.925078392028809, + "learning_rate": 3.760735323877264e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8541931763291359, + "num_tokens": 175625978.0, + "step": 145980 + }, + { + "entropy": 1.9505731910467148, + "epoch": 0.4525562266005042, + "grad_norm": 8.451035499572754, + "learning_rate": 3.7606065196616353e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8430397853255271, + "num_tokens": 175637809.0, + "step": 145990 + }, + { + "entropy": 1.9288512140512466, + "epoch": 0.4525872257255539, + "grad_norm": 9.2060546875, + "learning_rate": 3.7604777286796333e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8581225946545601, + "num_tokens": 175649771.0, + "step": 146000 + }, + { + "entropy": 1.9088809505105018, + "epoch": 0.4526182248506036, + "grad_norm": 9.723130226135254, + "learning_rate": 3.7603489509289924e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8498413473367691, + "num_tokens": 175661361.0, + "step": 146010 + }, + { + "entropy": 1.8807017832994462, + "epoch": 0.45264922397565327, + "grad_norm": 7.284520626068115, + "learning_rate": 3.7602201864074476e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8585941612720489, + "num_tokens": 175674036.0, + "step": 146020 + }, + { + "entropy": 1.9656174287199975, + "epoch": 0.452680223100703, + "grad_norm": 9.173661231994629, + "learning_rate": 3.7600914351127343e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8418927356600762, + "num_tokens": 175685277.0, + "step": 146030 + }, + { + "entropy": 1.9602088913321496, + "epoch": 0.45271122222575266, + "grad_norm": 8.204484939575195, + "learning_rate": 3.759962697042587e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8636955425143242, + "num_tokens": 175696654.0, + "step": 146040 + }, + { + "entropy": 1.886138205230236, + "epoch": 0.4527422213508024, + "grad_norm": 8.34501838684082, + "learning_rate": 3.7598339721947426e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.85477264970541, + "num_tokens": 175708775.0, + "step": 146050 + }, + { + "entropy": 1.8968783617019653, + "epoch": 0.45277322047585206, + "grad_norm": 8.124290466308594, + "learning_rate": 3.7597052605669376e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8634179666638374, + "num_tokens": 175720239.0, + "step": 146060 + }, + { + "entropy": 1.9461852222681046, + "epoch": 0.4528042196009018, + "grad_norm": 10.085100173950195, + "learning_rate": 3.7595765621569098e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8562173038721085, + "num_tokens": 175731786.0, + "step": 146070 + }, + { + "entropy": 1.8739292353391648, + "epoch": 0.45283521872595145, + "grad_norm": 8.647810935974121, + "learning_rate": 3.7594478769623967e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8552744179964066, + "num_tokens": 175744743.0, + "step": 146080 + }, + { + "entropy": 1.972387745976448, + "epoch": 0.4528662178510012, + "grad_norm": 8.841172218322754, + "learning_rate": 3.759319204981136e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8444529309868812, + "num_tokens": 175756143.0, + "step": 146090 + }, + { + "entropy": 1.804822953045368, + "epoch": 0.45289721697605084, + "grad_norm": 7.164949893951416, + "learning_rate": 3.7591905462108686e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8675896748900414, + "num_tokens": 175768824.0, + "step": 146100 + }, + { + "entropy": 1.858502623438835, + "epoch": 0.45292821610110057, + "grad_norm": 7.452975749969482, + "learning_rate": 3.7590619006493333e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8505475029349328, + "num_tokens": 175781202.0, + "step": 146110 + }, + { + "entropy": 1.9403677567839623, + "epoch": 0.45295921522615024, + "grad_norm": 9.553077697753906, + "learning_rate": 3.7589332682942687e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8545985773205758, + "num_tokens": 175792433.0, + "step": 146120 + }, + { + "entropy": 1.929755797982216, + "epoch": 0.45299021435119996, + "grad_norm": 7.850072860717773, + "learning_rate": 3.7588046491434164e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8516537502408028, + "num_tokens": 175803014.0, + "step": 146130 + }, + { + "entropy": 1.8702018111944199, + "epoch": 0.45302121347624963, + "grad_norm": 4.102813243865967, + "learning_rate": 3.758676043194518e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8598198860883712, + "num_tokens": 175814852.0, + "step": 146140 + }, + { + "entropy": 1.875620885193348, + "epoch": 0.45305221260129935, + "grad_norm": 7.794433116912842, + "learning_rate": 3.7585474504453145e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8569299295544625, + "num_tokens": 175826845.0, + "step": 146150 + }, + { + "entropy": 1.9491507172584535, + "epoch": 0.453083211726349, + "grad_norm": 8.696412086486816, + "learning_rate": 3.758418870893548e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8512806192040443, + "num_tokens": 175838457.0, + "step": 146160 + }, + { + "entropy": 1.8456061393022538, + "epoch": 0.45311421085139875, + "grad_norm": 3.9336066246032715, + "learning_rate": 3.758290304536962e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.861776913702488, + "num_tokens": 175850893.0, + "step": 146170 + }, + { + "entropy": 1.8336857661604882, + "epoch": 0.4531452099764484, + "grad_norm": 4.638808727264404, + "learning_rate": 3.7581617513732994e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8632370933890343, + "num_tokens": 175863816.0, + "step": 146180 + }, + { + "entropy": 1.8227697402238845, + "epoch": 0.4531762091014981, + "grad_norm": 8.638792991638184, + "learning_rate": 3.7580332114003033e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8531124874949455, + "num_tokens": 175877462.0, + "step": 146190 + }, + { + "entropy": 1.938033263385296, + "epoch": 0.4532072082265478, + "grad_norm": 9.167952537536621, + "learning_rate": 3.757904684615719e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8607913628220558, + "num_tokens": 175889063.0, + "step": 146200 + }, + { + "entropy": 1.8250967353582381, + "epoch": 0.4532382073515975, + "grad_norm": 7.031805515289307, + "learning_rate": 3.7577761710172912e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.871262151002884, + "num_tokens": 175902376.0, + "step": 146210 + }, + { + "entropy": 1.872345322370529, + "epoch": 0.4532692064766472, + "grad_norm": 7.401670932769775, + "learning_rate": 3.757647670602764e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8529538735747337, + "num_tokens": 175915132.0, + "step": 146220 + }, + { + "entropy": 1.8786262601613999, + "epoch": 0.45330020560169687, + "grad_norm": 4.830646514892578, + "learning_rate": 3.757519183369886e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.85924152135849, + "num_tokens": 175927680.0, + "step": 146230 + }, + { + "entropy": 1.9301627531647683, + "epoch": 0.4533312047267466, + "grad_norm": 4.050126552581787, + "learning_rate": 3.757390709316402e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8585702002048492, + "num_tokens": 175938798.0, + "step": 146240 + }, + { + "entropy": 1.909451249241829, + "epoch": 0.45336220385179626, + "grad_norm": 7.195918560028076, + "learning_rate": 3.757262248440059e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8493908762931823, + "num_tokens": 175949970.0, + "step": 146250 + }, + { + "entropy": 1.9241393342614175, + "epoch": 0.453393202976846, + "grad_norm": 3.823732614517212, + "learning_rate": 3.7571338007386053e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.830666047334671, + "num_tokens": 175961893.0, + "step": 146260 + }, + { + "entropy": 1.9584316313266754, + "epoch": 0.45342420210189566, + "grad_norm": 7.582314968109131, + "learning_rate": 3.7570053662097884e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8390909880399704, + "num_tokens": 175973692.0, + "step": 146270 + }, + { + "entropy": 1.8017342045903206, + "epoch": 0.4534552012269454, + "grad_norm": 4.599339008331299, + "learning_rate": 3.7568769448513577e-06, + "loss": 0.336, + "mean_token_accuracy": 0.87629035115242, + "num_tokens": 175987073.0, + "step": 146280 + }, + { + "entropy": 1.9104434236884118, + "epoch": 0.45348620035199505, + "grad_norm": 5.8684210777282715, + "learning_rate": 3.756748536661061e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8615088537335396, + "num_tokens": 175999179.0, + "step": 146290 + }, + { + "entropy": 1.9020302399992943, + "epoch": 0.4535171994770448, + "grad_norm": 8.238286972045898, + "learning_rate": 3.7566201416366497e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.850467374920845, + "num_tokens": 176010704.0, + "step": 146300 + }, + { + "entropy": 1.9250214383006097, + "epoch": 0.45354819860209444, + "grad_norm": 7.398449897766113, + "learning_rate": 3.756491759775874e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8439469993114471, + "num_tokens": 176023049.0, + "step": 146310 + }, + { + "entropy": 1.9645772278308868, + "epoch": 0.45357919772714417, + "grad_norm": 7.6212334632873535, + "learning_rate": 3.7563633910764837e-06, + "loss": 0.5461, + "mean_token_accuracy": 0.8408754646778107, + "num_tokens": 176034032.0, + "step": 146320 + }, + { + "entropy": 1.9224450066685677, + "epoch": 0.45361019685219384, + "grad_norm": 5.688684463500977, + "learning_rate": 3.7562350355362297e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.8347508668899536, + "num_tokens": 176046264.0, + "step": 146330 + }, + { + "entropy": 1.9145216554403306, + "epoch": 0.45364119597724356, + "grad_norm": 8.109264373779297, + "learning_rate": 3.7561066931528657e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8559110507369041, + "num_tokens": 176057654.0, + "step": 146340 + }, + { + "entropy": 1.903997114300728, + "epoch": 0.45367219510229323, + "grad_norm": 7.708643436431885, + "learning_rate": 3.755978363924143e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8482723668217659, + "num_tokens": 176070619.0, + "step": 146350 + }, + { + "entropy": 1.8557177141308785, + "epoch": 0.45370319422734295, + "grad_norm": 4.477802753448486, + "learning_rate": 3.755850047847815e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8525144010782242, + "num_tokens": 176083576.0, + "step": 146360 + }, + { + "entropy": 1.9063315868377686, + "epoch": 0.4537341933523926, + "grad_norm": 8.484750747680664, + "learning_rate": 3.7557217449216354e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8580427676439285, + "num_tokens": 176095193.0, + "step": 146370 + }, + { + "entropy": 1.8667533531785012, + "epoch": 0.45376519247744235, + "grad_norm": 9.92103099822998, + "learning_rate": 3.755593455143357e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8585796818137169, + "num_tokens": 176107763.0, + "step": 146380 + }, + { + "entropy": 1.9320159569382667, + "epoch": 0.453796191602492, + "grad_norm": 9.859814643859863, + "learning_rate": 3.7554651785107367e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8518664732575416, + "num_tokens": 176119971.0, + "step": 146390 + }, + { + "entropy": 1.8948560282588005, + "epoch": 0.45382719072754174, + "grad_norm": 3.578590154647827, + "learning_rate": 3.755336915021527e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8504968300461769, + "num_tokens": 176132352.0, + "step": 146400 + }, + { + "entropy": 1.9224641382694245, + "epoch": 0.4538581898525914, + "grad_norm": 9.709887504577637, + "learning_rate": 3.755208664673485e-06, + "loss": 0.452, + "mean_token_accuracy": 0.849474447965622, + "num_tokens": 176145215.0, + "step": 146410 + }, + { + "entropy": 1.9500353157520294, + "epoch": 0.45388918897764113, + "grad_norm": 7.521528720855713, + "learning_rate": 3.7550804274643673e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8548279941082001, + "num_tokens": 176156902.0, + "step": 146420 + }, + { + "entropy": 1.9636742144823074, + "epoch": 0.4539201881026908, + "grad_norm": 9.059736251831055, + "learning_rate": 3.7549522033919293e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8536644339561462, + "num_tokens": 176167981.0, + "step": 146430 + }, + { + "entropy": 1.9579711258411407, + "epoch": 0.45395118722774047, + "grad_norm": 7.5242390632629395, + "learning_rate": 3.7548239924539294e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.855312067270279, + "num_tokens": 176179862.0, + "step": 146440 + }, + { + "entropy": 1.9306029558181763, + "epoch": 0.4539821863527902, + "grad_norm": 8.735157012939453, + "learning_rate": 3.754695794648125e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8461642101407051, + "num_tokens": 176191558.0, + "step": 146450 + }, + { + "entropy": 1.8903416648507119, + "epoch": 0.45401318547783986, + "grad_norm": 8.93207836151123, + "learning_rate": 3.7545676099722737e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8568781584501266, + "num_tokens": 176203878.0, + "step": 146460 + }, + { + "entropy": 1.918420398235321, + "epoch": 0.4540441846028896, + "grad_norm": 8.027023315429688, + "learning_rate": 3.7544394384241366e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8519258230924607, + "num_tokens": 176215578.0, + "step": 146470 + }, + { + "entropy": 1.9900656551122666, + "epoch": 0.45407518372793926, + "grad_norm": 6.475451469421387, + "learning_rate": 3.754311280001471e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8443711012601852, + "num_tokens": 176226440.0, + "step": 146480 + }, + { + "entropy": 1.9735152557492257, + "epoch": 0.454106182852989, + "grad_norm": 8.966446876525879, + "learning_rate": 3.7541831347020374e-06, + "loss": 0.5237, + "mean_token_accuracy": 0.8361330017447471, + "num_tokens": 176237579.0, + "step": 146490 + }, + { + "entropy": 1.8347552955150603, + "epoch": 0.45413718197803865, + "grad_norm": 4.222280502319336, + "learning_rate": 3.754055002523596e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8571354284882545, + "num_tokens": 176251272.0, + "step": 146500 + }, + { + "entropy": 1.8998313397169113, + "epoch": 0.4541681811030884, + "grad_norm": 8.564806938171387, + "learning_rate": 3.7539268834639085e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8468169465661048, + "num_tokens": 176263481.0, + "step": 146510 + }, + { + "entropy": 1.912764126062393, + "epoch": 0.45419918022813804, + "grad_norm": 8.024970054626465, + "learning_rate": 3.7537987775207373e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8708274886012077, + "num_tokens": 176275846.0, + "step": 146520 + }, + { + "entropy": 1.9775932729244232, + "epoch": 0.45423017935318777, + "grad_norm": 10.816838264465332, + "learning_rate": 3.753670684691842e-06, + "loss": 0.5207, + "mean_token_accuracy": 0.842138460278511, + "num_tokens": 176286757.0, + "step": 146530 + }, + { + "entropy": 1.9837248116731643, + "epoch": 0.45426117847823744, + "grad_norm": 8.28238296508789, + "learning_rate": 3.7535426049749867e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8460395529866218, + "num_tokens": 176297210.0, + "step": 146540 + }, + { + "entropy": 1.9528759866952896, + "epoch": 0.45429217760328716, + "grad_norm": 8.31442642211914, + "learning_rate": 3.7534145383679354e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8538484081625939, + "num_tokens": 176309108.0, + "step": 146550 + }, + { + "entropy": 1.8652475848793983, + "epoch": 0.45432317672833683, + "grad_norm": 8.39297866821289, + "learning_rate": 3.7532864848684496e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8631422132253647, + "num_tokens": 176322014.0, + "step": 146560 + }, + { + "entropy": 1.9841240167617797, + "epoch": 0.45435417585338655, + "grad_norm": 9.03348159790039, + "learning_rate": 3.7531584444742956e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8504109963774681, + "num_tokens": 176332381.0, + "step": 146570 + }, + { + "entropy": 1.8902300730347634, + "epoch": 0.4543851749784362, + "grad_norm": 3.9884934425354004, + "learning_rate": 3.753030417183237e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8569736734032631, + "num_tokens": 176345267.0, + "step": 146580 + }, + { + "entropy": 1.8768309980630875, + "epoch": 0.45441617410348595, + "grad_norm": 8.625255584716797, + "learning_rate": 3.75290240299304e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8578463882207871, + "num_tokens": 176357705.0, + "step": 146590 + }, + { + "entropy": 1.9548759251832961, + "epoch": 0.4544471732285356, + "grad_norm": 4.450130462646484, + "learning_rate": 3.7527744019014693e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.855126628279686, + "num_tokens": 176369275.0, + "step": 146600 + }, + { + "entropy": 1.8565620198845862, + "epoch": 0.45447817235358534, + "grad_norm": 8.501686096191406, + "learning_rate": 3.7526464139062918e-06, + "loss": 0.405, + "mean_token_accuracy": 0.863280688226223, + "num_tokens": 176382234.0, + "step": 146610 + }, + { + "entropy": 1.8947626411914826, + "epoch": 0.454509171478635, + "grad_norm": 9.0901460647583, + "learning_rate": 3.752518439005274e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8616373479366303, + "num_tokens": 176394636.0, + "step": 146620 + }, + { + "entropy": 1.9476829752326013, + "epoch": 0.45454017060368473, + "grad_norm": 8.009326934814453, + "learning_rate": 3.752390477196185e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8480390936136246, + "num_tokens": 176406091.0, + "step": 146630 + }, + { + "entropy": 1.9551826044917107, + "epoch": 0.4545711697287344, + "grad_norm": 3.430755853652954, + "learning_rate": 3.752262528476791e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8439038008451462, + "num_tokens": 176417335.0, + "step": 146640 + }, + { + "entropy": 1.9948366075754165, + "epoch": 0.4546021688537841, + "grad_norm": 6.840229034423828, + "learning_rate": 3.752134592844861e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.8471700206398964, + "num_tokens": 176428379.0, + "step": 146650 + }, + { + "entropy": 1.964817936718464, + "epoch": 0.4546331679788338, + "grad_norm": 4.002964496612549, + "learning_rate": 3.7520066702981637e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8483250498771667, + "num_tokens": 176439775.0, + "step": 146660 + }, + { + "entropy": 1.9319669753313065, + "epoch": 0.45466416710388347, + "grad_norm": 4.155783653259277, + "learning_rate": 3.7518787608344694e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8619429409503937, + "num_tokens": 176451549.0, + "step": 146670 + }, + { + "entropy": 1.9431686520576477, + "epoch": 0.4546951662289332, + "grad_norm": 8.80432415008545, + "learning_rate": 3.7517508644515476e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8515708222985268, + "num_tokens": 176463221.0, + "step": 146680 + }, + { + "entropy": 1.912970869243145, + "epoch": 0.45472616535398286, + "grad_norm": 9.149775505065918, + "learning_rate": 3.7516229811471686e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8584818720817566, + "num_tokens": 176475849.0, + "step": 146690 + }, + { + "entropy": 1.948892466723919, + "epoch": 0.4547571644790326, + "grad_norm": 8.775398254394531, + "learning_rate": 3.751495110919105e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8604862332344055, + "num_tokens": 176487394.0, + "step": 146700 + }, + { + "entropy": 1.879041202366352, + "epoch": 0.45478816360408225, + "grad_norm": 8.988895416259766, + "learning_rate": 3.7513672537651273e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8561202257871627, + "num_tokens": 176499781.0, + "step": 146710 + }, + { + "entropy": 1.9582669615745545, + "epoch": 0.454819162729132, + "grad_norm": 10.033254623413086, + "learning_rate": 3.751239409683008e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.8357150718569756, + "num_tokens": 176512409.0, + "step": 146720 + }, + { + "entropy": 1.9103184998035432, + "epoch": 0.45485016185418164, + "grad_norm": 8.347360610961914, + "learning_rate": 3.75111157867052e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8483709633350373, + "num_tokens": 176524784.0, + "step": 146730 + }, + { + "entropy": 1.9160448759794235, + "epoch": 0.45488116097923137, + "grad_norm": 4.41226863861084, + "learning_rate": 3.7509837607254356e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8565223976969719, + "num_tokens": 176536691.0, + "step": 146740 + }, + { + "entropy": 1.8885368049144744, + "epoch": 0.45491216010428104, + "grad_norm": 7.640499591827393, + "learning_rate": 3.75085595584553e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8599068447947502, + "num_tokens": 176549077.0, + "step": 146750 + }, + { + "entropy": 1.9121224999427795, + "epoch": 0.45494315922933076, + "grad_norm": 8.662951469421387, + "learning_rate": 3.750728164028577e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8458693385124206, + "num_tokens": 176561257.0, + "step": 146760 + }, + { + "entropy": 1.8658764064311981, + "epoch": 0.45497415835438043, + "grad_norm": 8.34899616241455, + "learning_rate": 3.7506003852723517e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8492805927991867, + "num_tokens": 176574547.0, + "step": 146770 + }, + { + "entropy": 2.010634405910969, + "epoch": 0.45500515747943016, + "grad_norm": 4.921254634857178, + "learning_rate": 3.7504726195746287e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8483109161257744, + "num_tokens": 176585789.0, + "step": 146780 + }, + { + "entropy": 2.0231185287237166, + "epoch": 0.4550361566044798, + "grad_norm": 7.859384536743164, + "learning_rate": 3.750344866933185e-06, + "loss": 0.482, + "mean_token_accuracy": 0.854818707704544, + "num_tokens": 176596403.0, + "step": 146790 + }, + { + "entropy": 1.809226544201374, + "epoch": 0.45506715572952955, + "grad_norm": 7.986112594604492, + "learning_rate": 3.750217127345796e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8633719727396965, + "num_tokens": 176610424.0, + "step": 146800 + }, + { + "entropy": 1.9953494429588319, + "epoch": 0.4550981548545792, + "grad_norm": 8.195384979248047, + "learning_rate": 3.7500894008102395e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.8488559857010841, + "num_tokens": 176621493.0, + "step": 146810 + }, + { + "entropy": 1.7815378695726394, + "epoch": 0.45512915397962894, + "grad_norm": 6.289517402648926, + "learning_rate": 3.7499616873242926e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8637578442692757, + "num_tokens": 176635418.0, + "step": 146820 + }, + { + "entropy": 1.9626459717750548, + "epoch": 0.4551601531046786, + "grad_norm": 8.439818382263184, + "learning_rate": 3.7498339868857342e-06, + "loss": 0.508, + "mean_token_accuracy": 0.852840892970562, + "num_tokens": 176646632.0, + "step": 146830 + }, + { + "entropy": 1.906586892902851, + "epoch": 0.45519115222972834, + "grad_norm": 8.007729530334473, + "learning_rate": 3.749706299492342e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8653773680329323, + "num_tokens": 176657728.0, + "step": 146840 + }, + { + "entropy": 1.9015120595693589, + "epoch": 0.455222151354778, + "grad_norm": 7.211781978607178, + "learning_rate": 3.749578625141895e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8567000299692153, + "num_tokens": 176670194.0, + "step": 146850 + }, + { + "entropy": 1.9847611114382744, + "epoch": 0.45525315047982773, + "grad_norm": 6.767532825469971, + "learning_rate": 3.7494509638321734e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8509671688079834, + "num_tokens": 176681480.0, + "step": 146860 + }, + { + "entropy": 1.8333649218082428, + "epoch": 0.4552841496048774, + "grad_norm": 7.665855407714844, + "learning_rate": 3.749323315560957e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.856324379146099, + "num_tokens": 176694603.0, + "step": 146870 + }, + { + "entropy": 1.9811863422393798, + "epoch": 0.4553151487299271, + "grad_norm": 8.544697761535645, + "learning_rate": 3.7491956803260273e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8474650055170059, + "num_tokens": 176705780.0, + "step": 146880 + }, + { + "entropy": 1.8731940254569053, + "epoch": 0.4553461478549768, + "grad_norm": 8.085098266601562, + "learning_rate": 3.7490680581251637e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8562427669763565, + "num_tokens": 176717194.0, + "step": 146890 + }, + { + "entropy": 1.862533800303936, + "epoch": 0.4553771469800265, + "grad_norm": 4.403972625732422, + "learning_rate": 3.74894044895615e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8544600263237954, + "num_tokens": 176730041.0, + "step": 146900 + }, + { + "entropy": 1.8781135827302933, + "epoch": 0.4554081461050762, + "grad_norm": 3.9804108142852783, + "learning_rate": 3.7488128528167672e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8630628094077111, + "num_tokens": 176742512.0, + "step": 146910 + }, + { + "entropy": 1.9072934925556182, + "epoch": 0.45543914523012585, + "grad_norm": 7.206721305847168, + "learning_rate": 3.7486852697047988e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8565845042467117, + "num_tokens": 176754331.0, + "step": 146920 + }, + { + "entropy": 1.9471127331256866, + "epoch": 0.4554701443551756, + "grad_norm": 6.824718952178955, + "learning_rate": 3.748557699618028e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8565745607018471, + "num_tokens": 176765360.0, + "step": 146930 + }, + { + "entropy": 1.888777793943882, + "epoch": 0.45550114348022525, + "grad_norm": 8.23228645324707, + "learning_rate": 3.748430142554238e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.865308640897274, + "num_tokens": 176777052.0, + "step": 146940 + }, + { + "entropy": 1.881396123766899, + "epoch": 0.45553214260527497, + "grad_norm": 7.634214878082275, + "learning_rate": 3.748302598511214e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8448997780680656, + "num_tokens": 176789075.0, + "step": 146950 + }, + { + "entropy": 1.888343983888626, + "epoch": 0.45556314173032464, + "grad_norm": 8.560070991516113, + "learning_rate": 3.748175067486742e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8476789712905883, + "num_tokens": 176801688.0, + "step": 146960 + }, + { + "entropy": 1.8695935264229775, + "epoch": 0.45559414085537436, + "grad_norm": 4.059643268585205, + "learning_rate": 3.7480475494786045e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8586703911423683, + "num_tokens": 176814689.0, + "step": 146970 + }, + { + "entropy": 1.8886354252696038, + "epoch": 0.45562513998042403, + "grad_norm": 3.8732738494873047, + "learning_rate": 3.7479200444845893e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8417670831084252, + "num_tokens": 176827493.0, + "step": 146980 + }, + { + "entropy": 1.8563087373971938, + "epoch": 0.45565613910547376, + "grad_norm": 8.856595993041992, + "learning_rate": 3.7477925525024837e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8550028428435326, + "num_tokens": 176840753.0, + "step": 146990 + }, + { + "entropy": 1.941939078271389, + "epoch": 0.4556871382305234, + "grad_norm": 4.509472846984863, + "learning_rate": 3.7476650735300728e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8547850877046586, + "num_tokens": 176852241.0, + "step": 147000 + }, + { + "entropy": 1.9420098468661309, + "epoch": 0.45571813735557315, + "grad_norm": 4.272128582000732, + "learning_rate": 3.747537607565146e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8548914834856987, + "num_tokens": 176863804.0, + "step": 147010 + }, + { + "entropy": 1.897740714251995, + "epoch": 0.4557491364806228, + "grad_norm": 9.78155517578125, + "learning_rate": 3.7474101546054897e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8575510829687119, + "num_tokens": 176876007.0, + "step": 147020 + }, + { + "entropy": 1.901134905219078, + "epoch": 0.45578013560567254, + "grad_norm": 8.032532691955566, + "learning_rate": 3.747282714648894e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8620973736047745, + "num_tokens": 176887951.0, + "step": 147030 + }, + { + "entropy": 1.9228925719857215, + "epoch": 0.4558111347307222, + "grad_norm": 3.396730422973633, + "learning_rate": 3.747155287693148e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8453203931450843, + "num_tokens": 176899964.0, + "step": 147040 + }, + { + "entropy": 1.9790107786655426, + "epoch": 0.45584213385577194, + "grad_norm": 6.975090026855469, + "learning_rate": 3.7470278737360395e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8480637580156326, + "num_tokens": 176911678.0, + "step": 147050 + }, + { + "entropy": 1.8822043985128403, + "epoch": 0.4558731329808216, + "grad_norm": 9.182622909545898, + "learning_rate": 3.7469004727753605e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8651730179786682, + "num_tokens": 176923300.0, + "step": 147060 + }, + { + "entropy": 1.9158610820770263, + "epoch": 0.45590413210587133, + "grad_norm": 3.5178349018096924, + "learning_rate": 3.746773084808901e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8595368161797523, + "num_tokens": 176935060.0, + "step": 147070 + }, + { + "entropy": 1.968504549562931, + "epoch": 0.455935131230921, + "grad_norm": 7.308728218078613, + "learning_rate": 3.7466457098344528e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8455461367964745, + "num_tokens": 176946490.0, + "step": 147080 + }, + { + "entropy": 1.8836671754717826, + "epoch": 0.4559661303559707, + "grad_norm": 8.155224800109863, + "learning_rate": 3.7465183478498068e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8532806396484375, + "num_tokens": 176959124.0, + "step": 147090 + }, + { + "entropy": 1.9047341987490654, + "epoch": 0.4559971294810204, + "grad_norm": 7.964391708374023, + "learning_rate": 3.7463909988527563e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8509239420294762, + "num_tokens": 176971598.0, + "step": 147100 + }, + { + "entropy": 1.9551147490739822, + "epoch": 0.4560281286060701, + "grad_norm": 7.997689723968506, + "learning_rate": 3.7462636628410933e-06, + "loss": 0.5301, + "mean_token_accuracy": 0.8321620270609855, + "num_tokens": 176982580.0, + "step": 147110 + }, + { + "entropy": 1.898515647649765, + "epoch": 0.4560591277311198, + "grad_norm": 2.4085474014282227, + "learning_rate": 3.7461363398126123e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8540950760245323, + "num_tokens": 176995367.0, + "step": 147120 + }, + { + "entropy": 1.841650950908661, + "epoch": 0.4560901268561695, + "grad_norm": 7.601720333099365, + "learning_rate": 3.746009029765105e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8667580097913742, + "num_tokens": 177008617.0, + "step": 147130 + }, + { + "entropy": 1.8938813239336014, + "epoch": 0.4561211259812192, + "grad_norm": 8.31351375579834, + "learning_rate": 3.745881732696369e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8541634380817413, + "num_tokens": 177021433.0, + "step": 147140 + }, + { + "entropy": 1.9346230700612068, + "epoch": 0.4561521251062689, + "grad_norm": 8.50265884399414, + "learning_rate": 3.745754448604195e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8484914928674698, + "num_tokens": 177033364.0, + "step": 147150 + }, + { + "entropy": 1.8723743125796317, + "epoch": 0.45618312423131857, + "grad_norm": 3.476085901260376, + "learning_rate": 3.745627177486383e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8580270186066628, + "num_tokens": 177046006.0, + "step": 147160 + }, + { + "entropy": 1.8366812959313392, + "epoch": 0.45621412335636824, + "grad_norm": 3.9904587268829346, + "learning_rate": 3.745499919340726e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8696741983294487, + "num_tokens": 177059038.0, + "step": 147170 + }, + { + "entropy": 1.8440161734819411, + "epoch": 0.45624512248141796, + "grad_norm": 9.400425910949707, + "learning_rate": 3.745372674165021e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8613582566380501, + "num_tokens": 177072384.0, + "step": 147180 + }, + { + "entropy": 1.9192558169364928, + "epoch": 0.45627612160646763, + "grad_norm": 10.3735933303833, + "learning_rate": 3.7452454419570656e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8616964593529701, + "num_tokens": 177084461.0, + "step": 147190 + }, + { + "entropy": 1.8800915464758874, + "epoch": 0.45630712073151736, + "grad_norm": 3.897484302520752, + "learning_rate": 3.745118222714657e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8533887133002281, + "num_tokens": 177096842.0, + "step": 147200 + }, + { + "entropy": 1.945913690328598, + "epoch": 0.456338119856567, + "grad_norm": 2.707284688949585, + "learning_rate": 3.7449910164355936e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.847554586827755, + "num_tokens": 177109235.0, + "step": 147210 + }, + { + "entropy": 1.9585506185889243, + "epoch": 0.45636911898161675, + "grad_norm": 7.910290718078613, + "learning_rate": 3.7448638231176737e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8551441594958306, + "num_tokens": 177120779.0, + "step": 147220 + }, + { + "entropy": 1.8726617515087127, + "epoch": 0.4564001181066664, + "grad_norm": 3.750462770462036, + "learning_rate": 3.7447366427586964e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8569039270281792, + "num_tokens": 177133126.0, + "step": 147230 + }, + { + "entropy": 1.91754612326622, + "epoch": 0.45643111723171614, + "grad_norm": 7.520792007446289, + "learning_rate": 3.7446094753564614e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8505095258355141, + "num_tokens": 177145098.0, + "step": 147240 + }, + { + "entropy": 1.9441439002752303, + "epoch": 0.4564621163567658, + "grad_norm": 10.833855628967285, + "learning_rate": 3.7444823209087682e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8531385481357574, + "num_tokens": 177156290.0, + "step": 147250 + }, + { + "entropy": 1.912196746468544, + "epoch": 0.45649311548181554, + "grad_norm": 9.136136054992676, + "learning_rate": 3.744355179413419e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8659820929169655, + "num_tokens": 177167879.0, + "step": 147260 + }, + { + "entropy": 1.7876418299973011, + "epoch": 0.4565241146068652, + "grad_norm": 7.6198039054870605, + "learning_rate": 3.7442280508682134e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8576739236712456, + "num_tokens": 177181505.0, + "step": 147270 + }, + { + "entropy": 1.925425273180008, + "epoch": 0.45655511373191493, + "grad_norm": 7.062708377838135, + "learning_rate": 3.7441009352709544e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8492991134524346, + "num_tokens": 177193315.0, + "step": 147280 + }, + { + "entropy": 1.847182884812355, + "epoch": 0.4565861128569646, + "grad_norm": 7.065945625305176, + "learning_rate": 3.7439738326194437e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8619561657309532, + "num_tokens": 177205609.0, + "step": 147290 + }, + { + "entropy": 1.8271872013807298, + "epoch": 0.4566171119820143, + "grad_norm": 4.04852294921875, + "learning_rate": 3.7438467429114837e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8713533550500869, + "num_tokens": 177219174.0, + "step": 147300 + }, + { + "entropy": 1.9408521130681038, + "epoch": 0.456648111107064, + "grad_norm": 8.535037994384766, + "learning_rate": 3.743719666144879e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.843614149093628, + "num_tokens": 177230752.0, + "step": 147310 + }, + { + "entropy": 1.8946807518601418, + "epoch": 0.4566791102321137, + "grad_norm": 3.0620830059051514, + "learning_rate": 3.743592602317431e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8616632342338562, + "num_tokens": 177243742.0, + "step": 147320 + }, + { + "entropy": 1.8989785492420197, + "epoch": 0.4567101093571634, + "grad_norm": 7.388391017913818, + "learning_rate": 3.743465551426947e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8464514210820198, + "num_tokens": 177256265.0, + "step": 147330 + }, + { + "entropy": 1.7777711182832718, + "epoch": 0.4567411084822131, + "grad_norm": 8.797972679138184, + "learning_rate": 3.7433385134712295e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8580346912145614, + "num_tokens": 177270945.0, + "step": 147340 + }, + { + "entropy": 2.003285530209541, + "epoch": 0.4567721076072628, + "grad_norm": 8.642237663269043, + "learning_rate": 3.7432114884480853e-06, + "loss": 0.5276, + "mean_token_accuracy": 0.8421431943774224, + "num_tokens": 177281633.0, + "step": 147350 + }, + { + "entropy": 1.9756213307380677, + "epoch": 0.4568031067323125, + "grad_norm": 9.405720710754395, + "learning_rate": 3.743084476355319e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8499099537730217, + "num_tokens": 177292337.0, + "step": 147360 + }, + { + "entropy": 1.8737393349409104, + "epoch": 0.45683410585736217, + "grad_norm": 6.993408203125, + "learning_rate": 3.742957477190739e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8587880119681358, + "num_tokens": 177304763.0, + "step": 147370 + }, + { + "entropy": 1.934411568939686, + "epoch": 0.4568651049824119, + "grad_norm": 8.104722023010254, + "learning_rate": 3.74283049095215e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8433166891336441, + "num_tokens": 177316276.0, + "step": 147380 + }, + { + "entropy": 1.9655498325824738, + "epoch": 0.45689610410746156, + "grad_norm": 8.411802291870117, + "learning_rate": 3.742703517637361e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8529037058353424, + "num_tokens": 177327855.0, + "step": 147390 + }, + { + "entropy": 1.848972088098526, + "epoch": 0.4569271032325113, + "grad_norm": 8.530176162719727, + "learning_rate": 3.74257655724418e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8518965750932693, + "num_tokens": 177340947.0, + "step": 147400 + }, + { + "entropy": 1.8785428568720817, + "epoch": 0.45695810235756096, + "grad_norm": 8.014692306518555, + "learning_rate": 3.742449609770415e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8454515695571899, + "num_tokens": 177352583.0, + "step": 147410 + }, + { + "entropy": 1.9009640589356422, + "epoch": 0.4569891014826106, + "grad_norm": 8.411507606506348, + "learning_rate": 3.7423226752138736e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8564955353736877, + "num_tokens": 177364796.0, + "step": 147420 + }, + { + "entropy": 1.8695711359381675, + "epoch": 0.45702010060766035, + "grad_norm": 8.759578704833984, + "learning_rate": 3.7421957535723686e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8556355476379395, + "num_tokens": 177377130.0, + "step": 147430 + }, + { + "entropy": 1.882589338719845, + "epoch": 0.45705109973271, + "grad_norm": 5.764090538024902, + "learning_rate": 3.7420688448437077e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8518973112106323, + "num_tokens": 177389852.0, + "step": 147440 + }, + { + "entropy": 1.9209093794226646, + "epoch": 0.45708209885775974, + "grad_norm": 9.359416007995605, + "learning_rate": 3.7419419490257012e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8516506165266037, + "num_tokens": 177401433.0, + "step": 147450 + }, + { + "entropy": 1.86415656208992, + "epoch": 0.4571130979828094, + "grad_norm": 8.562514305114746, + "learning_rate": 3.741815066116162e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8509325221180916, + "num_tokens": 177414478.0, + "step": 147460 + }, + { + "entropy": 1.745816995203495, + "epoch": 0.45714409710785914, + "grad_norm": 6.630068302154541, + "learning_rate": 3.7416881961129002e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8737103626132011, + "num_tokens": 177428315.0, + "step": 147470 + }, + { + "entropy": 1.8828457549214364, + "epoch": 0.4571750962329088, + "grad_norm": 4.396876335144043, + "learning_rate": 3.7415613390137295e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8575210615992546, + "num_tokens": 177440392.0, + "step": 147480 + }, + { + "entropy": 1.9914973288774491, + "epoch": 0.45720609535795853, + "grad_norm": 6.813479423522949, + "learning_rate": 3.7414344948164604e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.8524721592664719, + "num_tokens": 177451268.0, + "step": 147490 + }, + { + "entropy": 1.8895151317119598, + "epoch": 0.4572370944830082, + "grad_norm": 3.594588279724121, + "learning_rate": 3.741307663518908e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8583090662956238, + "num_tokens": 177463489.0, + "step": 147500 + }, + { + "entropy": 1.9084604054689407, + "epoch": 0.4572680936080579, + "grad_norm": 4.002728462219238, + "learning_rate": 3.741180845118885e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8463861241936683, + "num_tokens": 177475847.0, + "step": 147510 + }, + { + "entropy": 1.9138479217886926, + "epoch": 0.4572990927331076, + "grad_norm": 9.18737506866455, + "learning_rate": 3.7410540396142063e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8641066178679466, + "num_tokens": 177487582.0, + "step": 147520 + }, + { + "entropy": 1.87220705896616, + "epoch": 0.4573300918581573, + "grad_norm": 7.512433052062988, + "learning_rate": 3.740927247002686e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.858481977880001, + "num_tokens": 177499543.0, + "step": 147530 + }, + { + "entropy": 1.9340161770582198, + "epoch": 0.457361090983207, + "grad_norm": 7.679035186767578, + "learning_rate": 3.74080046728214e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8467412024736405, + "num_tokens": 177510615.0, + "step": 147540 + }, + { + "entropy": 1.8590017393231393, + "epoch": 0.4573920901082567, + "grad_norm": 9.347676277160645, + "learning_rate": 3.7406737004503834e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8628332495689393, + "num_tokens": 177522638.0, + "step": 147550 + }, + { + "entropy": 1.911301201581955, + "epoch": 0.4574230892333064, + "grad_norm": 8.779614448547363, + "learning_rate": 3.740546946505233e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8550707563757897, + "num_tokens": 177534749.0, + "step": 147560 + }, + { + "entropy": 1.9754537165164947, + "epoch": 0.4574540883583561, + "grad_norm": 8.753040313720703, + "learning_rate": 3.740420205444505e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.853889499604702, + "num_tokens": 177546067.0, + "step": 147570 + }, + { + "entropy": 1.9112480387091637, + "epoch": 0.4574850874834058, + "grad_norm": 4.434845447540283, + "learning_rate": 3.740293477266017e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8533158525824547, + "num_tokens": 177557969.0, + "step": 147580 + }, + { + "entropy": 1.877299964427948, + "epoch": 0.4575160866084555, + "grad_norm": 7.680180072784424, + "learning_rate": 3.7401667619675876e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8607863545417785, + "num_tokens": 177570545.0, + "step": 147590 + }, + { + "entropy": 1.9265247106552124, + "epoch": 0.45754708573350517, + "grad_norm": 8.468583106994629, + "learning_rate": 3.7400400595470337e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8503778129816055, + "num_tokens": 177582395.0, + "step": 147600 + }, + { + "entropy": 1.8167890459299088, + "epoch": 0.4575780848585549, + "grad_norm": 8.333720207214355, + "learning_rate": 3.7399133700021756e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8408757716417312, + "num_tokens": 177596261.0, + "step": 147610 + }, + { + "entropy": 1.8485788330435753, + "epoch": 0.45760908398360456, + "grad_norm": 7.62937593460083, + "learning_rate": 3.7397866933308325e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8608444139361382, + "num_tokens": 177608763.0, + "step": 147620 + }, + { + "entropy": 1.9072691813111304, + "epoch": 0.4576400831086543, + "grad_norm": 9.699867248535156, + "learning_rate": 3.7396600295308234e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8526267111301422, + "num_tokens": 177621101.0, + "step": 147630 + }, + { + "entropy": 1.9000159114599229, + "epoch": 0.45767108223370395, + "grad_norm": 7.382110118865967, + "learning_rate": 3.7395333785999692e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8545347273349762, + "num_tokens": 177633245.0, + "step": 147640 + }, + { + "entropy": 1.9029620870947839, + "epoch": 0.4577020813587537, + "grad_norm": 9.210521697998047, + "learning_rate": 3.739406740536092e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8504366457462311, + "num_tokens": 177643987.0, + "step": 147650 + }, + { + "entropy": 1.9046365901827813, + "epoch": 0.45773308048380335, + "grad_norm": 4.034806251525879, + "learning_rate": 3.739280115337011e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8487413614988327, + "num_tokens": 177655795.0, + "step": 147660 + }, + { + "entropy": 1.8236022911965848, + "epoch": 0.457764079608853, + "grad_norm": 4.0919623374938965, + "learning_rate": 3.73915350300055e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8643067702651024, + "num_tokens": 177668862.0, + "step": 147670 + }, + { + "entropy": 1.9250768944621086, + "epoch": 0.45779507873390274, + "grad_norm": 8.812362670898438, + "learning_rate": 3.7390269035245302e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8513668492436409, + "num_tokens": 177681125.0, + "step": 147680 + }, + { + "entropy": 1.8930511608719827, + "epoch": 0.4578260778589524, + "grad_norm": 7.733847141265869, + "learning_rate": 3.738900316906776e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8619498953223228, + "num_tokens": 177693301.0, + "step": 147690 + }, + { + "entropy": 1.9794332563877106, + "epoch": 0.45785707698400213, + "grad_norm": 7.670042514801025, + "learning_rate": 3.7387737431451097e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8443345949053764, + "num_tokens": 177704344.0, + "step": 147700 + }, + { + "entropy": 1.9156940042972566, + "epoch": 0.4578880761090518, + "grad_norm": 7.951246738433838, + "learning_rate": 3.738647182237357e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8424468949437142, + "num_tokens": 177716411.0, + "step": 147710 + }, + { + "entropy": 1.8918796986341477, + "epoch": 0.4579190752341015, + "grad_norm": 6.88615083694458, + "learning_rate": 3.738520634181341e-06, + "loss": 0.414, + "mean_token_accuracy": 0.861275652050972, + "num_tokens": 177728341.0, + "step": 147720 + }, + { + "entropy": 1.849881935119629, + "epoch": 0.4579500743591512, + "grad_norm": 3.4234378337860107, + "learning_rate": 3.738394098974886e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8636976391077041, + "num_tokens": 177740803.0, + "step": 147730 + }, + { + "entropy": 1.8762715697288512, + "epoch": 0.4579810734842009, + "grad_norm": 8.60593318939209, + "learning_rate": 3.7382675766158196e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8438662976026535, + "num_tokens": 177752757.0, + "step": 147740 + }, + { + "entropy": 1.9411585584282876, + "epoch": 0.4580120726092506, + "grad_norm": 9.615153312683105, + "learning_rate": 3.738141067101967e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8427041709423065, + "num_tokens": 177764050.0, + "step": 147750 + }, + { + "entropy": 1.8888093829154968, + "epoch": 0.4580430717343003, + "grad_norm": 8.258987426757812, + "learning_rate": 3.7380145704311548e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.85269236266613, + "num_tokens": 177775788.0, + "step": 147760 + }, + { + "entropy": 1.857784403860569, + "epoch": 0.45807407085935, + "grad_norm": 3.8808376789093018, + "learning_rate": 3.73788808660121e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8614834606647491, + "num_tokens": 177787727.0, + "step": 147770 + }, + { + "entropy": 1.9264632612466812, + "epoch": 0.4581050699843997, + "grad_norm": 8.179401397705078, + "learning_rate": 3.7377616156099605e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8487723514437675, + "num_tokens": 177799380.0, + "step": 147780 + }, + { + "entropy": 1.9536435410380364, + "epoch": 0.4581360691094494, + "grad_norm": 8.652231216430664, + "learning_rate": 3.737635157455235e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.8477381393313408, + "num_tokens": 177810671.0, + "step": 147790 + }, + { + "entropy": 1.8677313759922982, + "epoch": 0.4581670682344991, + "grad_norm": 8.161964416503906, + "learning_rate": 3.7375087121348613e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8476336553692818, + "num_tokens": 177823218.0, + "step": 147800 + }, + { + "entropy": 1.8325920164585114, + "epoch": 0.45819806735954877, + "grad_norm": 7.421534538269043, + "learning_rate": 3.737382279646669e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8645919308066368, + "num_tokens": 177836151.0, + "step": 147810 + }, + { + "entropy": 1.9766230046749116, + "epoch": 0.4582290664845985, + "grad_norm": 8.750480651855469, + "learning_rate": 3.7372558599884873e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.8399770334362984, + "num_tokens": 177847098.0, + "step": 147820 + }, + { + "entropy": 1.9309228926897049, + "epoch": 0.45826006560964816, + "grad_norm": 7.476661205291748, + "learning_rate": 3.737129453158147e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.842421543598175, + "num_tokens": 177859514.0, + "step": 147830 + }, + { + "entropy": 1.847913098335266, + "epoch": 0.4582910647346979, + "grad_norm": 6.960815906524658, + "learning_rate": 3.737003059153479e-06, + "loss": 0.5784, + "mean_token_accuracy": 0.8363130848854781, + "num_tokens": 177873072.0, + "step": 147840 + }, + { + "entropy": 1.8798335790634155, + "epoch": 0.45832206385974755, + "grad_norm": 7.6284894943237305, + "learning_rate": 3.7368766779723135e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8519891336560249, + "num_tokens": 177884925.0, + "step": 147850 + }, + { + "entropy": 1.9396116152405738, + "epoch": 0.4583530629847973, + "grad_norm": 7.631931304931641, + "learning_rate": 3.7367503096124836e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8487934455275535, + "num_tokens": 177896276.0, + "step": 147860 + }, + { + "entropy": 1.9332348197698592, + "epoch": 0.45838406210984695, + "grad_norm": 9.10966682434082, + "learning_rate": 3.7366239540718206e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8453480824828148, + "num_tokens": 177907658.0, + "step": 147870 + }, + { + "entropy": 1.847151155769825, + "epoch": 0.45841506123489667, + "grad_norm": 7.519127368927002, + "learning_rate": 3.736497611348158e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8468550607562065, + "num_tokens": 177919761.0, + "step": 147880 + }, + { + "entropy": 1.854358959197998, + "epoch": 0.45844606035994634, + "grad_norm": 7.146642684936523, + "learning_rate": 3.7363712814393296e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8587031260132789, + "num_tokens": 177931858.0, + "step": 147890 + }, + { + "entropy": 1.8741928294301033, + "epoch": 0.45847705948499606, + "grad_norm": 8.542024612426758, + "learning_rate": 3.7362449643431677e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8568659141659737, + "num_tokens": 177943416.0, + "step": 147900 + }, + { + "entropy": 1.9269966036081314, + "epoch": 0.45850805861004573, + "grad_norm": 5.178137302398682, + "learning_rate": 3.736118660057506e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8503173589706421, + "num_tokens": 177954691.0, + "step": 147910 + }, + { + "entropy": 1.856522725522518, + "epoch": 0.4585390577350954, + "grad_norm": 7.2768330574035645, + "learning_rate": 3.735992368580183e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8675465360283852, + "num_tokens": 177966749.0, + "step": 147920 + }, + { + "entropy": 1.8835629358887673, + "epoch": 0.4585700568601451, + "grad_norm": 3.9212841987609863, + "learning_rate": 3.73586608990903e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.853912553191185, + "num_tokens": 177978879.0, + "step": 147930 + }, + { + "entropy": 1.8709630161523818, + "epoch": 0.4586010559851948, + "grad_norm": 9.633736610412598, + "learning_rate": 3.735739824041885e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8498531699180603, + "num_tokens": 177990951.0, + "step": 147940 + }, + { + "entropy": 1.9850156009197235, + "epoch": 0.4586320551102445, + "grad_norm": 8.021490097045898, + "learning_rate": 3.735613570976584e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.8361830487847328, + "num_tokens": 178001723.0, + "step": 147950 + }, + { + "entropy": 1.9277898535132407, + "epoch": 0.4586630542352942, + "grad_norm": 8.133795738220215, + "learning_rate": 3.735487330710964e-06, + "loss": 0.5754, + "mean_token_accuracy": 0.8342059582471848, + "num_tokens": 178013683.0, + "step": 147960 + }, + { + "entropy": 1.9039981633424758, + "epoch": 0.4586940533603439, + "grad_norm": 9.529333114624023, + "learning_rate": 3.735361103242862e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8599925115704536, + "num_tokens": 178025035.0, + "step": 147970 + }, + { + "entropy": 1.8207675963640213, + "epoch": 0.4587250524853936, + "grad_norm": 4.995032787322998, + "learning_rate": 3.735234888570117e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8648058965802192, + "num_tokens": 178038165.0, + "step": 147980 + }, + { + "entropy": 1.7604272417724132, + "epoch": 0.4587560516104433, + "grad_norm": 7.621956825256348, + "learning_rate": 3.735108686690566e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8773904070258141, + "num_tokens": 178051753.0, + "step": 147990 + }, + { + "entropy": 1.8666868284344673, + "epoch": 0.458787050735493, + "grad_norm": 8.41880989074707, + "learning_rate": 3.7349824976020483e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8527174681425095, + "num_tokens": 178063598.0, + "step": 148000 + }, + { + "entropy": 1.8387210443615913, + "epoch": 0.4588180498605427, + "grad_norm": 3.875763416290283, + "learning_rate": 3.7348563213024038e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8554558053612709, + "num_tokens": 178075606.0, + "step": 148010 + }, + { + "entropy": 1.8273346543312072, + "epoch": 0.45884904898559237, + "grad_norm": 9.78944206237793, + "learning_rate": 3.7347301577894716e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8571007132530213, + "num_tokens": 178088272.0, + "step": 148020 + }, + { + "entropy": 1.8279028117656708, + "epoch": 0.4588800481106421, + "grad_norm": 7.993279933929443, + "learning_rate": 3.7346040070610935e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8609201908111572, + "num_tokens": 178100592.0, + "step": 148030 + }, + { + "entropy": 1.9010518863797188, + "epoch": 0.45891104723569176, + "grad_norm": 8.148711204528809, + "learning_rate": 3.73447786911511e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8482613220810891, + "num_tokens": 178112256.0, + "step": 148040 + }, + { + "entropy": 1.84859948605299, + "epoch": 0.4589420463607415, + "grad_norm": 6.481073379516602, + "learning_rate": 3.734351743949362e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8522376015782356, + "num_tokens": 178124130.0, + "step": 148050 + }, + { + "entropy": 1.9222456485033035, + "epoch": 0.45897304548579115, + "grad_norm": 9.769478797912598, + "learning_rate": 3.734225631561692e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.858902907371521, + "num_tokens": 178135001.0, + "step": 148060 + }, + { + "entropy": 1.9226790130138398, + "epoch": 0.4590040446108409, + "grad_norm": 7.749213218688965, + "learning_rate": 3.7340995319499424e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8433895200490952, + "num_tokens": 178146194.0, + "step": 148070 + }, + { + "entropy": 1.7752541199326515, + "epoch": 0.45903504373589055, + "grad_norm": 9.610624313354492, + "learning_rate": 3.7339734451119556e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8526219561696052, + "num_tokens": 178159988.0, + "step": 148080 + }, + { + "entropy": 1.7483320951461792, + "epoch": 0.45906604286094027, + "grad_norm": 8.351910591125488, + "learning_rate": 3.733847371045577e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8714915245771409, + "num_tokens": 178173336.0, + "step": 148090 + }, + { + "entropy": 1.8959960505366324, + "epoch": 0.45909704198598994, + "grad_norm": 4.344943523406982, + "learning_rate": 3.7337213097486484e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.852771207690239, + "num_tokens": 178184748.0, + "step": 148100 + }, + { + "entropy": 1.8576713547110557, + "epoch": 0.45912804111103966, + "grad_norm": 8.485532760620117, + "learning_rate": 3.733595261219016e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8570611268281937, + "num_tokens": 178196913.0, + "step": 148110 + }, + { + "entropy": 1.8324482500553132, + "epoch": 0.45915904023608933, + "grad_norm": 3.7649433612823486, + "learning_rate": 3.7334692254545234e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8562913209199905, + "num_tokens": 178209039.0, + "step": 148120 + }, + { + "entropy": 1.8635966286063195, + "epoch": 0.45919003936113906, + "grad_norm": 3.6343321800231934, + "learning_rate": 3.733343202453017e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8553838461637497, + "num_tokens": 178220749.0, + "step": 148130 + }, + { + "entropy": 1.8851610526442528, + "epoch": 0.4592210384861887, + "grad_norm": 6.960264682769775, + "learning_rate": 3.7332171922123433e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8590750455856323, + "num_tokens": 178232494.0, + "step": 148140 + }, + { + "entropy": 1.8172919273376464, + "epoch": 0.4592520376112384, + "grad_norm": 8.392343521118164, + "learning_rate": 3.733091194730348e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.844761498272419, + "num_tokens": 178245770.0, + "step": 148150 + }, + { + "entropy": 1.8854141429066658, + "epoch": 0.4592830367362881, + "grad_norm": 8.518744468688965, + "learning_rate": 3.7329652100048796e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8478623673319816, + "num_tokens": 178257483.0, + "step": 148160 + }, + { + "entropy": 1.9081405073404312, + "epoch": 0.4593140358613378, + "grad_norm": 8.384743690490723, + "learning_rate": 3.7328392380337842e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8525558218359948, + "num_tokens": 178268727.0, + "step": 148170 + }, + { + "entropy": 1.8962554574012755, + "epoch": 0.4593450349863875, + "grad_norm": 9.84249496459961, + "learning_rate": 3.73271327881491e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8616734966635704, + "num_tokens": 178280654.0, + "step": 148180 + }, + { + "entropy": 1.9595209866762162, + "epoch": 0.4593760341114372, + "grad_norm": 7.133771896362305, + "learning_rate": 3.732587332346106e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8539301365613937, + "num_tokens": 178292341.0, + "step": 148190 + }, + { + "entropy": 1.9443857610225677, + "epoch": 0.4594070332364869, + "grad_norm": 7.489109516143799, + "learning_rate": 3.732461398625222e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8486480563879013, + "num_tokens": 178303553.0, + "step": 148200 + }, + { + "entropy": 1.8499363631010055, + "epoch": 0.4594380323615366, + "grad_norm": 9.401103019714355, + "learning_rate": 3.7323354776501063e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8625474691390991, + "num_tokens": 178315044.0, + "step": 148210 + }, + { + "entropy": 1.9178618222475052, + "epoch": 0.4594690314865863, + "grad_norm": 7.870799541473389, + "learning_rate": 3.73220956941861e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8432634264230728, + "num_tokens": 178326928.0, + "step": 148220 + }, + { + "entropy": 1.948700188100338, + "epoch": 0.45950003061163597, + "grad_norm": 9.164449691772461, + "learning_rate": 3.7320836739285838e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8515329852700233, + "num_tokens": 178338112.0, + "step": 148230 + }, + { + "entropy": 1.9085903450846673, + "epoch": 0.4595310297366857, + "grad_norm": 8.133758544921875, + "learning_rate": 3.731957791177878e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8663553714752197, + "num_tokens": 178349859.0, + "step": 148240 + }, + { + "entropy": 1.8798146709799766, + "epoch": 0.45956202886173536, + "grad_norm": 3.9677722454071045, + "learning_rate": 3.7318319211643456e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8445356607437133, + "num_tokens": 178362252.0, + "step": 148250 + }, + { + "entropy": 1.843547348678112, + "epoch": 0.4595930279867851, + "grad_norm": 9.012165069580078, + "learning_rate": 3.731706063885837e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8570879876613617, + "num_tokens": 178374338.0, + "step": 148260 + }, + { + "entropy": 1.897803196310997, + "epoch": 0.45962402711183475, + "grad_norm": 2.348179817199707, + "learning_rate": 3.731580219340206e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8629708766937256, + "num_tokens": 178385867.0, + "step": 148270 + }, + { + "entropy": 1.755355989933014, + "epoch": 0.4596550262368845, + "grad_norm": 5.742542743682861, + "learning_rate": 3.7314543875253065e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8633789956569672, + "num_tokens": 178400091.0, + "step": 148280 + }, + { + "entropy": 1.8990287438035012, + "epoch": 0.45968602536193415, + "grad_norm": 8.405163764953613, + "learning_rate": 3.731328568438991e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8494486227631569, + "num_tokens": 178412761.0, + "step": 148290 + }, + { + "entropy": 1.8492143407464028, + "epoch": 0.45971702448698387, + "grad_norm": 8.039115905761719, + "learning_rate": 3.731202762079114e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8641363605856895, + "num_tokens": 178425735.0, + "step": 148300 + }, + { + "entropy": 1.9216509327292441, + "epoch": 0.45974802361203354, + "grad_norm": 8.291874885559082, + "learning_rate": 3.7310769684435306e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8502373903989792, + "num_tokens": 178437058.0, + "step": 148310 + }, + { + "entropy": 1.925723034143448, + "epoch": 0.45977902273708326, + "grad_norm": 9.9407320022583, + "learning_rate": 3.730951187530095e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8500989750027657, + "num_tokens": 178448170.0, + "step": 148320 + }, + { + "entropy": 1.8789965465664864, + "epoch": 0.45981002186213293, + "grad_norm": 8.22014045715332, + "learning_rate": 3.7308254193366646e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8639925360679627, + "num_tokens": 178461198.0, + "step": 148330 + }, + { + "entropy": 1.9338722795248031, + "epoch": 0.45984102098718266, + "grad_norm": 7.892792224884033, + "learning_rate": 3.7306996638610936e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8488512441515923, + "num_tokens": 178471923.0, + "step": 148340 + }, + { + "entropy": 1.9255221277475356, + "epoch": 0.4598720201122323, + "grad_norm": 4.1723198890686035, + "learning_rate": 3.7305739211012404e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8413741737604141, + "num_tokens": 178483299.0, + "step": 148350 + }, + { + "entropy": 1.9141563907265664, + "epoch": 0.45990301923728205, + "grad_norm": 7.0736260414123535, + "learning_rate": 3.7304481910549613e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8529133036732673, + "num_tokens": 178494625.0, + "step": 148360 + }, + { + "entropy": 1.8494025871157647, + "epoch": 0.4599340183623317, + "grad_norm": 7.612475872039795, + "learning_rate": 3.7303224737201137e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8510186776518822, + "num_tokens": 178507700.0, + "step": 148370 + }, + { + "entropy": 1.8061942994594573, + "epoch": 0.45996501748738144, + "grad_norm": 4.09743595123291, + "learning_rate": 3.730196769094557e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8509301677346229, + "num_tokens": 178521259.0, + "step": 148380 + }, + { + "entropy": 1.9035452425479888, + "epoch": 0.4599960166124311, + "grad_norm": 3.9658563137054443, + "learning_rate": 3.73007107717615e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8570003524422646, + "num_tokens": 178532802.0, + "step": 148390 + }, + { + "entropy": 1.821213935315609, + "epoch": 0.4600270157374808, + "grad_norm": 3.9787087440490723, + "learning_rate": 3.729945397962751e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.869298306107521, + "num_tokens": 178545572.0, + "step": 148400 + }, + { + "entropy": 1.796069860458374, + "epoch": 0.4600580148625305, + "grad_norm": 4.056951999664307, + "learning_rate": 3.72981973145222e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8616714045405388, + "num_tokens": 178558915.0, + "step": 148410 + }, + { + "entropy": 1.9034935608506203, + "epoch": 0.4600890139875802, + "grad_norm": 8.9136381149292, + "learning_rate": 3.7296940776424174e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.851823453605175, + "num_tokens": 178570522.0, + "step": 148420 + }, + { + "entropy": 1.921525527536869, + "epoch": 0.4601200131126299, + "grad_norm": 8.047652244567871, + "learning_rate": 3.7295684365312045e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8542478293180465, + "num_tokens": 178581965.0, + "step": 148430 + }, + { + "entropy": 1.8355329155921936, + "epoch": 0.46015101223767957, + "grad_norm": 4.0909223556518555, + "learning_rate": 3.7294428081164413e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8662302419543266, + "num_tokens": 178594538.0, + "step": 148440 + }, + { + "entropy": 1.859887145459652, + "epoch": 0.4601820113627293, + "grad_norm": 9.011685371398926, + "learning_rate": 3.729317192395991e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8597676187753678, + "num_tokens": 178606405.0, + "step": 148450 + }, + { + "entropy": 1.8634124889969825, + "epoch": 0.46021301048777896, + "grad_norm": 7.429311752319336, + "learning_rate": 3.729191589367715e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8525898218154907, + "num_tokens": 178619148.0, + "step": 148460 + }, + { + "entropy": 1.8275597229599954, + "epoch": 0.4602440096128287, + "grad_norm": 3.9588801860809326, + "learning_rate": 3.729065999029476e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8704280957579613, + "num_tokens": 178630852.0, + "step": 148470 + }, + { + "entropy": 1.8700731366872787, + "epoch": 0.46027500873787836, + "grad_norm": 9.530686378479004, + "learning_rate": 3.728940421379138e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8597807481884956, + "num_tokens": 178643067.0, + "step": 148480 + }, + { + "entropy": 1.8858630776405334, + "epoch": 0.4603060078629281, + "grad_norm": 8.52836799621582, + "learning_rate": 3.7288148564145645e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.84766735881567, + "num_tokens": 178654791.0, + "step": 148490 + }, + { + "entropy": 1.9085488021373749, + "epoch": 0.46033700698797775, + "grad_norm": 7.8866095542907715, + "learning_rate": 3.7286893041336187e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8404016211628914, + "num_tokens": 178666290.0, + "step": 148500 + }, + { + "entropy": 1.9258504971861838, + "epoch": 0.4603680061130275, + "grad_norm": 8.244267463684082, + "learning_rate": 3.728563764534167e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8527882546186447, + "num_tokens": 178677958.0, + "step": 148510 + }, + { + "entropy": 1.93827313631773, + "epoch": 0.46039900523807714, + "grad_norm": 7.484049320220947, + "learning_rate": 3.728438237614075e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8486429572105407, + "num_tokens": 178689438.0, + "step": 148520 + }, + { + "entropy": 1.8482612371444702, + "epoch": 0.46043000436312687, + "grad_norm": 6.9296650886535645, + "learning_rate": 3.7283127233712067e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8683997794985772, + "num_tokens": 178702457.0, + "step": 148530 + }, + { + "entropy": 1.9950364649295806, + "epoch": 0.46046100348817653, + "grad_norm": 7.755855560302734, + "learning_rate": 3.7281872218034292e-06, + "loss": 0.5214, + "mean_token_accuracy": 0.838994313776493, + "num_tokens": 178713062.0, + "step": 148540 + }, + { + "entropy": 1.7965051412582398, + "epoch": 0.46049200261322626, + "grad_norm": 7.688485622406006, + "learning_rate": 3.7280617329086093e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8706543311476708, + "num_tokens": 178726109.0, + "step": 148550 + }, + { + "entropy": 1.939668095111847, + "epoch": 0.46052300173827593, + "grad_norm": 8.922786712646484, + "learning_rate": 3.7279362566846154e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8509280979633331, + "num_tokens": 178737184.0, + "step": 148560 + }, + { + "entropy": 1.8518592774868012, + "epoch": 0.46055400086332565, + "grad_norm": 8.265620231628418, + "learning_rate": 3.7278107931293138e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8692864283919335, + "num_tokens": 178749576.0, + "step": 148570 + }, + { + "entropy": 1.9392090559005737, + "epoch": 0.4605849999883753, + "grad_norm": 8.548821449279785, + "learning_rate": 3.7276853422405733e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8460419103503227, + "num_tokens": 178760264.0, + "step": 148580 + }, + { + "entropy": 1.9111363351345063, + "epoch": 0.46061599911342505, + "grad_norm": 7.530226230621338, + "learning_rate": 3.7275599040162634e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8527784243226051, + "num_tokens": 178771695.0, + "step": 148590 + }, + { + "entropy": 1.8285814180970192, + "epoch": 0.4606469982384747, + "grad_norm": 3.894634485244751, + "learning_rate": 3.727434478454252e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8559230253100395, + "num_tokens": 178784229.0, + "step": 148600 + }, + { + "entropy": 1.8875297084450722, + "epoch": 0.46067799736352444, + "grad_norm": 7.381993293762207, + "learning_rate": 3.7273090655524108e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8570609837770462, + "num_tokens": 178796492.0, + "step": 148610 + }, + { + "entropy": 1.8404451578855514, + "epoch": 0.4607089964885741, + "grad_norm": 7.309342384338379, + "learning_rate": 3.7271836653086084e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8731897443532943, + "num_tokens": 178808742.0, + "step": 148620 + }, + { + "entropy": 1.879277119040489, + "epoch": 0.46073999561362383, + "grad_norm": 9.954212188720703, + "learning_rate": 3.7270582777207166e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8552733480930328, + "num_tokens": 178820065.0, + "step": 148630 + }, + { + "entropy": 1.909400151669979, + "epoch": 0.4607709947386735, + "grad_norm": 8.673418045043945, + "learning_rate": 3.7269329027866064e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8421248093247413, + "num_tokens": 178832605.0, + "step": 148640 + }, + { + "entropy": 1.9523657143115998, + "epoch": 0.46080199386372317, + "grad_norm": 7.831915378570557, + "learning_rate": 3.7268075405041496e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8484080314636231, + "num_tokens": 178844397.0, + "step": 148650 + }, + { + "entropy": 1.7859274119138717, + "epoch": 0.4608329929887729, + "grad_norm": 8.42618465423584, + "learning_rate": 3.7266821908712185e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8656513899564743, + "num_tokens": 178856946.0, + "step": 148660 + }, + { + "entropy": 1.9320579081773759, + "epoch": 0.46086399211382256, + "grad_norm": 7.774157524108887, + "learning_rate": 3.7265568538856866e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8523464262485504, + "num_tokens": 178868317.0, + "step": 148670 + }, + { + "entropy": 1.8498662784695625, + "epoch": 0.4608949912388723, + "grad_norm": 8.220366477966309, + "learning_rate": 3.726431529545426e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8491094589233399, + "num_tokens": 178880925.0, + "step": 148680 + }, + { + "entropy": 1.9150644198060036, + "epoch": 0.46092599036392196, + "grad_norm": 9.27624225616455, + "learning_rate": 3.726306217848312e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8403692692518234, + "num_tokens": 178892546.0, + "step": 148690 + }, + { + "entropy": 1.7894390970468521, + "epoch": 0.4609569894889717, + "grad_norm": 9.18440055847168, + "learning_rate": 3.726180918792218e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8709027290344238, + "num_tokens": 178904334.0, + "step": 148700 + }, + { + "entropy": 1.858834259212017, + "epoch": 0.46098798861402135, + "grad_norm": 3.9686553478240967, + "learning_rate": 3.7260556323750185e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8583307415246964, + "num_tokens": 178916596.0, + "step": 148710 + }, + { + "entropy": 1.8790298044681548, + "epoch": 0.4610189877390711, + "grad_norm": 8.098514556884766, + "learning_rate": 3.72593035859459e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8534575864672661, + "num_tokens": 178929270.0, + "step": 148720 + }, + { + "entropy": 1.8725609213113785, + "epoch": 0.46104998686412074, + "grad_norm": 8.010581016540527, + "learning_rate": 3.725805097448807e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8627361282706261, + "num_tokens": 178941517.0, + "step": 148730 + }, + { + "entropy": 1.9305458396673203, + "epoch": 0.46108098598917047, + "grad_norm": 8.154109954833984, + "learning_rate": 3.7256798489355474e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8501837208867074, + "num_tokens": 178953101.0, + "step": 148740 + }, + { + "entropy": 1.9796837285161017, + "epoch": 0.46111198511422014, + "grad_norm": 9.379487991333008, + "learning_rate": 3.7255546130526867e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8350688204169273, + "num_tokens": 178964485.0, + "step": 148750 + }, + { + "entropy": 1.951874351501465, + "epoch": 0.46114298423926986, + "grad_norm": 7.641496658325195, + "learning_rate": 3.7254293897981025e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.843595664203167, + "num_tokens": 178975573.0, + "step": 148760 + }, + { + "entropy": 1.8258162140846252, + "epoch": 0.46117398336431953, + "grad_norm": 2.5984456539154053, + "learning_rate": 3.725304179169673e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8542465507984162, + "num_tokens": 178988605.0, + "step": 148770 + }, + { + "entropy": 1.8872524976730347, + "epoch": 0.46120498248936925, + "grad_norm": 7.967230319976807, + "learning_rate": 3.7251789811652763e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8548190101981163, + "num_tokens": 179001072.0, + "step": 148780 + }, + { + "entropy": 1.9689855933189393, + "epoch": 0.4612359816144189, + "grad_norm": 4.094521522521973, + "learning_rate": 3.7250537957827913e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.8418999388813972, + "num_tokens": 179012080.0, + "step": 148790 + }, + { + "entropy": 1.9371369108557701, + "epoch": 0.46126698073946865, + "grad_norm": 7.492170333862305, + "learning_rate": 3.7249286230200974e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.854214908182621, + "num_tokens": 179023118.0, + "step": 148800 + }, + { + "entropy": 1.941811603307724, + "epoch": 0.4612979798645183, + "grad_norm": 7.5748491287231445, + "learning_rate": 3.7248034628750744e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.85891852080822, + "num_tokens": 179034894.0, + "step": 148810 + }, + { + "entropy": 1.8954156935214996, + "epoch": 0.46132897898956804, + "grad_norm": 3.9654111862182617, + "learning_rate": 3.7246783153456033e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8546635210514069, + "num_tokens": 179047196.0, + "step": 148820 + }, + { + "entropy": 1.9124370649456979, + "epoch": 0.4613599781146177, + "grad_norm": 8.43816089630127, + "learning_rate": 3.7245531804295637e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8510984659194947, + "num_tokens": 179058196.0, + "step": 148830 + }, + { + "entropy": 1.949829702079296, + "epoch": 0.46139097723966743, + "grad_norm": 9.092519760131836, + "learning_rate": 3.724428058124837e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8465055644512176, + "num_tokens": 179069758.0, + "step": 148840 + }, + { + "entropy": 1.9349031627178193, + "epoch": 0.4614219763647171, + "grad_norm": 7.756466865539551, + "learning_rate": 3.7243029484293057e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8493330150842666, + "num_tokens": 179081712.0, + "step": 148850 + }, + { + "entropy": 1.8835980251431466, + "epoch": 0.4614529754897668, + "grad_norm": 5.772395610809326, + "learning_rate": 3.724177851340852e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.8446671605110169, + "num_tokens": 179095186.0, + "step": 148860 + }, + { + "entropy": 1.9193114623427392, + "epoch": 0.4614839746148165, + "grad_norm": 7.486071586608887, + "learning_rate": 3.724052766857359e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8581682324409485, + "num_tokens": 179107120.0, + "step": 148870 + }, + { + "entropy": 1.9004424124956132, + "epoch": 0.4615149737398662, + "grad_norm": 4.124865531921387, + "learning_rate": 3.7239276949767094e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8557851612567902, + "num_tokens": 179119606.0, + "step": 148880 + }, + { + "entropy": 1.8963897615671157, + "epoch": 0.4615459728649159, + "grad_norm": 3.557903289794922, + "learning_rate": 3.7238026356967873e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8588035330176353, + "num_tokens": 179132126.0, + "step": 148890 + }, + { + "entropy": 1.9891429126262665, + "epoch": 0.46157697198996556, + "grad_norm": 8.919794082641602, + "learning_rate": 3.7236775890154765e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8473231881856919, + "num_tokens": 179143132.0, + "step": 148900 + }, + { + "entropy": 1.9493281915783882, + "epoch": 0.4616079711150153, + "grad_norm": 9.03589153289795, + "learning_rate": 3.723552554930663e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8437724366784096, + "num_tokens": 179154839.0, + "step": 148910 + }, + { + "entropy": 1.879402995109558, + "epoch": 0.46163897024006495, + "grad_norm": 7.5339741706848145, + "learning_rate": 3.7234275334402314e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8598219409585, + "num_tokens": 179167503.0, + "step": 148920 + }, + { + "entropy": 1.8404528819024564, + "epoch": 0.4616699693651147, + "grad_norm": 5.325761795043945, + "learning_rate": 3.7233025245420666e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8565814971923829, + "num_tokens": 179180748.0, + "step": 148930 + }, + { + "entropy": 1.8871916115283967, + "epoch": 0.46170096849016434, + "grad_norm": 7.421195983886719, + "learning_rate": 3.7231775282340564e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8600098505616188, + "num_tokens": 179192693.0, + "step": 148940 + }, + { + "entropy": 1.9641156986355781, + "epoch": 0.46173196761521407, + "grad_norm": 7.706519603729248, + "learning_rate": 3.7230525445140874e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8449445277452469, + "num_tokens": 179203723.0, + "step": 148950 + }, + { + "entropy": 1.9411569505929946, + "epoch": 0.46176296674026374, + "grad_norm": 9.283172607421875, + "learning_rate": 3.7229275733800462e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8541247725486756, + "num_tokens": 179215213.0, + "step": 148960 + }, + { + "entropy": 2.000536176562309, + "epoch": 0.46179396586531346, + "grad_norm": 7.330437183380127, + "learning_rate": 3.722802614829821e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8524016052484512, + "num_tokens": 179225916.0, + "step": 148970 + }, + { + "entropy": 1.841599515080452, + "epoch": 0.46182496499036313, + "grad_norm": 8.324941635131836, + "learning_rate": 3.7226776688612994e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8502395913004875, + "num_tokens": 179238822.0, + "step": 148980 + }, + { + "entropy": 1.8596518978476524, + "epoch": 0.46185596411541285, + "grad_norm": 9.439504623413086, + "learning_rate": 3.722552735472371e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8619119867682457, + "num_tokens": 179251678.0, + "step": 148990 + }, + { + "entropy": 1.9302584543824195, + "epoch": 0.4618869632404625, + "grad_norm": 9.211128234863281, + "learning_rate": 3.722427814660926e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8613484084606171, + "num_tokens": 179263263.0, + "step": 149000 + }, + { + "entropy": 1.840495379269123, + "epoch": 0.46191796236551225, + "grad_norm": 7.943533897399902, + "learning_rate": 3.722302906424852e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8663021042943001, + "num_tokens": 179275505.0, + "step": 149010 + }, + { + "entropy": 1.9889112055301665, + "epoch": 0.4619489614905619, + "grad_norm": 8.111519813537598, + "learning_rate": 3.7221780107620397e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8498691335320473, + "num_tokens": 179286521.0, + "step": 149020 + }, + { + "entropy": 1.9183252349495887, + "epoch": 0.46197996061561164, + "grad_norm": 7.588090419769287, + "learning_rate": 3.7220531276703815e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8486698940396309, + "num_tokens": 179297674.0, + "step": 149030 + }, + { + "entropy": 1.8954298749566079, + "epoch": 0.4620109597406613, + "grad_norm": 7.996798515319824, + "learning_rate": 3.7219282571477677e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.852338932454586, + "num_tokens": 179309717.0, + "step": 149040 + }, + { + "entropy": 1.902578841149807, + "epoch": 0.46204195886571103, + "grad_norm": 7.982056140899658, + "learning_rate": 3.7218033991920895e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8511950179934502, + "num_tokens": 179321639.0, + "step": 149050 + }, + { + "entropy": 1.8236243352293968, + "epoch": 0.4620729579907607, + "grad_norm": 8.151361465454102, + "learning_rate": 3.7216785538012397e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8600339099764824, + "num_tokens": 179334706.0, + "step": 149060 + }, + { + "entropy": 1.9069883227348328, + "epoch": 0.4621039571158104, + "grad_norm": 8.232033729553223, + "learning_rate": 3.7215537209731107e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.860856780409813, + "num_tokens": 179346482.0, + "step": 149070 + }, + { + "entropy": 1.8350558295845985, + "epoch": 0.4621349562408601, + "grad_norm": 8.054594039916992, + "learning_rate": 3.7214289007055965e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8655603528022766, + "num_tokens": 179359849.0, + "step": 149080 + }, + { + "entropy": 1.972979885339737, + "epoch": 0.4621659553659098, + "grad_norm": 10.824267387390137, + "learning_rate": 3.7213040929965905e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8447522640228271, + "num_tokens": 179371308.0, + "step": 149090 + }, + { + "entropy": 1.8467707887291909, + "epoch": 0.4621969544909595, + "grad_norm": 8.265110969543457, + "learning_rate": 3.721179297843986e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8495004117488861, + "num_tokens": 179384138.0, + "step": 149100 + }, + { + "entropy": 1.9475996479392053, + "epoch": 0.4622279536160092, + "grad_norm": 7.331584453582764, + "learning_rate": 3.721054515245679e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.848460578918457, + "num_tokens": 179395588.0, + "step": 149110 + }, + { + "entropy": 1.879881000518799, + "epoch": 0.4622589527410589, + "grad_norm": 7.338623523712158, + "learning_rate": 3.720929745199565e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8511856839060783, + "num_tokens": 179407656.0, + "step": 149120 + }, + { + "entropy": 1.9637582659721375, + "epoch": 0.4622899518661086, + "grad_norm": 9.248175621032715, + "learning_rate": 3.720804987703538e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8401120364665985, + "num_tokens": 179418775.0, + "step": 149130 + }, + { + "entropy": 1.9078269064426423, + "epoch": 0.4623209509911583, + "grad_norm": 8.055317878723145, + "learning_rate": 3.7206802427554957e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.856049720942974, + "num_tokens": 179431246.0, + "step": 149140 + }, + { + "entropy": 1.880311058461666, + "epoch": 0.46235195011620794, + "grad_norm": 3.6950716972351074, + "learning_rate": 3.7205555103533336e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.862914027273655, + "num_tokens": 179443865.0, + "step": 149150 + }, + { + "entropy": 1.9153502866625787, + "epoch": 0.46238294924125767, + "grad_norm": 3.731424331665039, + "learning_rate": 3.7204307904949507e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.849963067471981, + "num_tokens": 179455278.0, + "step": 149160 + }, + { + "entropy": 1.9533527433872222, + "epoch": 0.46241394836630734, + "grad_norm": 8.858798027038574, + "learning_rate": 3.7203060831782423e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8534232169389725, + "num_tokens": 179467185.0, + "step": 149170 + }, + { + "entropy": 1.9469057649374009, + "epoch": 0.46244494749135706, + "grad_norm": 7.934726715087891, + "learning_rate": 3.7201813884011084e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8529333204030991, + "num_tokens": 179478428.0, + "step": 149180 + }, + { + "entropy": 1.8080351307988167, + "epoch": 0.46247594661640673, + "grad_norm": 7.475351810455322, + "learning_rate": 3.7200567061614475e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8639614179730415, + "num_tokens": 179491614.0, + "step": 149190 + }, + { + "entropy": 1.9022365421056748, + "epoch": 0.46250694574145645, + "grad_norm": 8.010424613952637, + "learning_rate": 3.719932036457157e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8514123097062111, + "num_tokens": 179502942.0, + "step": 149200 + }, + { + "entropy": 2.004026171565056, + "epoch": 0.4625379448665061, + "grad_norm": 12.545236587524414, + "learning_rate": 3.71980737928614e-06, + "loss": 0.5794, + "mean_token_accuracy": 0.830876411497593, + "num_tokens": 179513755.0, + "step": 149210 + }, + { + "entropy": 1.8916426077485085, + "epoch": 0.46256894399155585, + "grad_norm": 5.487705230712891, + "learning_rate": 3.719682734646293e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8689695119857788, + "num_tokens": 179525126.0, + "step": 149220 + }, + { + "entropy": 1.8946526229381562, + "epoch": 0.4625999431166055, + "grad_norm": 9.722243309020996, + "learning_rate": 3.7195581025355194e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8481919586658477, + "num_tokens": 179536892.0, + "step": 149230 + }, + { + "entropy": 1.9386965110898018, + "epoch": 0.46263094224165524, + "grad_norm": 7.750062942504883, + "learning_rate": 3.719433482951718e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8568326279520988, + "num_tokens": 179548377.0, + "step": 149240 + }, + { + "entropy": 1.8785508409142495, + "epoch": 0.4626619413667049, + "grad_norm": 4.4261393547058105, + "learning_rate": 3.7193088758927924e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8603580877184868, + "num_tokens": 179560882.0, + "step": 149250 + }, + { + "entropy": 1.863745103776455, + "epoch": 0.46269294049175463, + "grad_norm": 7.138288974761963, + "learning_rate": 3.7191842813566436e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8604954317212105, + "num_tokens": 179573767.0, + "step": 149260 + }, + { + "entropy": 1.848945789039135, + "epoch": 0.4627239396168043, + "grad_norm": 5.103929042816162, + "learning_rate": 3.7190596993411744e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8594519913196563, + "num_tokens": 179587556.0, + "step": 149270 + }, + { + "entropy": 1.8771667987108231, + "epoch": 0.462754938741854, + "grad_norm": 9.511100769042969, + "learning_rate": 3.7189351298442885e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8482055693864823, + "num_tokens": 179599698.0, + "step": 149280 + }, + { + "entropy": 1.8984940245747566, + "epoch": 0.4627859378669037, + "grad_norm": 3.7801976203918457, + "learning_rate": 3.718810572863889e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8494816675782204, + "num_tokens": 179611306.0, + "step": 149290 + }, + { + "entropy": 1.8457211047410964, + "epoch": 0.4628169369919534, + "grad_norm": 8.371335983276367, + "learning_rate": 3.718686028397879e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8612516298890114, + "num_tokens": 179623572.0, + "step": 149300 + }, + { + "entropy": 1.8669260188937187, + "epoch": 0.4628479361170031, + "grad_norm": 4.057291507720947, + "learning_rate": 3.718561496444166e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8506672963500023, + "num_tokens": 179635689.0, + "step": 149310 + }, + { + "entropy": 2.0177447497844696, + "epoch": 0.4628789352420528, + "grad_norm": 7.82882022857666, + "learning_rate": 3.718436977000652e-06, + "loss": 0.5247, + "mean_token_accuracy": 0.8370867878198623, + "num_tokens": 179646327.0, + "step": 149320 + }, + { + "entropy": 1.9195616245269775, + "epoch": 0.4629099343671025, + "grad_norm": 4.400636196136475, + "learning_rate": 3.7183124700652433e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8610096588730812, + "num_tokens": 179658283.0, + "step": 149330 + }, + { + "entropy": 1.9372329905629158, + "epoch": 0.4629409334921522, + "grad_norm": 7.58730936050415, + "learning_rate": 3.7181879756358472e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8437316343188286, + "num_tokens": 179670274.0, + "step": 149340 + }, + { + "entropy": 1.8574361830949784, + "epoch": 0.4629719326172019, + "grad_norm": 6.498873233795166, + "learning_rate": 3.7180634937103697e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8578757628798485, + "num_tokens": 179682351.0, + "step": 149350 + }, + { + "entropy": 1.9330038219690322, + "epoch": 0.4630029317422516, + "grad_norm": 8.56723690032959, + "learning_rate": 3.717939024286717e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8528059482574463, + "num_tokens": 179693716.0, + "step": 149360 + }, + { + "entropy": 1.9517970651388168, + "epoch": 0.46303393086730127, + "grad_norm": 7.569751262664795, + "learning_rate": 3.7178145673627976e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8599812477827072, + "num_tokens": 179705054.0, + "step": 149370 + }, + { + "entropy": 1.8608211636543275, + "epoch": 0.463064929992351, + "grad_norm": 4.166712760925293, + "learning_rate": 3.717690122936518e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8539978817105294, + "num_tokens": 179718580.0, + "step": 149380 + }, + { + "entropy": 1.797726885974407, + "epoch": 0.46309592911740066, + "grad_norm": 8.749974250793457, + "learning_rate": 3.7175656910057894e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8570615857839584, + "num_tokens": 179732381.0, + "step": 149390 + }, + { + "entropy": 1.863544289022684, + "epoch": 0.46312692824245033, + "grad_norm": 6.951138496398926, + "learning_rate": 3.717441271568518e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8617607593536377, + "num_tokens": 179745055.0, + "step": 149400 + }, + { + "entropy": 1.8385921865701675, + "epoch": 0.46315792736750006, + "grad_norm": 3.46244215965271, + "learning_rate": 3.7173168646226155e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8700759828090667, + "num_tokens": 179757818.0, + "step": 149410 + }, + { + "entropy": 1.88816240131855, + "epoch": 0.4631889264925497, + "grad_norm": 6.927772521972656, + "learning_rate": 3.7171924701659907e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8400983780622482, + "num_tokens": 179770718.0, + "step": 149420 + }, + { + "entropy": 1.9157932326197624, + "epoch": 0.46321992561759945, + "grad_norm": 6.942224025726318, + "learning_rate": 3.717068088196554e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8620764970779419, + "num_tokens": 179782310.0, + "step": 149430 + }, + { + "entropy": 1.925161102414131, + "epoch": 0.4632509247426491, + "grad_norm": 6.985175609588623, + "learning_rate": 3.7169437187122166e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8568544924259186, + "num_tokens": 179793811.0, + "step": 149440 + }, + { + "entropy": 1.8122023433446883, + "epoch": 0.46328192386769884, + "grad_norm": 8.966583251953125, + "learning_rate": 3.71681936171089e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8570965170860291, + "num_tokens": 179807530.0, + "step": 149450 + }, + { + "entropy": 1.8448025688529015, + "epoch": 0.4633129229927485, + "grad_norm": 8.287588119506836, + "learning_rate": 3.716695017190486e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8626005664467812, + "num_tokens": 179820550.0, + "step": 149460 + }, + { + "entropy": 1.963685867190361, + "epoch": 0.46334392211779823, + "grad_norm": 7.667242050170898, + "learning_rate": 3.716570685148917e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.85397337526083, + "num_tokens": 179830999.0, + "step": 149470 + }, + { + "entropy": 1.8665792480111123, + "epoch": 0.4633749212428479, + "grad_norm": 8.30051040649414, + "learning_rate": 3.716446365584096e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8599151030182839, + "num_tokens": 179843005.0, + "step": 149480 + }, + { + "entropy": 1.8342208310961723, + "epoch": 0.46340592036789763, + "grad_norm": 3.5341451168060303, + "learning_rate": 3.7163220584939363e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8594845876097679, + "num_tokens": 179855385.0, + "step": 149490 + }, + { + "entropy": 1.9173177301883697, + "epoch": 0.4634369194929473, + "grad_norm": 8.4180326461792, + "learning_rate": 3.7161977638763523e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8482234224677085, + "num_tokens": 179867801.0, + "step": 149500 + }, + { + "entropy": 1.9616335391998292, + "epoch": 0.463467918617997, + "grad_norm": 6.9208221435546875, + "learning_rate": 3.716073481729258e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.8467522487044334, + "num_tokens": 179879398.0, + "step": 149510 + }, + { + "entropy": 1.8291648626327515, + "epoch": 0.4634989177430467, + "grad_norm": 3.6951959133148193, + "learning_rate": 3.715949212050568e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8521149665117264, + "num_tokens": 179892903.0, + "step": 149520 + }, + { + "entropy": 1.9397112756967545, + "epoch": 0.4635299168680964, + "grad_norm": 7.79329252243042, + "learning_rate": 3.715824954838198e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8460395872592926, + "num_tokens": 179903789.0, + "step": 149530 + }, + { + "entropy": 1.8958303049206733, + "epoch": 0.4635609159931461, + "grad_norm": 9.17414665222168, + "learning_rate": 3.715700710090064e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8583477824926377, + "num_tokens": 179915857.0, + "step": 149540 + }, + { + "entropy": 1.934301419556141, + "epoch": 0.4635919151181958, + "grad_norm": 11.262062072753906, + "learning_rate": 3.7155764778040813e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8361470863223076, + "num_tokens": 179927748.0, + "step": 149550 + }, + { + "entropy": 2.001458024978638, + "epoch": 0.4636229142432455, + "grad_norm": 9.550740242004395, + "learning_rate": 3.7154522579781682e-06, + "loss": 0.531, + "mean_token_accuracy": 0.8417502701282501, + "num_tokens": 179938659.0, + "step": 149560 + }, + { + "entropy": 1.838100890815258, + "epoch": 0.4636539133682952, + "grad_norm": 7.261234760284424, + "learning_rate": 3.715328050610241e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8724177911877632, + "num_tokens": 179951437.0, + "step": 149570 + }, + { + "entropy": 1.8873604014515877, + "epoch": 0.46368491249334487, + "grad_norm": 4.315876007080078, + "learning_rate": 3.715203855698218e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.854177550971508, + "num_tokens": 179964006.0, + "step": 149580 + }, + { + "entropy": 1.938150754570961, + "epoch": 0.4637159116183946, + "grad_norm": 8.612800598144531, + "learning_rate": 3.715079673240017e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8499852314591407, + "num_tokens": 179975104.0, + "step": 149590 + }, + { + "entropy": 1.9041712909936905, + "epoch": 0.46374691074344426, + "grad_norm": 3.8847134113311768, + "learning_rate": 3.714955503233557e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8547717839479446, + "num_tokens": 179987110.0, + "step": 149600 + }, + { + "entropy": 1.951756013929844, + "epoch": 0.463777909868494, + "grad_norm": 6.442052364349365, + "learning_rate": 3.714831345676757e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8537251800298691, + "num_tokens": 179998453.0, + "step": 149610 + }, + { + "entropy": 1.8288054898381234, + "epoch": 0.46380890899354366, + "grad_norm": 6.619274139404297, + "learning_rate": 3.714707200567537e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8710165083408355, + "num_tokens": 180010976.0, + "step": 149620 + }, + { + "entropy": 1.9353380858898164, + "epoch": 0.4638399081185934, + "grad_norm": 7.440566062927246, + "learning_rate": 3.7145830679038177e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8452064722776413, + "num_tokens": 180022973.0, + "step": 149630 + }, + { + "entropy": 1.857330885529518, + "epoch": 0.46387090724364305, + "grad_norm": 5.837486267089844, + "learning_rate": 3.714458947683519e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8586672931909561, + "num_tokens": 180035450.0, + "step": 149640 + }, + { + "entropy": 1.9173126935958862, + "epoch": 0.4639019063686927, + "grad_norm": 9.367176055908203, + "learning_rate": 3.714334839904563e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8661222815513611, + "num_tokens": 180046863.0, + "step": 149650 + }, + { + "entropy": 1.8740798771381377, + "epoch": 0.46393290549374244, + "grad_norm": 8.433083534240723, + "learning_rate": 3.7142107445648706e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8610092878341675, + "num_tokens": 180059176.0, + "step": 149660 + }, + { + "entropy": 1.9137323945760727, + "epoch": 0.4639639046187921, + "grad_norm": 8.501178741455078, + "learning_rate": 3.714086661662364e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8547899782657623, + "num_tokens": 180071303.0, + "step": 149670 + }, + { + "entropy": 1.8415597707033158, + "epoch": 0.46399490374384184, + "grad_norm": 9.41646957397461, + "learning_rate": 3.713962591194966e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8580908700823784, + "num_tokens": 180084425.0, + "step": 149680 + }, + { + "entropy": 1.8512480854988098, + "epoch": 0.4640259028688915, + "grad_norm": 4.0245442390441895, + "learning_rate": 3.7138385331606e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8679956078529358, + "num_tokens": 180097243.0, + "step": 149690 + }, + { + "entropy": 1.908530332148075, + "epoch": 0.46405690199394123, + "grad_norm": 8.557743072509766, + "learning_rate": 3.713714487557189e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.852016007900238, + "num_tokens": 180108962.0, + "step": 149700 + }, + { + "entropy": 1.8267366781830787, + "epoch": 0.4640879011189909, + "grad_norm": 4.43588924407959, + "learning_rate": 3.713590454382658e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8512490093708038, + "num_tokens": 180121542.0, + "step": 149710 + }, + { + "entropy": 1.8044231168925762, + "epoch": 0.4641189002440406, + "grad_norm": 8.573354721069336, + "learning_rate": 3.71346643363493e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8628219202160835, + "num_tokens": 180134671.0, + "step": 149720 + }, + { + "entropy": 1.9123155683279038, + "epoch": 0.4641498993690903, + "grad_norm": 3.89583158493042, + "learning_rate": 3.7133424253119323e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8472074523568154, + "num_tokens": 180146781.0, + "step": 149730 + }, + { + "entropy": 1.9491783201694488, + "epoch": 0.46418089849414, + "grad_norm": 7.883488655090332, + "learning_rate": 3.713218429411589e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.8498224407434464, + "num_tokens": 180158375.0, + "step": 149740 + }, + { + "entropy": 1.8938617929816246, + "epoch": 0.4642118976191897, + "grad_norm": 7.663883209228516, + "learning_rate": 3.713094445931827e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8473029389977456, + "num_tokens": 180170450.0, + "step": 149750 + }, + { + "entropy": 1.8819366082549096, + "epoch": 0.4642428967442394, + "grad_norm": 8.43736457824707, + "learning_rate": 3.712970474870572e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8485167771577835, + "num_tokens": 180182097.0, + "step": 149760 + }, + { + "entropy": 1.8832198321819305, + "epoch": 0.4642738958692891, + "grad_norm": 6.826776504516602, + "learning_rate": 3.7128465162257517e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8600822359323501, + "num_tokens": 180194184.0, + "step": 149770 + }, + { + "entropy": 1.9311166375875473, + "epoch": 0.4643048949943388, + "grad_norm": 8.055139541625977, + "learning_rate": 3.712722569995293e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.850020831823349, + "num_tokens": 180205786.0, + "step": 149780 + }, + { + "entropy": 1.9121816590428353, + "epoch": 0.46433589411938847, + "grad_norm": 8.332880973815918, + "learning_rate": 3.712598636177124e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8503255888819694, + "num_tokens": 180218001.0, + "step": 149790 + }, + { + "entropy": 1.9609209045767784, + "epoch": 0.4643668932444382, + "grad_norm": 4.948374271392822, + "learning_rate": 3.7124747147691733e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8423975124955178, + "num_tokens": 180229921.0, + "step": 149800 + }, + { + "entropy": 1.9410631626844406, + "epoch": 0.46439789236948786, + "grad_norm": 8.004473686218262, + "learning_rate": 3.71235080576937e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8438965693116188, + "num_tokens": 180242344.0, + "step": 149810 + }, + { + "entropy": 1.8821017548441887, + "epoch": 0.4644288914945376, + "grad_norm": 5.065506935119629, + "learning_rate": 3.7122269091756436e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8540616199374199, + "num_tokens": 180254354.0, + "step": 149820 + }, + { + "entropy": 1.9584839269518852, + "epoch": 0.46445989061958726, + "grad_norm": 7.649283409118652, + "learning_rate": 3.7121030249859243e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8388520136475563, + "num_tokens": 180265796.0, + "step": 149830 + }, + { + "entropy": 1.8087536610662938, + "epoch": 0.464490889744637, + "grad_norm": 4.455826282501221, + "learning_rate": 3.7119791531981408e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8636887416243553, + "num_tokens": 180279611.0, + "step": 149840 + }, + { + "entropy": 1.9264666765928269, + "epoch": 0.46452188886968665, + "grad_norm": 7.5778985023498535, + "learning_rate": 3.711855293810227e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8526265174150467, + "num_tokens": 180290829.0, + "step": 149850 + }, + { + "entropy": 1.9072339527308941, + "epoch": 0.4645528879947364, + "grad_norm": 6.993530750274658, + "learning_rate": 3.711731446820111e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8573938056826591, + "num_tokens": 180303038.0, + "step": 149860 + }, + { + "entropy": 1.9101138547062875, + "epoch": 0.46458388711978604, + "grad_norm": 8.588576316833496, + "learning_rate": 3.7116076122257273e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8481775879859924, + "num_tokens": 180314807.0, + "step": 149870 + }, + { + "entropy": 1.7950350888073445, + "epoch": 0.4646148862448357, + "grad_norm": 4.491353511810303, + "learning_rate": 3.7114837900250068e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8682404264807702, + "num_tokens": 180328419.0, + "step": 149880 + }, + { + "entropy": 1.9566629469394683, + "epoch": 0.46464588536988544, + "grad_norm": 7.9985198974609375, + "learning_rate": 3.7113599802158823e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8541330322623253, + "num_tokens": 180339648.0, + "step": 149890 + }, + { + "entropy": 1.9067189604043961, + "epoch": 0.4646768844949351, + "grad_norm": 7.05324649810791, + "learning_rate": 3.711236182796288e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8619942396879197, + "num_tokens": 180351473.0, + "step": 149900 + }, + { + "entropy": 1.9476832181215287, + "epoch": 0.46470788361998483, + "grad_norm": 3.14349365234375, + "learning_rate": 3.711112397764157e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8488188147544861, + "num_tokens": 180362723.0, + "step": 149910 + }, + { + "entropy": 1.7743832789361478, + "epoch": 0.4647388827450345, + "grad_norm": 7.153512001037598, + "learning_rate": 3.7109886251174236e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8677819207310676, + "num_tokens": 180376546.0, + "step": 149920 + }, + { + "entropy": 1.8927229553461076, + "epoch": 0.4647698818700842, + "grad_norm": 2.545341968536377, + "learning_rate": 3.710864864854023e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8583212435245514, + "num_tokens": 180388347.0, + "step": 149930 + }, + { + "entropy": 1.8385560512542725, + "epoch": 0.4648008809951339, + "grad_norm": 7.90435791015625, + "learning_rate": 3.71074111697189e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8564158409833909, + "num_tokens": 180400665.0, + "step": 149940 + }, + { + "entropy": 1.8350412741303443, + "epoch": 0.4648318801201836, + "grad_norm": 4.03927755355835, + "learning_rate": 3.7106173814689606e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8569675713777543, + "num_tokens": 180412723.0, + "step": 149950 + }, + { + "entropy": 1.8664705529808998, + "epoch": 0.4648628792452333, + "grad_norm": 2.9268198013305664, + "learning_rate": 3.710493658343171e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.8416856169700623, + "num_tokens": 180426131.0, + "step": 149960 + }, + { + "entropy": 1.922463881969452, + "epoch": 0.464893878370283, + "grad_norm": 7.96071195602417, + "learning_rate": 3.7103699475924576e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.844486691057682, + "num_tokens": 180437621.0, + "step": 149970 + }, + { + "entropy": 1.8738661885261536, + "epoch": 0.4649248774953327, + "grad_norm": 7.6994404792785645, + "learning_rate": 3.710246249214757e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.859145550429821, + "num_tokens": 180449886.0, + "step": 149980 + }, + { + "entropy": 1.854253427684307, + "epoch": 0.4649558766203824, + "grad_norm": 3.478299856185913, + "learning_rate": 3.7101225632080085e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8661620482802391, + "num_tokens": 180461919.0, + "step": 149990 + }, + { + "entropy": 1.9357280641794206, + "epoch": 0.46498687574543207, + "grad_norm": 10.878789901733398, + "learning_rate": 3.709998889570149e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8454708248376847, + "num_tokens": 180473902.0, + "step": 150000 + }, + { + "entropy": 1.9131696656346322, + "epoch": 0.4650178748704818, + "grad_norm": 8.852290153503418, + "learning_rate": 3.709875228299118e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.8401144355535507, + "num_tokens": 180484961.0, + "step": 150010 + }, + { + "entropy": 1.8424381971359254, + "epoch": 0.46504887399553146, + "grad_norm": 8.186081886291504, + "learning_rate": 3.709751579392853e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8486584141850472, + "num_tokens": 180497785.0, + "step": 150020 + }, + { + "entropy": 1.8932843878865242, + "epoch": 0.4650798731205812, + "grad_norm": 6.866567134857178, + "learning_rate": 3.709627942849295e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.869660022854805, + "num_tokens": 180509401.0, + "step": 150030 + }, + { + "entropy": 1.9485246509313583, + "epoch": 0.46511087224563086, + "grad_norm": 8.80336856842041, + "learning_rate": 3.7095043186663837e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8586052864789963, + "num_tokens": 180520431.0, + "step": 150040 + }, + { + "entropy": 1.914072147011757, + "epoch": 0.4651418713706806, + "grad_norm": 14.637115478515625, + "learning_rate": 3.709380706842059e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8486096784472466, + "num_tokens": 180532404.0, + "step": 150050 + }, + { + "entropy": 1.7987019002437592, + "epoch": 0.46517287049573025, + "grad_norm": 2.0740082263946533, + "learning_rate": 3.709257107374263e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8616669073700904, + "num_tokens": 180546487.0, + "step": 150060 + }, + { + "entropy": 1.8739713817834853, + "epoch": 0.46520386962078, + "grad_norm": 8.125478744506836, + "learning_rate": 3.709133520260937e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8655928507447243, + "num_tokens": 180558526.0, + "step": 150070 + }, + { + "entropy": 1.9065773144364357, + "epoch": 0.46523486874582964, + "grad_norm": 8.001792907714844, + "learning_rate": 3.7090099455000217e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8476300582289695, + "num_tokens": 180571010.0, + "step": 150080 + }, + { + "entropy": 1.8418815195560456, + "epoch": 0.46526586787087937, + "grad_norm": 4.2682318687438965, + "learning_rate": 3.70888638308946e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.850444746017456, + "num_tokens": 180584015.0, + "step": 150090 + }, + { + "entropy": 1.775348497927189, + "epoch": 0.46529686699592904, + "grad_norm": 2.5660839080810547, + "learning_rate": 3.7087628330271962e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8672156199812889, + "num_tokens": 180596841.0, + "step": 150100 + }, + { + "entropy": 1.8526948064565658, + "epoch": 0.46532786612097876, + "grad_norm": 9.855128288269043, + "learning_rate": 3.708639295311173e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8536330834031105, + "num_tokens": 180609137.0, + "step": 150110 + }, + { + "entropy": 1.9509498670697212, + "epoch": 0.46535886524602843, + "grad_norm": 5.597041130065918, + "learning_rate": 3.708515769939334e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8556590229272842, + "num_tokens": 180620230.0, + "step": 150120 + }, + { + "entropy": 1.9075401276350021, + "epoch": 0.4653898643710781, + "grad_norm": 8.79906940460205, + "learning_rate": 3.708392256909624e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.846394631266594, + "num_tokens": 180631753.0, + "step": 150130 + }, + { + "entropy": 1.9188668608665467, + "epoch": 0.4654208634961278, + "grad_norm": 7.841006755828857, + "learning_rate": 3.7082687562199866e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8531607300043106, + "num_tokens": 180642477.0, + "step": 150140 + }, + { + "entropy": 1.794436551630497, + "epoch": 0.4654518626211775, + "grad_norm": 8.100784301757812, + "learning_rate": 3.708145267868368e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8534730911254883, + "num_tokens": 180655800.0, + "step": 150150 + }, + { + "entropy": 1.8831926614046097, + "epoch": 0.4654828617462272, + "grad_norm": 6.693047046661377, + "learning_rate": 3.7080217918527147e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8538334533572197, + "num_tokens": 180667072.0, + "step": 150160 + }, + { + "entropy": 1.8978948533535003, + "epoch": 0.4655138608712769, + "grad_norm": 8.671180725097656, + "learning_rate": 3.707898328170972e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.8394773602485657, + "num_tokens": 180678789.0, + "step": 150170 + }, + { + "entropy": 1.9485942766070365, + "epoch": 0.4655448599963266, + "grad_norm": 9.572785377502441, + "learning_rate": 3.707774876821087e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8439914211630821, + "num_tokens": 180690154.0, + "step": 150180 + }, + { + "entropy": 1.952707216143608, + "epoch": 0.4655758591213763, + "grad_norm": 8.8486967086792, + "learning_rate": 3.707651437801007e-06, + "loss": 0.513, + "mean_token_accuracy": 0.8544019669294357, + "num_tokens": 180701768.0, + "step": 150190 + }, + { + "entropy": 1.9008170261979103, + "epoch": 0.465606858246426, + "grad_norm": 8.073139190673828, + "learning_rate": 3.7075280111086796e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8544175952672959, + "num_tokens": 180713142.0, + "step": 150200 + }, + { + "entropy": 1.9324932008981706, + "epoch": 0.46563785737147567, + "grad_norm": 7.887551307678223, + "learning_rate": 3.707404596742053e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.865489798784256, + "num_tokens": 180724698.0, + "step": 150210 + }, + { + "entropy": 1.8756032541394234, + "epoch": 0.4656688564965254, + "grad_norm": 7.615468978881836, + "learning_rate": 3.707281194699076e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8480568066239357, + "num_tokens": 180737683.0, + "step": 150220 + }, + { + "entropy": 1.8858239620923996, + "epoch": 0.46569985562157507, + "grad_norm": 7.510842800140381, + "learning_rate": 3.707157804977698e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8432325854897499, + "num_tokens": 180749594.0, + "step": 150230 + }, + { + "entropy": 1.9456377267837524, + "epoch": 0.4657308547466248, + "grad_norm": 6.716031074523926, + "learning_rate": 3.7070344275758684e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8643984943628311, + "num_tokens": 180761340.0, + "step": 150240 + }, + { + "entropy": 1.9459609061479568, + "epoch": 0.46576185387167446, + "grad_norm": 8.735827445983887, + "learning_rate": 3.706911062491537e-06, + "loss": 0.51, + "mean_token_accuracy": 0.8414299875497818, + "num_tokens": 180773140.0, + "step": 150250 + }, + { + "entropy": 1.8788623332977294, + "epoch": 0.4657928529967242, + "grad_norm": 6.6167497634887695, + "learning_rate": 3.7067877097226546e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8643033638596535, + "num_tokens": 180785039.0, + "step": 150260 + }, + { + "entropy": 1.7132162213325501, + "epoch": 0.46582385212177385, + "grad_norm": 3.7535245418548584, + "learning_rate": 3.706664369267172e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8678473249077797, + "num_tokens": 180799943.0, + "step": 150270 + }, + { + "entropy": 1.9511631667613982, + "epoch": 0.4658548512468236, + "grad_norm": 8.460371017456055, + "learning_rate": 3.7065410411230414e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8503921389579773, + "num_tokens": 180810998.0, + "step": 150280 + }, + { + "entropy": 1.8199073910713195, + "epoch": 0.46588585037187324, + "grad_norm": 8.949850082397461, + "learning_rate": 3.706417725288214e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8588975444436073, + "num_tokens": 180824024.0, + "step": 150290 + }, + { + "entropy": 1.871119175851345, + "epoch": 0.46591684949692297, + "grad_norm": 8.456604957580566, + "learning_rate": 3.7062944217606428e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8629543900489807, + "num_tokens": 180835815.0, + "step": 150300 + }, + { + "entropy": 1.8718924656510354, + "epoch": 0.46594784862197264, + "grad_norm": 9.544185638427734, + "learning_rate": 3.706171130538281e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8634023860096931, + "num_tokens": 180847428.0, + "step": 150310 + }, + { + "entropy": 1.8509297877550126, + "epoch": 0.46597884774702236, + "grad_norm": 7.217175483703613, + "learning_rate": 3.7060478516190818e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8697842806577682, + "num_tokens": 180859536.0, + "step": 150320 + }, + { + "entropy": 1.8995045498013496, + "epoch": 0.46600984687207203, + "grad_norm": 7.621474266052246, + "learning_rate": 3.7059245850009987e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8520592898130417, + "num_tokens": 180872519.0, + "step": 150330 + }, + { + "entropy": 1.820411132276058, + "epoch": 0.46604084599712176, + "grad_norm": 7.533627033233643, + "learning_rate": 3.7058013306819874e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.866758693754673, + "num_tokens": 180885396.0, + "step": 150340 + }, + { + "entropy": 1.9017600163817405, + "epoch": 0.4660718451221714, + "grad_norm": 7.15775728225708, + "learning_rate": 3.7056780886600014e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8558528557419777, + "num_tokens": 180897325.0, + "step": 150350 + }, + { + "entropy": 1.9121145501732826, + "epoch": 0.46610284424722115, + "grad_norm": 4.693599700927734, + "learning_rate": 3.705554858932996e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8516936868429184, + "num_tokens": 180909047.0, + "step": 150360 + }, + { + "entropy": 1.9288795188069343, + "epoch": 0.4661338433722708, + "grad_norm": 7.500865936279297, + "learning_rate": 3.7054316414989283e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8530079022049903, + "num_tokens": 180920731.0, + "step": 150370 + }, + { + "entropy": 1.9719238847494125, + "epoch": 0.4661648424973205, + "grad_norm": 9.861359596252441, + "learning_rate": 3.7053084363557534e-06, + "loss": 0.5333, + "mean_token_accuracy": 0.8317109391093254, + "num_tokens": 180931182.0, + "step": 150380 + }, + { + "entropy": 1.9200509548187257, + "epoch": 0.4661958416223702, + "grad_norm": 3.9624438285827637, + "learning_rate": 3.705185243501429e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8530553847551345, + "num_tokens": 180942651.0, + "step": 150390 + }, + { + "entropy": 1.902939459681511, + "epoch": 0.4662268407474199, + "grad_norm": 10.163533210754395, + "learning_rate": 3.705062062933911e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8538908958435059, + "num_tokens": 180954042.0, + "step": 150400 + }, + { + "entropy": 1.869973176717758, + "epoch": 0.4662578398724696, + "grad_norm": 9.920553207397461, + "learning_rate": 3.7049388946511593e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8528695210814476, + "num_tokens": 180966670.0, + "step": 150410 + }, + { + "entropy": 1.9353784874081612, + "epoch": 0.4662888389975193, + "grad_norm": 7.96409273147583, + "learning_rate": 3.7048157386511297e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8458786249160767, + "num_tokens": 180978919.0, + "step": 150420 + }, + { + "entropy": 1.8363364577293395, + "epoch": 0.466319838122569, + "grad_norm": 8.377767562866211, + "learning_rate": 3.7046925949317823e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8574515968561173, + "num_tokens": 180992419.0, + "step": 150430 + }, + { + "entropy": 1.8962649412453174, + "epoch": 0.46635083724761867, + "grad_norm": 10.839685440063477, + "learning_rate": 3.7045694634910766e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.848162266612053, + "num_tokens": 181004918.0, + "step": 150440 + }, + { + "entropy": 1.8515453770756722, + "epoch": 0.4663818363726684, + "grad_norm": 4.3565354347229, + "learning_rate": 3.7044463443269713e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8653715640306473, + "num_tokens": 181018477.0, + "step": 150450 + }, + { + "entropy": 1.9294961079955102, + "epoch": 0.46641283549771806, + "grad_norm": 8.185575485229492, + "learning_rate": 3.704323237437427e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8540726110339165, + "num_tokens": 181030636.0, + "step": 150460 + }, + { + "entropy": 1.987247222661972, + "epoch": 0.4664438346227678, + "grad_norm": 10.924062728881836, + "learning_rate": 3.704200142820404e-06, + "loss": 0.5302, + "mean_token_accuracy": 0.8374628499150276, + "num_tokens": 181041619.0, + "step": 150470 + }, + { + "entropy": 1.9206083595752717, + "epoch": 0.46647483374781745, + "grad_norm": 7.948111057281494, + "learning_rate": 3.7040770604738633e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8553918346762657, + "num_tokens": 181053506.0, + "step": 150480 + }, + { + "entropy": 1.9235239014029504, + "epoch": 0.4665058328728672, + "grad_norm": 8.41197395324707, + "learning_rate": 3.703953990395767e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8539768025279045, + "num_tokens": 181064814.0, + "step": 150490 + }, + { + "entropy": 1.9210913270711898, + "epoch": 0.46653683199791685, + "grad_norm": 7.933341979980469, + "learning_rate": 3.7038309325840766e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8557110175490379, + "num_tokens": 181075834.0, + "step": 150500 + }, + { + "entropy": 1.904814685881138, + "epoch": 0.46656783112296657, + "grad_norm": 7.137592315673828, + "learning_rate": 3.703707887036754e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8601122245192527, + "num_tokens": 181087919.0, + "step": 150510 + }, + { + "entropy": 1.8629762142896653, + "epoch": 0.46659883024801624, + "grad_norm": 4.012499809265137, + "learning_rate": 3.7035848537517633e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8468594029545784, + "num_tokens": 181099822.0, + "step": 150520 + }, + { + "entropy": 1.8338840797543525, + "epoch": 0.46662982937306596, + "grad_norm": 8.058430671691895, + "learning_rate": 3.703461832727068e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.861028978228569, + "num_tokens": 181112410.0, + "step": 150530 + }, + { + "entropy": 1.9143824577331543, + "epoch": 0.46666082849811563, + "grad_norm": 7.549951076507568, + "learning_rate": 3.7033388239606303e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8437608435750008, + "num_tokens": 181123938.0, + "step": 150540 + }, + { + "entropy": 1.9112285375595093, + "epoch": 0.46669182762316536, + "grad_norm": 7.177019119262695, + "learning_rate": 3.7032158274504173e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8438215360045433, + "num_tokens": 181134127.0, + "step": 150550 + }, + { + "entropy": 1.8974832728505135, + "epoch": 0.466722826748215, + "grad_norm": 8.187768936157227, + "learning_rate": 3.7030928431943912e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8517803817987442, + "num_tokens": 181145556.0, + "step": 150560 + }, + { + "entropy": 1.9428151741623878, + "epoch": 0.46675382587326475, + "grad_norm": 9.65618896484375, + "learning_rate": 3.702969871190518e-06, + "loss": 0.489, + "mean_token_accuracy": 0.843976517021656, + "num_tokens": 181157264.0, + "step": 150570 + }, + { + "entropy": 1.9293995440006255, + "epoch": 0.4667848249983144, + "grad_norm": 7.708983898162842, + "learning_rate": 3.7028469114367653e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8495089873671532, + "num_tokens": 181168528.0, + "step": 150580 + }, + { + "entropy": 1.8680501088500023, + "epoch": 0.46681582412336414, + "grad_norm": 8.186799049377441, + "learning_rate": 3.702723963931097e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8557474792003632, + "num_tokens": 181180264.0, + "step": 150590 + }, + { + "entropy": 1.8408723667263984, + "epoch": 0.4668468232484138, + "grad_norm": 2.6501667499542236, + "learning_rate": 3.7026010286714814e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.8513808265328408, + "num_tokens": 181193075.0, + "step": 150600 + }, + { + "entropy": 1.9793402075767517, + "epoch": 0.46687782237346354, + "grad_norm": 7.212341785430908, + "learning_rate": 3.702478105655884e-06, + "loss": 0.5262, + "mean_token_accuracy": 0.8411362200975419, + "num_tokens": 181203773.0, + "step": 150610 + }, + { + "entropy": 1.937410406768322, + "epoch": 0.4669088214985132, + "grad_norm": 8.84796142578125, + "learning_rate": 3.702355194882275e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8421960338950157, + "num_tokens": 181214991.0, + "step": 150620 + }, + { + "entropy": 1.8487501621246338, + "epoch": 0.4669398206235629, + "grad_norm": 3.367696523666382, + "learning_rate": 3.7022322963486203e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8592374339699745, + "num_tokens": 181228067.0, + "step": 150630 + }, + { + "entropy": 1.8551168724894525, + "epoch": 0.4669708197486126, + "grad_norm": 3.5218849182128906, + "learning_rate": 3.7021094100528897e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8567650452256202, + "num_tokens": 181240939.0, + "step": 150640 + }, + { + "entropy": 1.8298555374145509, + "epoch": 0.46700181887366227, + "grad_norm": 3.4528143405914307, + "learning_rate": 3.701986535993051e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8601305693387985, + "num_tokens": 181254113.0, + "step": 150650 + }, + { + "entropy": 1.9451593279838562, + "epoch": 0.467032817998712, + "grad_norm": 10.91589069366455, + "learning_rate": 3.7018636741670766e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.852519790828228, + "num_tokens": 181265250.0, + "step": 150660 + }, + { + "entropy": 1.9798747926950455, + "epoch": 0.46706381712376166, + "grad_norm": 9.844844818115234, + "learning_rate": 3.701740824572933e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8470689475536346, + "num_tokens": 181275900.0, + "step": 150670 + }, + { + "entropy": 1.8553171455860138, + "epoch": 0.4670948162488114, + "grad_norm": 8.106888771057129, + "learning_rate": 3.7016179872085933e-06, + "loss": 0.5243, + "mean_token_accuracy": 0.8486344560980796, + "num_tokens": 181288591.0, + "step": 150680 + }, + { + "entropy": 1.9101608902215959, + "epoch": 0.46712581537386105, + "grad_norm": 5.4185075759887695, + "learning_rate": 3.7014951620720275e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8533428713679314, + "num_tokens": 181300555.0, + "step": 150690 + }, + { + "entropy": 1.8143582224845887, + "epoch": 0.4671568144989108, + "grad_norm": 8.726012229919434, + "learning_rate": 3.7013723491612075e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8616741225123405, + "num_tokens": 181313432.0, + "step": 150700 + }, + { + "entropy": 1.8796430230140686, + "epoch": 0.46718781362396045, + "grad_norm": 8.185118675231934, + "learning_rate": 3.701249548474104e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8626327365636826, + "num_tokens": 181325140.0, + "step": 150710 + }, + { + "entropy": 1.9016419626772403, + "epoch": 0.46721881274901017, + "grad_norm": 8.940559387207031, + "learning_rate": 3.7011267600086907e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8517810702323914, + "num_tokens": 181337140.0, + "step": 150720 + }, + { + "entropy": 1.9060870558023453, + "epoch": 0.46724981187405984, + "grad_norm": 7.316099643707275, + "learning_rate": 3.7010039837629403e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8516420558094978, + "num_tokens": 181348828.0, + "step": 150730 + }, + { + "entropy": 1.8336908236145972, + "epoch": 0.46728081099910956, + "grad_norm": 8.662108421325684, + "learning_rate": 3.700881219734826e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8569240808486939, + "num_tokens": 181360993.0, + "step": 150740 + }, + { + "entropy": 1.867463281750679, + "epoch": 0.46731181012415923, + "grad_norm": 4.231754302978516, + "learning_rate": 3.7007584679223206e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8555324092507363, + "num_tokens": 181373461.0, + "step": 150750 + }, + { + "entropy": 1.9317969232797623, + "epoch": 0.46734280924920896, + "grad_norm": 7.876861572265625, + "learning_rate": 3.7006357283234e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8606542080640793, + "num_tokens": 181383899.0, + "step": 150760 + }, + { + "entropy": 2.0077045977115633, + "epoch": 0.4673738083742586, + "grad_norm": 7.578515529632568, + "learning_rate": 3.700513000936038e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8427977785468102, + "num_tokens": 181394945.0, + "step": 150770 + }, + { + "entropy": 1.898098950833082, + "epoch": 0.46740480749930835, + "grad_norm": 9.076956748962402, + "learning_rate": 3.7003902857582097e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8464702069759369, + "num_tokens": 181406939.0, + "step": 150780 + }, + { + "entropy": 1.8830089807510375, + "epoch": 0.467435806624358, + "grad_norm": 8.655149459838867, + "learning_rate": 3.7002675827878913e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8559210911393166, + "num_tokens": 181419739.0, + "step": 150790 + }, + { + "entropy": 1.9508404403924942, + "epoch": 0.46746680574940774, + "grad_norm": 7.471928596496582, + "learning_rate": 3.7001448920230598e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8527390614151955, + "num_tokens": 181430494.0, + "step": 150800 + }, + { + "entropy": 1.9231002733111382, + "epoch": 0.4674978048744574, + "grad_norm": 8.397159576416016, + "learning_rate": 3.70002221346169e-06, + "loss": 0.5567, + "mean_token_accuracy": 0.8290016546845436, + "num_tokens": 181441868.0, + "step": 150810 + }, + { + "entropy": 1.9228404954075813, + "epoch": 0.46752880399950714, + "grad_norm": 4.443188190460205, + "learning_rate": 3.69989954710176e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.847866815328598, + "num_tokens": 181453921.0, + "step": 150820 + }, + { + "entropy": 1.9264577642083167, + "epoch": 0.4675598031245568, + "grad_norm": 8.896585464477539, + "learning_rate": 3.699776892941247e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8456339925527573, + "num_tokens": 181465120.0, + "step": 150830 + }, + { + "entropy": 1.9083795070648193, + "epoch": 0.46759080224960653, + "grad_norm": 8.588702201843262, + "learning_rate": 3.6996542509781293e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8652929633855819, + "num_tokens": 181476510.0, + "step": 150840 + }, + { + "entropy": 1.895962081849575, + "epoch": 0.4676218013746562, + "grad_norm": 8.5095853805542, + "learning_rate": 3.6995316212103853e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8454143151640892, + "num_tokens": 181488282.0, + "step": 150850 + }, + { + "entropy": 1.8628547742962838, + "epoch": 0.4676528004997059, + "grad_norm": 2.8419368267059326, + "learning_rate": 3.699409003635994e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8537169203162194, + "num_tokens": 181501584.0, + "step": 150860 + }, + { + "entropy": 1.915559995174408, + "epoch": 0.4676837996247556, + "grad_norm": 7.790353775024414, + "learning_rate": 3.6992863982529358e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8497019991278648, + "num_tokens": 181513133.0, + "step": 150870 + }, + { + "entropy": 1.856499010324478, + "epoch": 0.46771479874980526, + "grad_norm": 3.7286479473114014, + "learning_rate": 3.699163805059189e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8563529670238494, + "num_tokens": 181526648.0, + "step": 150880 + }, + { + "entropy": 1.8931098520755767, + "epoch": 0.467745797874855, + "grad_norm": 8.853365898132324, + "learning_rate": 3.699041224052734e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8582080423831939, + "num_tokens": 181538660.0, + "step": 150890 + }, + { + "entropy": 1.8996970742940902, + "epoch": 0.46777679699990465, + "grad_norm": 8.242768287658691, + "learning_rate": 3.6989186552315533e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8507839307188988, + "num_tokens": 181550742.0, + "step": 150900 + }, + { + "entropy": 1.9381722196936608, + "epoch": 0.4678077961249544, + "grad_norm": 8.271310806274414, + "learning_rate": 3.6987960985936266e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8593697786331177, + "num_tokens": 181562045.0, + "step": 150910 + }, + { + "entropy": 1.899745689332485, + "epoch": 0.46783879525000405, + "grad_norm": 9.7952241897583, + "learning_rate": 3.698673554136937e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8565332680940628, + "num_tokens": 181573997.0, + "step": 150920 + }, + { + "entropy": 1.9225837558507919, + "epoch": 0.46786979437505377, + "grad_norm": 6.767740726470947, + "learning_rate": 3.6985510218594654e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8650736913084984, + "num_tokens": 181585575.0, + "step": 150930 + }, + { + "entropy": 1.9318598195910455, + "epoch": 0.46790079350010344, + "grad_norm": 10.693130493164062, + "learning_rate": 3.698428501759196e-06, + "loss": 0.5226, + "mean_token_accuracy": 0.8426518440246582, + "num_tokens": 181597531.0, + "step": 150940 + }, + { + "entropy": 1.9709179133176804, + "epoch": 0.46793179262515316, + "grad_norm": 9.757343292236328, + "learning_rate": 3.6983059938341105e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8445596411824227, + "num_tokens": 181609089.0, + "step": 150950 + }, + { + "entropy": 1.9118812575936317, + "epoch": 0.46796279175020283, + "grad_norm": 7.699484825134277, + "learning_rate": 3.698183498082194e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8565623730421066, + "num_tokens": 181620966.0, + "step": 150960 + }, + { + "entropy": 1.918295791745186, + "epoch": 0.46799379087525256, + "grad_norm": 8.302897453308105, + "learning_rate": 3.698061014501429e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8550585329532623, + "num_tokens": 181632131.0, + "step": 150970 + }, + { + "entropy": 1.9636784076690674, + "epoch": 0.4680247900003022, + "grad_norm": 6.999095916748047, + "learning_rate": 3.697938543089802e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8381591513752937, + "num_tokens": 181643332.0, + "step": 150980 + }, + { + "entropy": 1.8199531510472298, + "epoch": 0.46805578912535195, + "grad_norm": 3.5625722408294678, + "learning_rate": 3.6978160838452965e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8629630744457245, + "num_tokens": 181656304.0, + "step": 150990 + }, + { + "entropy": 1.8767156556248665, + "epoch": 0.4680867882504016, + "grad_norm": 7.794158935546875, + "learning_rate": 3.697693636765899e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8513418570160866, + "num_tokens": 181668812.0, + "step": 151000 + }, + { + "entropy": 1.8939493969082832, + "epoch": 0.46811778737545134, + "grad_norm": 7.774882793426514, + "learning_rate": 3.697571201849594e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8537428677082062, + "num_tokens": 181680854.0, + "step": 151010 + }, + { + "entropy": 1.8954125791788101, + "epoch": 0.468148786500501, + "grad_norm": 9.265185356140137, + "learning_rate": 3.6974487790943705e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8574156984686852, + "num_tokens": 181693486.0, + "step": 151020 + }, + { + "entropy": 1.8389709085226058, + "epoch": 0.46817978562555074, + "grad_norm": 5.6764702796936035, + "learning_rate": 3.697326368498213e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.858234989643097, + "num_tokens": 181706258.0, + "step": 151030 + }, + { + "entropy": 1.9311970368027687, + "epoch": 0.4682107847506004, + "grad_norm": 11.079048156738281, + "learning_rate": 3.69720397005911e-06, + "loss": 0.487, + "mean_token_accuracy": 0.846907764673233, + "num_tokens": 181717998.0, + "step": 151040 + }, + { + "entropy": 1.9391098693013191, + "epoch": 0.46824178387565013, + "grad_norm": 8.38887882232666, + "learning_rate": 3.6970815837750495e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8551925793290138, + "num_tokens": 181729056.0, + "step": 151050 + }, + { + "entropy": 1.8806929126381875, + "epoch": 0.4682727830006998, + "grad_norm": 8.16617202758789, + "learning_rate": 3.696959209644019e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8616417735815048, + "num_tokens": 181740577.0, + "step": 151060 + }, + { + "entropy": 1.8071967348456384, + "epoch": 0.4683037821257495, + "grad_norm": 9.592656135559082, + "learning_rate": 3.696836847664009e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8578245341777802, + "num_tokens": 181753546.0, + "step": 151070 + }, + { + "entropy": 1.955559852719307, + "epoch": 0.4683347812507992, + "grad_norm": 9.505187034606934, + "learning_rate": 3.6967144978330066e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8476584449410438, + "num_tokens": 181764665.0, + "step": 151080 + }, + { + "entropy": 1.9042584761977195, + "epoch": 0.4683657803758489, + "grad_norm": 8.192767143249512, + "learning_rate": 3.6965921601490035e-06, + "loss": 0.5423, + "mean_token_accuracy": 0.8445826590061187, + "num_tokens": 181777194.0, + "step": 151090 + }, + { + "entropy": 1.9415881663560868, + "epoch": 0.4683967795008986, + "grad_norm": 8.145171165466309, + "learning_rate": 3.696469834609988e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.853820464015007, + "num_tokens": 181788063.0, + "step": 151100 + }, + { + "entropy": 1.7884770929813385, + "epoch": 0.4684277786259483, + "grad_norm": 9.306479454040527, + "learning_rate": 3.6963475212139516e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8613410949707031, + "num_tokens": 181801286.0, + "step": 151110 + }, + { + "entropy": 1.917163448035717, + "epoch": 0.468458777750998, + "grad_norm": 8.699514389038086, + "learning_rate": 3.696225219958886e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.8294718861579895, + "num_tokens": 181813453.0, + "step": 151120 + }, + { + "entropy": 1.886626946926117, + "epoch": 0.46848977687604765, + "grad_norm": 9.164982795715332, + "learning_rate": 3.6961029308427824e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8475595995783806, + "num_tokens": 181825948.0, + "step": 151130 + }, + { + "entropy": 1.8653237760066985, + "epoch": 0.46852077600109737, + "grad_norm": 8.744365692138672, + "learning_rate": 3.695980653863633e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8610041797161102, + "num_tokens": 181838058.0, + "step": 151140 + }, + { + "entropy": 1.8321866735816001, + "epoch": 0.46855177512614704, + "grad_norm": 7.345215320587158, + "learning_rate": 3.69585838901943e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8644196107983589, + "num_tokens": 181850533.0, + "step": 151150 + }, + { + "entropy": 1.8806213557720184, + "epoch": 0.46858277425119677, + "grad_norm": 8.255237579345703, + "learning_rate": 3.6957361363081657e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8567614004015922, + "num_tokens": 181863191.0, + "step": 151160 + }, + { + "entropy": 1.9859347879886626, + "epoch": 0.46861377337624643, + "grad_norm": 8.459641456604004, + "learning_rate": 3.6956138957278346e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8509835615754128, + "num_tokens": 181873914.0, + "step": 151170 + }, + { + "entropy": 1.9205264270305633, + "epoch": 0.46864477250129616, + "grad_norm": 8.495672225952148, + "learning_rate": 3.69549166727643e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8561896935105324, + "num_tokens": 181884822.0, + "step": 151180 + }, + { + "entropy": 1.827118131518364, + "epoch": 0.4686757716263458, + "grad_norm": 5.892608642578125, + "learning_rate": 3.695369450951948e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8628544971346855, + "num_tokens": 181897677.0, + "step": 151190 + }, + { + "entropy": 1.8908180356025697, + "epoch": 0.46870677075139555, + "grad_norm": 3.845689535140991, + "learning_rate": 3.6952472467523807e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8599082320928574, + "num_tokens": 181909762.0, + "step": 151200 + }, + { + "entropy": 1.935589936375618, + "epoch": 0.4687377698764452, + "grad_norm": 7.964409351348877, + "learning_rate": 3.6951250546757256e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8504336655139924, + "num_tokens": 181920725.0, + "step": 151210 + }, + { + "entropy": 1.915782979130745, + "epoch": 0.46876876900149494, + "grad_norm": 4.305631637573242, + "learning_rate": 3.6950028747199766e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8563524767756462, + "num_tokens": 181932535.0, + "step": 151220 + }, + { + "entropy": 1.8579757571220399, + "epoch": 0.4687997681265446, + "grad_norm": 8.895844459533691, + "learning_rate": 3.6948807068831323e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8616629391908646, + "num_tokens": 181944531.0, + "step": 151230 + }, + { + "entropy": 1.9457248359918595, + "epoch": 0.46883076725159434, + "grad_norm": 8.319205284118652, + "learning_rate": 3.694758551163187e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8451997622847557, + "num_tokens": 181955726.0, + "step": 151240 + }, + { + "entropy": 1.8975832119584084, + "epoch": 0.468861766376644, + "grad_norm": 8.178126335144043, + "learning_rate": 3.6946364075581394e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8526521921157837, + "num_tokens": 181968223.0, + "step": 151250 + }, + { + "entropy": 1.9673262536525726, + "epoch": 0.46889276550169373, + "grad_norm": 9.780797958374023, + "learning_rate": 3.6945142760659856e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8512610167264938, + "num_tokens": 181978341.0, + "step": 151260 + }, + { + "entropy": 1.928539128601551, + "epoch": 0.4689237646267434, + "grad_norm": 7.564601421356201, + "learning_rate": 3.694392156684726e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8705823361873627, + "num_tokens": 181989455.0, + "step": 151270 + }, + { + "entropy": 1.9041236862540245, + "epoch": 0.4689547637517931, + "grad_norm": 3.4355711936950684, + "learning_rate": 3.6942700494123577e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8603847548365593, + "num_tokens": 182001135.0, + "step": 151280 + }, + { + "entropy": 1.8445854999125004, + "epoch": 0.4689857628768428, + "grad_norm": 8.764761924743652, + "learning_rate": 3.6941479542468796e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8481447994709015, + "num_tokens": 182013834.0, + "step": 151290 + }, + { + "entropy": 1.861207364499569, + "epoch": 0.4690167620018925, + "grad_norm": 10.083645820617676, + "learning_rate": 3.694025871186291e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8576826229691505, + "num_tokens": 182026041.0, + "step": 151300 + }, + { + "entropy": 1.9643630295991898, + "epoch": 0.4690477611269422, + "grad_norm": 7.361579895019531, + "learning_rate": 3.693903800228593e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8524459466338158, + "num_tokens": 182036872.0, + "step": 151310 + }, + { + "entropy": 1.9375217407941818, + "epoch": 0.4690787602519919, + "grad_norm": 6.923783779144287, + "learning_rate": 3.6937817413717846e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8514922142028809, + "num_tokens": 182048041.0, + "step": 151320 + }, + { + "entropy": 1.982517033815384, + "epoch": 0.4691097593770416, + "grad_norm": 9.478878021240234, + "learning_rate": 3.693659694613868e-06, + "loss": 0.5387, + "mean_token_accuracy": 0.8426320448517799, + "num_tokens": 182058840.0, + "step": 151330 + }, + { + "entropy": 1.8076622486114502, + "epoch": 0.4691407585020913, + "grad_norm": 8.073373794555664, + "learning_rate": 3.6935376599528437e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8623651906847953, + "num_tokens": 182072827.0, + "step": 151340 + }, + { + "entropy": 1.9158219203352929, + "epoch": 0.469171757627141, + "grad_norm": 7.664277076721191, + "learning_rate": 3.693415637386714e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8522696018218994, + "num_tokens": 182084761.0, + "step": 151350 + }, + { + "entropy": 1.892909213900566, + "epoch": 0.46920275675219064, + "grad_norm": 3.7061331272125244, + "learning_rate": 3.69329362691348e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8496552616357803, + "num_tokens": 182096624.0, + "step": 151360 + }, + { + "entropy": 1.8524788722395897, + "epoch": 0.46923375587724037, + "grad_norm": 6.679312229156494, + "learning_rate": 3.693171628531146e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.860837659239769, + "num_tokens": 182109437.0, + "step": 151370 + }, + { + "entropy": 1.9032334506511688, + "epoch": 0.46926475500229003, + "grad_norm": 8.694540023803711, + "learning_rate": 3.693049642237714e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8432764261960983, + "num_tokens": 182121203.0, + "step": 151380 + }, + { + "entropy": 1.952576905488968, + "epoch": 0.46929575412733976, + "grad_norm": 7.671172142028809, + "learning_rate": 3.692927668031188e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8436528146266937, + "num_tokens": 182133063.0, + "step": 151390 + }, + { + "entropy": 1.9288091585040092, + "epoch": 0.46932675325238943, + "grad_norm": 8.635396957397461, + "learning_rate": 3.6928057059095722e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8535377115011216, + "num_tokens": 182144648.0, + "step": 151400 + }, + { + "entropy": 1.8419179022312164, + "epoch": 0.46935775237743915, + "grad_norm": 7.754616737365723, + "learning_rate": 3.6926837558708713e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8576859340071679, + "num_tokens": 182157015.0, + "step": 151410 + }, + { + "entropy": 1.7949551880359649, + "epoch": 0.4693887515024888, + "grad_norm": 5.1158061027526855, + "learning_rate": 3.69256181791309e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8699161261320114, + "num_tokens": 182169639.0, + "step": 151420 + }, + { + "entropy": 1.8809380188584328, + "epoch": 0.46941975062753855, + "grad_norm": 7.669633865356445, + "learning_rate": 3.692439892034234e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8522050336003304, + "num_tokens": 182182293.0, + "step": 151430 + }, + { + "entropy": 1.954986748099327, + "epoch": 0.4694507497525882, + "grad_norm": 8.638233184814453, + "learning_rate": 3.6923179782323094e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8547404408454895, + "num_tokens": 182194218.0, + "step": 151440 + }, + { + "entropy": 1.9031829819083215, + "epoch": 0.46948174887763794, + "grad_norm": 3.230351448059082, + "learning_rate": 3.6921960765053222e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8607431352138519, + "num_tokens": 182206098.0, + "step": 151450 + }, + { + "entropy": 1.860182997584343, + "epoch": 0.4695127480026876, + "grad_norm": 8.417341232299805, + "learning_rate": 3.6920741868512786e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8528778105974197, + "num_tokens": 182218062.0, + "step": 151460 + }, + { + "entropy": 1.8531003028154374, + "epoch": 0.46954374712773733, + "grad_norm": 4.122646331787109, + "learning_rate": 3.6919523092681877e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8522070854902267, + "num_tokens": 182231275.0, + "step": 151470 + }, + { + "entropy": 1.826745069026947, + "epoch": 0.469574746252787, + "grad_norm": 3.7218542098999023, + "learning_rate": 3.6918304437540562e-06, + "loss": 0.433, + "mean_token_accuracy": 0.850635839998722, + "num_tokens": 182244141.0, + "step": 151480 + }, + { + "entropy": 1.8726869612932204, + "epoch": 0.4696057453778367, + "grad_norm": 4.05550479888916, + "learning_rate": 3.6917085903068917e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8551696226000786, + "num_tokens": 182256928.0, + "step": 151490 + }, + { + "entropy": 1.8021176487207413, + "epoch": 0.4696367445028864, + "grad_norm": 8.80622386932373, + "learning_rate": 3.6915867489247047e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.868051840364933, + "num_tokens": 182270315.0, + "step": 151500 + }, + { + "entropy": 1.9134663611650466, + "epoch": 0.4696677436279361, + "grad_norm": 8.243125915527344, + "learning_rate": 3.691464919605503e-06, + "loss": 0.453, + "mean_token_accuracy": 0.856773529946804, + "num_tokens": 182281992.0, + "step": 151510 + }, + { + "entropy": 1.8596371784806252, + "epoch": 0.4696987427529858, + "grad_norm": 6.062982559204102, + "learning_rate": 3.6913431023472958e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.861498761177063, + "num_tokens": 182294109.0, + "step": 151520 + }, + { + "entropy": 1.8382547229528428, + "epoch": 0.4697297418780355, + "grad_norm": 8.140950202941895, + "learning_rate": 3.6912212971480952e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8637099817395211, + "num_tokens": 182307751.0, + "step": 151530 + }, + { + "entropy": 1.9259546086192132, + "epoch": 0.4697607410030852, + "grad_norm": 7.980990886688232, + "learning_rate": 3.69109950400591e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8463671579957008, + "num_tokens": 182319503.0, + "step": 151540 + }, + { + "entropy": 1.932530763745308, + "epoch": 0.4697917401281349, + "grad_norm": 7.715649604797363, + "learning_rate": 3.690977722918751e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.848869289457798, + "num_tokens": 182331575.0, + "step": 151550 + }, + { + "entropy": 1.8189910173416137, + "epoch": 0.4698227392531846, + "grad_norm": 11.992012023925781, + "learning_rate": 3.690855953884631e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8527128636837006, + "num_tokens": 182344435.0, + "step": 151560 + }, + { + "entropy": 1.9173847809433937, + "epoch": 0.4698537383782343, + "grad_norm": 6.471937656402588, + "learning_rate": 3.6907341969015616e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8575096279382706, + "num_tokens": 182355816.0, + "step": 151570 + }, + { + "entropy": 1.8749173179268837, + "epoch": 0.46988473750328397, + "grad_norm": 7.470911502838135, + "learning_rate": 3.6906124519675545e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8550667837262154, + "num_tokens": 182367893.0, + "step": 151580 + }, + { + "entropy": 1.88685844540596, + "epoch": 0.4699157366283337, + "grad_norm": 6.896969318389893, + "learning_rate": 3.6904907190806227e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8534591227769852, + "num_tokens": 182379280.0, + "step": 151590 + }, + { + "entropy": 2.0131676971912382, + "epoch": 0.46994673575338336, + "grad_norm": 9.929747581481934, + "learning_rate": 3.6903689982387797e-06, + "loss": 0.557, + "mean_token_accuracy": 0.834673935174942, + "num_tokens": 182390158.0, + "step": 151600 + }, + { + "entropy": 1.903835128247738, + "epoch": 0.46997773487843303, + "grad_norm": 8.8099365234375, + "learning_rate": 3.6902472894400397e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8607927531003952, + "num_tokens": 182401869.0, + "step": 151610 + }, + { + "entropy": 1.720665130019188, + "epoch": 0.47000873400348275, + "grad_norm": 7.94708776473999, + "learning_rate": 3.6901255926824165e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8740614369511605, + "num_tokens": 182416920.0, + "step": 151620 + }, + { + "entropy": 1.8281274721026421, + "epoch": 0.4700397331285324, + "grad_norm": 9.17940616607666, + "learning_rate": 3.6900039079639236e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.858392845094204, + "num_tokens": 182429039.0, + "step": 151630 + }, + { + "entropy": 1.8888639822602271, + "epoch": 0.47007073225358215, + "grad_norm": 8.588458061218262, + "learning_rate": 3.689882235282579e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8532740786671639, + "num_tokens": 182441392.0, + "step": 151640 + }, + { + "entropy": 1.9319140702486037, + "epoch": 0.4701017313786318, + "grad_norm": 8.485003471374512, + "learning_rate": 3.689760574636396e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8546903505921364, + "num_tokens": 182452652.0, + "step": 151650 + }, + { + "entropy": 1.90705948472023, + "epoch": 0.47013273050368154, + "grad_norm": 4.283156394958496, + "learning_rate": 3.6896389260233906e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8467658385634422, + "num_tokens": 182464784.0, + "step": 151660 + }, + { + "entropy": 1.8410677209496498, + "epoch": 0.4701637296287312, + "grad_norm": 7.59536075592041, + "learning_rate": 3.6895172894415802e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8557831704616546, + "num_tokens": 182476898.0, + "step": 151670 + }, + { + "entropy": 1.777267834544182, + "epoch": 0.47019472875378093, + "grad_norm": 8.52239990234375, + "learning_rate": 3.6893956648889815e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8673418864607811, + "num_tokens": 182490990.0, + "step": 151680 + }, + { + "entropy": 1.9240646213293076, + "epoch": 0.4702257278788306, + "grad_norm": 8.47541332244873, + "learning_rate": 3.689274052363612e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8535736605525017, + "num_tokens": 182502543.0, + "step": 151690 + }, + { + "entropy": 1.9823878049850463, + "epoch": 0.4702567270038803, + "grad_norm": 7.675898551940918, + "learning_rate": 3.6891524518634897e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8481106892228126, + "num_tokens": 182513069.0, + "step": 151700 + }, + { + "entropy": 1.8830109879374504, + "epoch": 0.47028772612893, + "grad_norm": 3.7766404151916504, + "learning_rate": 3.6890308633866324e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8605834320187569, + "num_tokens": 182524794.0, + "step": 151710 + }, + { + "entropy": 1.9220516681671143, + "epoch": 0.4703187252539797, + "grad_norm": 7.916464805603027, + "learning_rate": 3.6889092869310594e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.859467314183712, + "num_tokens": 182535849.0, + "step": 151720 + }, + { + "entropy": 1.8670978620648384, + "epoch": 0.4703497243790294, + "grad_norm": 7.645886421203613, + "learning_rate": 3.68878772249479e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8644891589879989, + "num_tokens": 182548458.0, + "step": 151730 + }, + { + "entropy": 1.9042206808924675, + "epoch": 0.4703807235040791, + "grad_norm": 8.000757217407227, + "learning_rate": 3.6886661700758436e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8545676082372665, + "num_tokens": 182560533.0, + "step": 151740 + }, + { + "entropy": 1.8703896641731261, + "epoch": 0.4704117226291288, + "grad_norm": 7.271552562713623, + "learning_rate": 3.6885446296722404e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8588535279035568, + "num_tokens": 182572183.0, + "step": 151750 + }, + { + "entropy": 1.8296963378787041, + "epoch": 0.4704427217541785, + "grad_norm": 7.709904670715332, + "learning_rate": 3.688423101282001e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8668505728244782, + "num_tokens": 182584228.0, + "step": 151760 + }, + { + "entropy": 1.8257486388087272, + "epoch": 0.4704737208792282, + "grad_norm": 3.957359552383423, + "learning_rate": 3.688301584903147e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8542505398392677, + "num_tokens": 182597285.0, + "step": 151770 + }, + { + "entropy": 1.9298125982284546, + "epoch": 0.4705047200042779, + "grad_norm": 9.158317565917969, + "learning_rate": 3.688180080533698e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8475269585847854, + "num_tokens": 182609461.0, + "step": 151780 + }, + { + "entropy": 1.8265347346663474, + "epoch": 0.47053571912932757, + "grad_norm": 3.449368953704834, + "learning_rate": 3.688058588171679e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8622838228940963, + "num_tokens": 182623236.0, + "step": 151790 + }, + { + "entropy": 1.9088860154151917, + "epoch": 0.4705667182543773, + "grad_norm": 4.117319107055664, + "learning_rate": 3.68793710781511e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8477122217416764, + "num_tokens": 182636203.0, + "step": 151800 + }, + { + "entropy": 1.9173002913594246, + "epoch": 0.47059771737942696, + "grad_norm": 7.626106262207031, + "learning_rate": 3.6878156394620156e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8568134188652039, + "num_tokens": 182647282.0, + "step": 151810 + }, + { + "entropy": 1.9399552628397942, + "epoch": 0.4706287165044767, + "grad_norm": 9.241488456726074, + "learning_rate": 3.687694183110418e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8620687261223793, + "num_tokens": 182658289.0, + "step": 151820 + }, + { + "entropy": 1.911539526283741, + "epoch": 0.47065971562952635, + "grad_norm": 8.373076438903809, + "learning_rate": 3.687572738758341e-06, + "loss": 0.435, + "mean_token_accuracy": 0.85491793602705, + "num_tokens": 182670044.0, + "step": 151830 + }, + { + "entropy": 1.9582807749509812, + "epoch": 0.4706907147545761, + "grad_norm": 7.713615894317627, + "learning_rate": 3.68745130640381e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8571858555078506, + "num_tokens": 182681164.0, + "step": 151840 + }, + { + "entropy": 1.9365883350372315, + "epoch": 0.47072171387962575, + "grad_norm": 8.034655570983887, + "learning_rate": 3.687329886044849e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8554182961583138, + "num_tokens": 182691971.0, + "step": 151850 + }, + { + "entropy": 1.9806547105312347, + "epoch": 0.4707527130046754, + "grad_norm": 9.290281295776367, + "learning_rate": 3.6872084776794824e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8474147111177445, + "num_tokens": 182702979.0, + "step": 151860 + }, + { + "entropy": 1.9025154545903207, + "epoch": 0.47078371212972514, + "grad_norm": 7.480702877044678, + "learning_rate": 3.6870870813057372e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8635838240385055, + "num_tokens": 182714689.0, + "step": 151870 + }, + { + "entropy": 1.9400853931903839, + "epoch": 0.4708147112547748, + "grad_norm": 8.509082794189453, + "learning_rate": 3.6869656969216393e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8622403219342232, + "num_tokens": 182725603.0, + "step": 151880 + }, + { + "entropy": 1.9585573732852937, + "epoch": 0.47084571037982453, + "grad_norm": 9.956759452819824, + "learning_rate": 3.686844324525214e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8526047378778457, + "num_tokens": 182736039.0, + "step": 151890 + }, + { + "entropy": 1.9035368889570237, + "epoch": 0.4708767095048742, + "grad_norm": 7.4699387550354, + "learning_rate": 3.6867229641144897e-06, + "loss": 0.51, + "mean_token_accuracy": 0.8410866633057594, + "num_tokens": 182747402.0, + "step": 151900 + }, + { + "entropy": 1.909564508497715, + "epoch": 0.4709077086299239, + "grad_norm": 8.168258666992188, + "learning_rate": 3.686601615687493e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8454542621970177, + "num_tokens": 182759365.0, + "step": 151910 + }, + { + "entropy": 1.9451777666807175, + "epoch": 0.4709387077549736, + "grad_norm": 8.727764129638672, + "learning_rate": 3.686480279242252e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8463275358080864, + "num_tokens": 182771263.0, + "step": 151920 + }, + { + "entropy": 1.875081916153431, + "epoch": 0.4709697068800233, + "grad_norm": 10.775795936584473, + "learning_rate": 3.6863589547767952e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8565489128232002, + "num_tokens": 182782809.0, + "step": 151930 + }, + { + "entropy": 1.8936992183327674, + "epoch": 0.471000706005073, + "grad_norm": 3.990841865539551, + "learning_rate": 3.6862376422891516e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8535718321800232, + "num_tokens": 182794788.0, + "step": 151940 + }, + { + "entropy": 1.8360718876123427, + "epoch": 0.4710317051301227, + "grad_norm": 9.025553703308105, + "learning_rate": 3.6861163417773506e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.855416850745678, + "num_tokens": 182807619.0, + "step": 151950 + }, + { + "entropy": 1.827831156551838, + "epoch": 0.4710627042551724, + "grad_norm": 7.876723766326904, + "learning_rate": 3.6859950532394207e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8626267358660697, + "num_tokens": 182820542.0, + "step": 151960 + }, + { + "entropy": 1.9320463940501214, + "epoch": 0.4710937033802221, + "grad_norm": 4.7400383949279785, + "learning_rate": 3.6858737766733936e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8590385049581528, + "num_tokens": 182832246.0, + "step": 151970 + }, + { + "entropy": 1.9527651473879815, + "epoch": 0.4711247025052718, + "grad_norm": 7.10463285446167, + "learning_rate": 3.6857525120772986e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8457214340567589, + "num_tokens": 182843575.0, + "step": 151980 + }, + { + "entropy": 1.912015789747238, + "epoch": 0.4711557016303215, + "grad_norm": 10.008974075317383, + "learning_rate": 3.6856312594491684e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8591500997543335, + "num_tokens": 182855099.0, + "step": 151990 + }, + { + "entropy": 1.8941986069083214, + "epoch": 0.47118670075537117, + "grad_norm": 7.317121505737305, + "learning_rate": 3.6855100187870336e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8630182087421417, + "num_tokens": 182867082.0, + "step": 152000 + }, + { + "entropy": 1.9675297275185586, + "epoch": 0.4712176998804209, + "grad_norm": 8.49598503112793, + "learning_rate": 3.685388790088926e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8479068145155907, + "num_tokens": 182878343.0, + "step": 152010 + }, + { + "entropy": 1.8332099094986916, + "epoch": 0.47124869900547056, + "grad_norm": 9.543118476867676, + "learning_rate": 3.6852675733528774e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.867411358654499, + "num_tokens": 182891559.0, + "step": 152020 + }, + { + "entropy": 1.8728103652596473, + "epoch": 0.4712796981305203, + "grad_norm": 7.202358245849609, + "learning_rate": 3.6851463685769223e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8574921682476997, + "num_tokens": 182903624.0, + "step": 152030 + }, + { + "entropy": 1.8856816232204436, + "epoch": 0.47131069725556995, + "grad_norm": 7.488083362579346, + "learning_rate": 3.6850251757590934e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8632172226905823, + "num_tokens": 182915491.0, + "step": 152040 + }, + { + "entropy": 1.961459246277809, + "epoch": 0.4713416963806197, + "grad_norm": 7.7718915939331055, + "learning_rate": 3.684903994897424e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8584063723683357, + "num_tokens": 182926333.0, + "step": 152050 + }, + { + "entropy": 1.9349211975932121, + "epoch": 0.47137269550566935, + "grad_norm": 7.356520652770996, + "learning_rate": 3.684782825989949e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.85878836363554, + "num_tokens": 182937813.0, + "step": 152060 + }, + { + "entropy": 1.83448978215456, + "epoch": 0.4714036946307191, + "grad_norm": 7.66597843170166, + "learning_rate": 3.6846616690347028e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8577890947461129, + "num_tokens": 182951161.0, + "step": 152070 + }, + { + "entropy": 1.9403810694813728, + "epoch": 0.47143469375576874, + "grad_norm": 8.418362617492676, + "learning_rate": 3.684540524029721e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8424350410699845, + "num_tokens": 182962756.0, + "step": 152080 + }, + { + "entropy": 1.9282257035374641, + "epoch": 0.47146569288081847, + "grad_norm": 7.732948303222656, + "learning_rate": 3.6844193909730393e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8520740434527397, + "num_tokens": 182974074.0, + "step": 152090 + }, + { + "entropy": 1.8858251735568046, + "epoch": 0.47149669200586813, + "grad_norm": 7.745698928833008, + "learning_rate": 3.684298269862692e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8505064308643341, + "num_tokens": 182986163.0, + "step": 152100 + }, + { + "entropy": 1.8810465842485429, + "epoch": 0.4715276911309178, + "grad_norm": 4.165502071380615, + "learning_rate": 3.6841771606967176e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8553640246391296, + "num_tokens": 182998633.0, + "step": 152110 + }, + { + "entropy": 1.8921176463365554, + "epoch": 0.4715586902559675, + "grad_norm": 7.671092987060547, + "learning_rate": 3.684056063473152e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8703850194811821, + "num_tokens": 183010501.0, + "step": 152120 + }, + { + "entropy": 1.8996976360678672, + "epoch": 0.4715896893810172, + "grad_norm": 3.6530537605285645, + "learning_rate": 3.6839349781900336e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8507614269852638, + "num_tokens": 183022012.0, + "step": 152130 + }, + { + "entropy": 1.8891934141516686, + "epoch": 0.4716206885060669, + "grad_norm": 3.9836368560791016, + "learning_rate": 3.6838139048453997e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8550976201891899, + "num_tokens": 183034335.0, + "step": 152140 + }, + { + "entropy": 1.8683853819966316, + "epoch": 0.4716516876311166, + "grad_norm": 7.420729160308838, + "learning_rate": 3.6836928434372883e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8592018321156502, + "num_tokens": 183046580.0, + "step": 152150 + }, + { + "entropy": 1.9617306634783744, + "epoch": 0.4716826867561663, + "grad_norm": 8.44356632232666, + "learning_rate": 3.6835717939637382e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8451585352420807, + "num_tokens": 183057635.0, + "step": 152160 + }, + { + "entropy": 1.895785966515541, + "epoch": 0.471713685881216, + "grad_norm": 9.277005195617676, + "learning_rate": 3.6834507564227894e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8586176127195358, + "num_tokens": 183069793.0, + "step": 152170 + }, + { + "entropy": 1.9464083522558213, + "epoch": 0.4717446850062657, + "grad_norm": 9.0181884765625, + "learning_rate": 3.6833297308124816e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8472587198019028, + "num_tokens": 183081520.0, + "step": 152180 + }, + { + "entropy": 1.8146517232060433, + "epoch": 0.4717756841313154, + "grad_norm": 7.19028902053833, + "learning_rate": 3.683208717130854e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8625146299600601, + "num_tokens": 183094449.0, + "step": 152190 + }, + { + "entropy": 1.8609568387269975, + "epoch": 0.4718066832563651, + "grad_norm": 9.199468612670898, + "learning_rate": 3.6830877153759475e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8427708148956299, + "num_tokens": 183107155.0, + "step": 152200 + }, + { + "entropy": 1.9776297345757485, + "epoch": 0.47183768238141477, + "grad_norm": 9.14094352722168, + "learning_rate": 3.682966725545803e-06, + "loss": 0.52, + "mean_token_accuracy": 0.8375296384096146, + "num_tokens": 183118329.0, + "step": 152210 + }, + { + "entropy": 1.8778993353247642, + "epoch": 0.4718686815064645, + "grad_norm": 7.582857131958008, + "learning_rate": 3.6828457476384627e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8534572198987007, + "num_tokens": 183130122.0, + "step": 152220 + }, + { + "entropy": 1.9139007419347762, + "epoch": 0.47189968063151416, + "grad_norm": 8.777506828308105, + "learning_rate": 3.6827247816519684e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8544740185141564, + "num_tokens": 183141880.0, + "step": 152230 + }, + { + "entropy": 1.8196016132831574, + "epoch": 0.4719306797565639, + "grad_norm": 6.885635852813721, + "learning_rate": 3.6826038275843614e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8703275233507156, + "num_tokens": 183154880.0, + "step": 152240 + }, + { + "entropy": 1.951605823636055, + "epoch": 0.47196167888161356, + "grad_norm": 5.0247979164123535, + "learning_rate": 3.682482885433685e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8587975725531578, + "num_tokens": 183166905.0, + "step": 152250 + }, + { + "entropy": 1.8515436753630639, + "epoch": 0.4719926780066633, + "grad_norm": 7.222294807434082, + "learning_rate": 3.682361955197983e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8555783748626709, + "num_tokens": 183179878.0, + "step": 152260 + }, + { + "entropy": 1.9167709454894066, + "epoch": 0.47202367713171295, + "grad_norm": 3.6357107162475586, + "learning_rate": 3.682241036875299e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8456657961010933, + "num_tokens": 183191647.0, + "step": 152270 + }, + { + "entropy": 1.9338033080101014, + "epoch": 0.4720546762567627, + "grad_norm": 8.893651962280273, + "learning_rate": 3.682120130463677e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8554412201046944, + "num_tokens": 183203524.0, + "step": 152280 + }, + { + "entropy": 1.8064615935087205, + "epoch": 0.47208567538181234, + "grad_norm": 8.168737411499023, + "learning_rate": 3.681999235961162e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8518370047211647, + "num_tokens": 183216820.0, + "step": 152290 + }, + { + "entropy": 1.8602395519614219, + "epoch": 0.47211667450686207, + "grad_norm": 9.071744918823242, + "learning_rate": 3.681878353365799e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8614316672086716, + "num_tokens": 183228935.0, + "step": 152300 + }, + { + "entropy": 1.8436309307813645, + "epoch": 0.47214767363191174, + "grad_norm": 7.226712226867676, + "learning_rate": 3.6817574826756326e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8612644121050834, + "num_tokens": 183241674.0, + "step": 152310 + }, + { + "entropy": 1.8456968396902085, + "epoch": 0.47217867275696146, + "grad_norm": 7.766792297363281, + "learning_rate": 3.6816366238887095e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8446129441261292, + "num_tokens": 183254431.0, + "step": 152320 + }, + { + "entropy": 1.8767608508467675, + "epoch": 0.47220967188201113, + "grad_norm": 8.538994789123535, + "learning_rate": 3.681515777003077e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8586622327566147, + "num_tokens": 183265932.0, + "step": 152330 + }, + { + "entropy": 1.9017756059765816, + "epoch": 0.47224067100706085, + "grad_norm": 9.971243858337402, + "learning_rate": 3.68139494201678e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8545010507106781, + "num_tokens": 183277542.0, + "step": 152340 + }, + { + "entropy": 1.9646113216876984, + "epoch": 0.4722716701321105, + "grad_norm": 12.376042366027832, + "learning_rate": 3.681274118927867e-06, + "loss": 0.506, + "mean_token_accuracy": 0.8538304835557937, + "num_tokens": 183288333.0, + "step": 152350 + }, + { + "entropy": 1.8397805973887444, + "epoch": 0.4723026692571602, + "grad_norm": 3.604233741760254, + "learning_rate": 3.6811533077343866e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8551945656538009, + "num_tokens": 183301004.0, + "step": 152360 + }, + { + "entropy": 1.8144716992974281, + "epoch": 0.4723336683822099, + "grad_norm": 8.521669387817383, + "learning_rate": 3.681032508434385e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8576774016022682, + "num_tokens": 183313976.0, + "step": 152370 + }, + { + "entropy": 1.9162440985441207, + "epoch": 0.4723646675072596, + "grad_norm": 4.395325660705566, + "learning_rate": 3.6809117210259127e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8563192039728165, + "num_tokens": 183325996.0, + "step": 152380 + }, + { + "entropy": 1.800866074860096, + "epoch": 0.4723956666323093, + "grad_norm": 4.031833648681641, + "learning_rate": 3.680790945507018e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8660914584994316, + "num_tokens": 183338994.0, + "step": 152390 + }, + { + "entropy": 1.9331076472997666, + "epoch": 0.472426665757359, + "grad_norm": 7.948485851287842, + "learning_rate": 3.6806701818757502e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8479227542877197, + "num_tokens": 183350361.0, + "step": 152400 + }, + { + "entropy": 1.894187593460083, + "epoch": 0.4724576648824087, + "grad_norm": 3.7551207542419434, + "learning_rate": 3.680549430130159e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8658199816942215, + "num_tokens": 183361759.0, + "step": 152410 + }, + { + "entropy": 1.936548638343811, + "epoch": 0.47248866400745837, + "grad_norm": 7.659783363342285, + "learning_rate": 3.680428690268297e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8464308723807334, + "num_tokens": 183372606.0, + "step": 152420 + }, + { + "entropy": 1.892865703999996, + "epoch": 0.4725196631325081, + "grad_norm": 5.735332012176514, + "learning_rate": 3.6803079622882127e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8505585104227066, + "num_tokens": 183384415.0, + "step": 152430 + }, + { + "entropy": 1.8908448189496994, + "epoch": 0.47255066225755776, + "grad_norm": 7.597827434539795, + "learning_rate": 3.6801872461879588e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8497716590762139, + "num_tokens": 183396513.0, + "step": 152440 + }, + { + "entropy": 1.9123169988393784, + "epoch": 0.4725816613826075, + "grad_norm": 8.45621395111084, + "learning_rate": 3.6800665419655856e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8462508618831635, + "num_tokens": 183407985.0, + "step": 152450 + }, + { + "entropy": 1.8796579480171203, + "epoch": 0.47261266050765716, + "grad_norm": 7.05368185043335, + "learning_rate": 3.6799458496191475e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8509629443287849, + "num_tokens": 183419056.0, + "step": 152460 + }, + { + "entropy": 1.8205665156245232, + "epoch": 0.4726436596327069, + "grad_norm": 7.551618576049805, + "learning_rate": 3.6798251691466964e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.858218289911747, + "num_tokens": 183431365.0, + "step": 152470 + }, + { + "entropy": 1.857941946387291, + "epoch": 0.47267465875775655, + "grad_norm": 9.102563858032227, + "learning_rate": 3.6797045005462845e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8679466590285301, + "num_tokens": 183442920.0, + "step": 152480 + }, + { + "entropy": 1.87067861109972, + "epoch": 0.4727056578828063, + "grad_norm": 4.3301897048950195, + "learning_rate": 3.6795838438159666e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8534408792853355, + "num_tokens": 183454386.0, + "step": 152490 + }, + { + "entropy": 1.8775417447090148, + "epoch": 0.47273665700785594, + "grad_norm": 4.135782241821289, + "learning_rate": 3.6794631989537953e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8456904798746109, + "num_tokens": 183466342.0, + "step": 152500 + }, + { + "entropy": 1.9376057237386703, + "epoch": 0.47276765613290567, + "grad_norm": 8.15317153930664, + "learning_rate": 3.6793425659578263e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8525159105658531, + "num_tokens": 183477396.0, + "step": 152510 + }, + { + "entropy": 1.8745385244488717, + "epoch": 0.47279865525795534, + "grad_norm": 2.6730575561523438, + "learning_rate": 3.6792219448261145e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8526719242334366, + "num_tokens": 183489938.0, + "step": 152520 + }, + { + "entropy": 1.9059744358062745, + "epoch": 0.47282965438300506, + "grad_norm": 6.631153106689453, + "learning_rate": 3.6791013355567153e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8524453654885292, + "num_tokens": 183501781.0, + "step": 152530 + }, + { + "entropy": 1.9294340521097184, + "epoch": 0.47286065350805473, + "grad_norm": 9.2339506149292, + "learning_rate": 3.6789807381476832e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8489730969071388, + "num_tokens": 183513298.0, + "step": 152540 + }, + { + "entropy": 1.915228234231472, + "epoch": 0.47289165263310445, + "grad_norm": 7.601345062255859, + "learning_rate": 3.6788601525970764e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8517981469631195, + "num_tokens": 183524819.0, + "step": 152550 + }, + { + "entropy": 1.8034334376454353, + "epoch": 0.4729226517581541, + "grad_norm": 4.246415138244629, + "learning_rate": 3.67873957890295e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8603020891547203, + "num_tokens": 183537540.0, + "step": 152560 + }, + { + "entropy": 1.8432205379009248, + "epoch": 0.47295365088320385, + "grad_norm": 8.568245887756348, + "learning_rate": 3.6786190170633637e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8558652251958847, + "num_tokens": 183550060.0, + "step": 152570 + }, + { + "entropy": 1.9111878633499146, + "epoch": 0.4729846500082535, + "grad_norm": 6.995577812194824, + "learning_rate": 3.678498467076371e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8523382723331452, + "num_tokens": 183561570.0, + "step": 152580 + }, + { + "entropy": 1.792884823679924, + "epoch": 0.47301564913330324, + "grad_norm": 5.278679370880127, + "learning_rate": 3.6783779289400336e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8689914017915725, + "num_tokens": 183575606.0, + "step": 152590 + }, + { + "entropy": 1.8932118907570838, + "epoch": 0.4730466482583529, + "grad_norm": 8.754302024841309, + "learning_rate": 3.6782574026524075e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8607703685760498, + "num_tokens": 183588123.0, + "step": 152600 + }, + { + "entropy": 1.863177940249443, + "epoch": 0.4730776473834026, + "grad_norm": 4.379142761230469, + "learning_rate": 3.6781368882115536e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8468897491693497, + "num_tokens": 183600671.0, + "step": 152610 + }, + { + "entropy": 1.9543335124850274, + "epoch": 0.4731086465084523, + "grad_norm": 8.659340858459473, + "learning_rate": 3.6780163856155306e-06, + "loss": 0.5324, + "mean_token_accuracy": 0.8367661848664284, + "num_tokens": 183612571.0, + "step": 152620 + }, + { + "entropy": 1.8547952726483345, + "epoch": 0.47313964563350197, + "grad_norm": 11.049469947814941, + "learning_rate": 3.677895894862398e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.853056563436985, + "num_tokens": 183625066.0, + "step": 152630 + }, + { + "entropy": 1.8781471252441406, + "epoch": 0.4731706447585517, + "grad_norm": 3.9625051021575928, + "learning_rate": 3.6777754159502156e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8535264268517494, + "num_tokens": 183637140.0, + "step": 152640 + }, + { + "entropy": 1.9165562033653258, + "epoch": 0.47320164388360136, + "grad_norm": 8.369365692138672, + "learning_rate": 3.677654948877046e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.854417635500431, + "num_tokens": 183649188.0, + "step": 152650 + }, + { + "entropy": 1.9343287527561188, + "epoch": 0.4732326430086511, + "grad_norm": 8.027566909790039, + "learning_rate": 3.6775344936409477e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8549977317452431, + "num_tokens": 183660619.0, + "step": 152660 + }, + { + "entropy": 1.7774075135588645, + "epoch": 0.47326364213370076, + "grad_norm": 8.56772518157959, + "learning_rate": 3.6774140502399853e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8563277691602706, + "num_tokens": 183674724.0, + "step": 152670 + }, + { + "entropy": 1.7775170743465423, + "epoch": 0.4732946412587505, + "grad_norm": 3.6011083126068115, + "learning_rate": 3.6772936186722188e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8681730851531029, + "num_tokens": 183688045.0, + "step": 152680 + }, + { + "entropy": 1.891077609360218, + "epoch": 0.47332564038380015, + "grad_norm": 9.511871337890625, + "learning_rate": 3.6771731989357106e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8397925227880478, + "num_tokens": 183700096.0, + "step": 152690 + }, + { + "entropy": 1.8384882450103759, + "epoch": 0.4733566395088499, + "grad_norm": 6.966220855712891, + "learning_rate": 3.6770527910285245e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8562698677182198, + "num_tokens": 183712595.0, + "step": 152700 + }, + { + "entropy": 1.85170476436615, + "epoch": 0.47338763863389954, + "grad_norm": 8.161643981933594, + "learning_rate": 3.6769323949487246e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8651944741606712, + "num_tokens": 183724796.0, + "step": 152710 + }, + { + "entropy": 1.8865203723311423, + "epoch": 0.47341863775894927, + "grad_norm": 10.452710151672363, + "learning_rate": 3.676812010694373e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8501796692609787, + "num_tokens": 183737051.0, + "step": 152720 + }, + { + "entropy": 1.841557838022709, + "epoch": 0.47344963688399894, + "grad_norm": 3.842358350753784, + "learning_rate": 3.6766916382635347e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8630548194050789, + "num_tokens": 183749899.0, + "step": 152730 + }, + { + "entropy": 1.905419360846281, + "epoch": 0.47348063600904866, + "grad_norm": 7.395157337188721, + "learning_rate": 3.6765712776542745e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.856062363088131, + "num_tokens": 183762289.0, + "step": 152740 + }, + { + "entropy": 1.6978764295578004, + "epoch": 0.47351163513409833, + "grad_norm": 8.431910514831543, + "learning_rate": 3.6764509288646577e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8770282730460167, + "num_tokens": 183776782.0, + "step": 152750 + }, + { + "entropy": 1.8656988859176635, + "epoch": 0.47354263425914805, + "grad_norm": 9.192631721496582, + "learning_rate": 3.6763305918927494e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.871405579149723, + "num_tokens": 183788701.0, + "step": 152760 + }, + { + "entropy": 1.8958804607391357, + "epoch": 0.4735736333841977, + "grad_norm": 7.898538112640381, + "learning_rate": 3.676210266736617e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8465561047196388, + "num_tokens": 183799771.0, + "step": 152770 + }, + { + "entropy": 1.8809950187802316, + "epoch": 0.47360463250924745, + "grad_norm": 3.637345790863037, + "learning_rate": 3.6760899533943257e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8576677471399308, + "num_tokens": 183811578.0, + "step": 152780 + }, + { + "entropy": 1.804450687766075, + "epoch": 0.4736356316342971, + "grad_norm": 3.748936653137207, + "learning_rate": 3.675969651863942e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8617606118321419, + "num_tokens": 183824765.0, + "step": 152790 + }, + { + "entropy": 1.8327493906021117, + "epoch": 0.47366663075934684, + "grad_norm": 8.088676452636719, + "learning_rate": 3.675849362143535e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8559876635670662, + "num_tokens": 183837545.0, + "step": 152800 + }, + { + "entropy": 1.8694708958268165, + "epoch": 0.4736976298843965, + "grad_norm": 8.502338409423828, + "learning_rate": 3.6757290842311712e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8672619074583053, + "num_tokens": 183849680.0, + "step": 152810 + }, + { + "entropy": 1.9533298015594482, + "epoch": 0.47372862900944623, + "grad_norm": 8.786824226379395, + "learning_rate": 3.6756088181249183e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.844579030573368, + "num_tokens": 183860338.0, + "step": 152820 + }, + { + "entropy": 1.7913990572094918, + "epoch": 0.4737596281344959, + "grad_norm": 7.474668979644775, + "learning_rate": 3.675488563822847e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8707406103610993, + "num_tokens": 183872820.0, + "step": 152830 + }, + { + "entropy": 1.8852297574281693, + "epoch": 0.4737906272595456, + "grad_norm": 8.760466575622559, + "learning_rate": 3.675368321323025e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8509562835097313, + "num_tokens": 183884880.0, + "step": 152840 + }, + { + "entropy": 1.9746339291334152, + "epoch": 0.4738216263845953, + "grad_norm": 6.545812129974365, + "learning_rate": 3.6752480906235227e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.8477579385042191, + "num_tokens": 183896158.0, + "step": 152850 + }, + { + "entropy": 1.8532864689826964, + "epoch": 0.47385262550964496, + "grad_norm": 7.785163879394531, + "learning_rate": 3.675127871722409e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8605849370360374, + "num_tokens": 183908502.0, + "step": 152860 + }, + { + "entropy": 1.927658785879612, + "epoch": 0.4738836246346947, + "grad_norm": 8.491934776306152, + "learning_rate": 3.6750076646177558e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8559217736124992, + "num_tokens": 183920673.0, + "step": 152870 + }, + { + "entropy": 1.9375594913959504, + "epoch": 0.47391462375974436, + "grad_norm": 7.955965995788574, + "learning_rate": 3.6748874693076326e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8447357758879661, + "num_tokens": 183932317.0, + "step": 152880 + }, + { + "entropy": 1.831139837950468, + "epoch": 0.4739456228847941, + "grad_norm": 4.309988021850586, + "learning_rate": 3.6747672857901117e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8567724660038948, + "num_tokens": 183945777.0, + "step": 152890 + }, + { + "entropy": 1.9209632962942123, + "epoch": 0.47397662200984375, + "grad_norm": 7.784418106079102, + "learning_rate": 3.674647114063265e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8604557514190674, + "num_tokens": 183956863.0, + "step": 152900 + }, + { + "entropy": 1.9507730916142463, + "epoch": 0.4740076211348935, + "grad_norm": 7.421597480773926, + "learning_rate": 3.674526954125164e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8345337122678756, + "num_tokens": 183968147.0, + "step": 152910 + }, + { + "entropy": 1.9124934807419778, + "epoch": 0.47403862025994314, + "grad_norm": 9.237162590026855, + "learning_rate": 3.674406805973881e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8510349482297898, + "num_tokens": 183978987.0, + "step": 152920 + }, + { + "entropy": 1.9097222343087197, + "epoch": 0.47406961938499287, + "grad_norm": 9.957469940185547, + "learning_rate": 3.6742866696074915e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8503610968589783, + "num_tokens": 183990806.0, + "step": 152930 + }, + { + "entropy": 1.9008977569639682, + "epoch": 0.47410061851004254, + "grad_norm": 8.201800346374512, + "learning_rate": 3.6741665450240667e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8540589138865471, + "num_tokens": 184003428.0, + "step": 152940 + }, + { + "entropy": 1.8850766360759734, + "epoch": 0.47413161763509226, + "grad_norm": 7.431873798370361, + "learning_rate": 3.6740464322216814e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8467616021633149, + "num_tokens": 184015405.0, + "step": 152950 + }, + { + "entropy": 1.9019383952021598, + "epoch": 0.47416261676014193, + "grad_norm": 7.215775012969971, + "learning_rate": 3.67392633119841e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8517497405409813, + "num_tokens": 184027587.0, + "step": 152960 + }, + { + "entropy": 1.905358751118183, + "epoch": 0.47419361588519165, + "grad_norm": 2.930060863494873, + "learning_rate": 3.6738062419523276e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8468658626079559, + "num_tokens": 184039622.0, + "step": 152970 + }, + { + "entropy": 1.8342019632458686, + "epoch": 0.4742246150102413, + "grad_norm": 3.8289215564727783, + "learning_rate": 3.6736861644815084e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8554961785674096, + "num_tokens": 184051541.0, + "step": 152980 + }, + { + "entropy": 1.86045740544796, + "epoch": 0.47425561413529105, + "grad_norm": 7.936079025268555, + "learning_rate": 3.6735660987840305e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8530977353453636, + "num_tokens": 184063826.0, + "step": 152990 + }, + { + "entropy": 1.8976167649030686, + "epoch": 0.4742866132603407, + "grad_norm": 3.9572274684906006, + "learning_rate": 3.6734460448579673e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.858278226852417, + "num_tokens": 184075371.0, + "step": 153000 + }, + { + "entropy": 1.8274334058165551, + "epoch": 0.47431761238539044, + "grad_norm": 6.631383419036865, + "learning_rate": 3.6733260027013985e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8602709263563156, + "num_tokens": 184088890.0, + "step": 153010 + }, + { + "entropy": 1.934623521566391, + "epoch": 0.4743486115104401, + "grad_norm": 8.96616268157959, + "learning_rate": 3.673205972312398e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8473653078079224, + "num_tokens": 184100349.0, + "step": 153020 + }, + { + "entropy": 1.874304661154747, + "epoch": 0.47437961063548983, + "grad_norm": 6.859756946563721, + "learning_rate": 3.6730859536890454e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8729176357388496, + "num_tokens": 184112664.0, + "step": 153030 + }, + { + "entropy": 1.9090553134679795, + "epoch": 0.4744106097605395, + "grad_norm": 6.6315016746521, + "learning_rate": 3.6729659468294182e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8530550181865693, + "num_tokens": 184125100.0, + "step": 153040 + }, + { + "entropy": 1.8723108693957329, + "epoch": 0.4744416088855892, + "grad_norm": 4.024182319641113, + "learning_rate": 3.6728459517315944e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8646663218736649, + "num_tokens": 184136438.0, + "step": 153050 + }, + { + "entropy": 1.8754427291452884, + "epoch": 0.4744726080106389, + "grad_norm": 2.725093364715576, + "learning_rate": 3.672725968393654e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8570066452026367, + "num_tokens": 184148910.0, + "step": 153060 + }, + { + "entropy": 1.897630612552166, + "epoch": 0.4745036071356886, + "grad_norm": 8.470466613769531, + "learning_rate": 3.6726059968136746e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8399512648582459, + "num_tokens": 184160738.0, + "step": 153070 + }, + { + "entropy": 1.8298255681991578, + "epoch": 0.4745346062607383, + "grad_norm": 8.843210220336914, + "learning_rate": 3.672486036989737e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8569268405437469, + "num_tokens": 184173795.0, + "step": 153080 + }, + { + "entropy": 1.855211439728737, + "epoch": 0.47456560538578796, + "grad_norm": 7.2180280685424805, + "learning_rate": 3.6723660889199214e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8601253613829613, + "num_tokens": 184185706.0, + "step": 153090 + }, + { + "entropy": 1.957202786207199, + "epoch": 0.4745966045108377, + "grad_norm": 8.35397720336914, + "learning_rate": 3.672246152602308e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8588167384266854, + "num_tokens": 184196601.0, + "step": 153100 + }, + { + "entropy": 1.8255947291851045, + "epoch": 0.47462760363588735, + "grad_norm": 9.436110496520996, + "learning_rate": 3.6721262280349785e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.85789824873209, + "num_tokens": 184209736.0, + "step": 153110 + }, + { + "entropy": 1.846110972762108, + "epoch": 0.4746586027609371, + "grad_norm": 3.714542865753174, + "learning_rate": 3.6720063152160128e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8676638081669807, + "num_tokens": 184222073.0, + "step": 153120 + }, + { + "entropy": 1.9054560333490371, + "epoch": 0.47468960188598674, + "grad_norm": 9.361035346984863, + "learning_rate": 3.6718864141434946e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8527064174413681, + "num_tokens": 184233663.0, + "step": 153130 + }, + { + "entropy": 1.9491969496011734, + "epoch": 0.47472060101103647, + "grad_norm": 8.116238594055176, + "learning_rate": 3.6717665248155054e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.84459348320961, + "num_tokens": 184244980.0, + "step": 153140 + }, + { + "entropy": 1.9269451722502708, + "epoch": 0.47475160013608614, + "grad_norm": 7.581139087677002, + "learning_rate": 3.6716466472301283e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8586566999554635, + "num_tokens": 184256411.0, + "step": 153150 + }, + { + "entropy": 1.7936486840248107, + "epoch": 0.47478259926113586, + "grad_norm": 3.638094663619995, + "learning_rate": 3.671526781385446e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.866078944504261, + "num_tokens": 184269455.0, + "step": 153160 + }, + { + "entropy": 1.9680861115455628, + "epoch": 0.47481359838618553, + "grad_norm": 8.033863067626953, + "learning_rate": 3.6714069272795433e-06, + "loss": 0.5163, + "mean_token_accuracy": 0.8408958032727242, + "num_tokens": 184280201.0, + "step": 153170 + }, + { + "entropy": 1.8380594596266746, + "epoch": 0.47484459751123526, + "grad_norm": 6.594268321990967, + "learning_rate": 3.6712870849105025e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8671738147735596, + "num_tokens": 184292398.0, + "step": 153180 + }, + { + "entropy": 1.9187800362706184, + "epoch": 0.4748755966362849, + "grad_norm": 9.44068431854248, + "learning_rate": 3.67116725427641e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8464760422706604, + "num_tokens": 184304484.0, + "step": 153190 + }, + { + "entropy": 1.8541857436299325, + "epoch": 0.47490659576133465, + "grad_norm": 8.135321617126465, + "learning_rate": 3.67104743537535e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.853699392080307, + "num_tokens": 184316960.0, + "step": 153200 + }, + { + "entropy": 1.8801586970686912, + "epoch": 0.4749375948863843, + "grad_norm": 9.78518009185791, + "learning_rate": 3.6709276282054077e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.8420796692371368, + "num_tokens": 184328693.0, + "step": 153210 + }, + { + "entropy": 1.8984438449144363, + "epoch": 0.47496859401143404, + "grad_norm": 8.019004821777344, + "learning_rate": 3.6708078327646697e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8611553832888603, + "num_tokens": 184340353.0, + "step": 153220 + }, + { + "entropy": 1.9146203354001046, + "epoch": 0.4749995931364837, + "grad_norm": 9.26950454711914, + "learning_rate": 3.6706880490512205e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8470090657472611, + "num_tokens": 184351989.0, + "step": 153230 + }, + { + "entropy": 1.8650276467204094, + "epoch": 0.47503059226153344, + "grad_norm": 3.9464805126190186, + "learning_rate": 3.670568277063149e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8640954300761223, + "num_tokens": 184364008.0, + "step": 153240 + }, + { + "entropy": 1.8467861473560334, + "epoch": 0.4750615913865831, + "grad_norm": 8.731751441955566, + "learning_rate": 3.6704485167985414e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.855233508348465, + "num_tokens": 184375127.0, + "step": 153250 + }, + { + "entropy": 1.7884536266326905, + "epoch": 0.47509259051163283, + "grad_norm": 7.820191383361816, + "learning_rate": 3.6703287682554855e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8606995210051537, + "num_tokens": 184388469.0, + "step": 153260 + }, + { + "entropy": 1.894975845515728, + "epoch": 0.4751235896366825, + "grad_norm": 8.473472595214844, + "learning_rate": 3.670209031432069e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8551534727215767, + "num_tokens": 184399686.0, + "step": 153270 + }, + { + "entropy": 1.8992106392979622, + "epoch": 0.4751545887617322, + "grad_norm": 7.339175224304199, + "learning_rate": 3.6700893063263798e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8561721444129944, + "num_tokens": 184410820.0, + "step": 153280 + }, + { + "entropy": 1.898959006369114, + "epoch": 0.4751855878867819, + "grad_norm": 3.8873322010040283, + "learning_rate": 3.669969592936509e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8328580185770988, + "num_tokens": 184422656.0, + "step": 153290 + }, + { + "entropy": 1.8946363300085067, + "epoch": 0.4752165870118316, + "grad_norm": 7.692179203033447, + "learning_rate": 3.669849891260544e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8575319543480873, + "num_tokens": 184433675.0, + "step": 153300 + }, + { + "entropy": 1.80959662348032, + "epoch": 0.4752475861368813, + "grad_norm": 8.2183198928833, + "learning_rate": 3.669730201296575e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8613125756382942, + "num_tokens": 184446305.0, + "step": 153310 + }, + { + "entropy": 1.8346762344241143, + "epoch": 0.475278585261931, + "grad_norm": 7.708462715148926, + "learning_rate": 3.6696105230426927e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8413996055722237, + "num_tokens": 184458769.0, + "step": 153320 + }, + { + "entropy": 1.8374806106090547, + "epoch": 0.4753095843869807, + "grad_norm": 8.594371795654297, + "learning_rate": 3.6694908564969873e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8590363129973412, + "num_tokens": 184470713.0, + "step": 153330 + }, + { + "entropy": 1.8095426648855208, + "epoch": 0.47534058351203035, + "grad_norm": 3.7232837677001953, + "learning_rate": 3.6693712016575504e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8638847827911377, + "num_tokens": 184482692.0, + "step": 153340 + }, + { + "entropy": 1.8390261575579643, + "epoch": 0.47537158263708007, + "grad_norm": 6.988356113433838, + "learning_rate": 3.6692515585224724e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8590392619371414, + "num_tokens": 184494926.0, + "step": 153350 + }, + { + "entropy": 1.8419798463582993, + "epoch": 0.47540258176212974, + "grad_norm": 9.946001052856445, + "learning_rate": 3.669131927089847e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8536298364400864, + "num_tokens": 184506522.0, + "step": 153360 + }, + { + "entropy": 1.857440821826458, + "epoch": 0.47543358088717946, + "grad_norm": 9.011540412902832, + "learning_rate": 3.6690123073577653e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8468375623226165, + "num_tokens": 184519247.0, + "step": 153370 + }, + { + "entropy": 1.8762486830353737, + "epoch": 0.47546458001222913, + "grad_norm": 7.706223964691162, + "learning_rate": 3.66889269932432e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8498484030365944, + "num_tokens": 184530480.0, + "step": 153380 + }, + { + "entropy": 1.8929049670696259, + "epoch": 0.47549557913727886, + "grad_norm": 9.091431617736816, + "learning_rate": 3.6687731029876057e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8557457372546196, + "num_tokens": 184541409.0, + "step": 153390 + }, + { + "entropy": 1.8962855875492095, + "epoch": 0.4755265782623285, + "grad_norm": 10.7317476272583, + "learning_rate": 3.6686535183457147e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.8461247265338898, + "num_tokens": 184552652.0, + "step": 153400 + }, + { + "entropy": 1.8995665937662125, + "epoch": 0.47555757738737825, + "grad_norm": 8.745923042297363, + "learning_rate": 3.6685339453967417e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8513082057237625, + "num_tokens": 184563272.0, + "step": 153410 + }, + { + "entropy": 1.728972639143467, + "epoch": 0.4755885765124279, + "grad_norm": 3.664832353591919, + "learning_rate": 3.6684143841387817e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8756014302372932, + "num_tokens": 184577391.0, + "step": 153420 + }, + { + "entropy": 1.8072891741991044, + "epoch": 0.47561957563747764, + "grad_norm": 9.03156852722168, + "learning_rate": 3.6682948345699293e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8632027536630631, + "num_tokens": 184590084.0, + "step": 153430 + }, + { + "entropy": 1.8861136004328727, + "epoch": 0.4756505747625273, + "grad_norm": 7.806605815887451, + "learning_rate": 3.6681752966882795e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8579560875892639, + "num_tokens": 184601954.0, + "step": 153440 + }, + { + "entropy": 1.8768578171730042, + "epoch": 0.47568157388757704, + "grad_norm": 6.651251316070557, + "learning_rate": 3.668055770491929e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8663598403334618, + "num_tokens": 184613113.0, + "step": 153450 + }, + { + "entropy": 1.7853379517793655, + "epoch": 0.4757125730126267, + "grad_norm": 5.01438570022583, + "learning_rate": 3.667936255978974e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8630882278084755, + "num_tokens": 184626166.0, + "step": 153460 + }, + { + "entropy": 1.8284614101052283, + "epoch": 0.47574357213767643, + "grad_norm": 9.973559379577637, + "learning_rate": 3.6678167531475107e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8562781944870949, + "num_tokens": 184637586.0, + "step": 153470 + }, + { + "entropy": 1.7700392067432404, + "epoch": 0.4757745712627261, + "grad_norm": 4.605105876922607, + "learning_rate": 3.667697261995636e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8692534595727921, + "num_tokens": 184650692.0, + "step": 153480 + }, + { + "entropy": 1.9041405349969864, + "epoch": 0.4758055703877758, + "grad_norm": 8.431550025939941, + "learning_rate": 3.6675777825214486e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8535671159625053, + "num_tokens": 184662239.0, + "step": 153490 + }, + { + "entropy": 1.779083289206028, + "epoch": 0.4758365695128255, + "grad_norm": 4.4849748611450195, + "learning_rate": 3.667458314723047e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.853766855597496, + "num_tokens": 184676796.0, + "step": 153500 + }, + { + "entropy": 1.885641473531723, + "epoch": 0.4758675686378752, + "grad_norm": 3.370105266571045, + "learning_rate": 3.667338858598527e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8566616162657738, + "num_tokens": 184688573.0, + "step": 153510 + }, + { + "entropy": 1.9167728781700135, + "epoch": 0.4758985677629249, + "grad_norm": 14.460858345031738, + "learning_rate": 3.667219414145991e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8444859847426415, + "num_tokens": 184699665.0, + "step": 153520 + }, + { + "entropy": 1.807128955423832, + "epoch": 0.4759295668879746, + "grad_norm": 3.8784046173095703, + "learning_rate": 3.6670999813635354e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8568266987800598, + "num_tokens": 184712308.0, + "step": 153530 + }, + { + "entropy": 1.8136287018656732, + "epoch": 0.4759605660130243, + "grad_norm": 9.484222412109375, + "learning_rate": 3.666980560249262e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8740163326263428, + "num_tokens": 184724676.0, + "step": 153540 + }, + { + "entropy": 1.8556558206677436, + "epoch": 0.475991565138074, + "grad_norm": 7.489902496337891, + "learning_rate": 3.66686115080127e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8446243152022361, + "num_tokens": 184737203.0, + "step": 153550 + }, + { + "entropy": 1.9191300675272942, + "epoch": 0.47602256426312367, + "grad_norm": 8.194887161254883, + "learning_rate": 3.6667417530176603e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8443259477615357, + "num_tokens": 184748452.0, + "step": 153560 + }, + { + "entropy": 1.795873185992241, + "epoch": 0.4760535633881734, + "grad_norm": 9.132946968078613, + "learning_rate": 3.6666223668965338e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8649899259209632, + "num_tokens": 184760930.0, + "step": 153570 + }, + { + "entropy": 1.9766324907541275, + "epoch": 0.47608456251322306, + "grad_norm": 8.085390090942383, + "learning_rate": 3.6665029924359922e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8393667578697205, + "num_tokens": 184771872.0, + "step": 153580 + }, + { + "entropy": 1.7679221838712693, + "epoch": 0.47611556163827273, + "grad_norm": 4.475643634796143, + "learning_rate": 3.6663836296341384e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.869743998348713, + "num_tokens": 184785921.0, + "step": 153590 + }, + { + "entropy": 1.840344101190567, + "epoch": 0.47614656076332246, + "grad_norm": 4.371096134185791, + "learning_rate": 3.6662642784890723e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8603167220950126, + "num_tokens": 184797921.0, + "step": 153600 + }, + { + "entropy": 1.7917725771665574, + "epoch": 0.4761775598883721, + "grad_norm": 4.4459686279296875, + "learning_rate": 3.6661449389988997e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8593000710010529, + "num_tokens": 184810881.0, + "step": 153610 + }, + { + "entropy": 1.8440128073096276, + "epoch": 0.47620855901342185, + "grad_norm": 9.678831100463867, + "learning_rate": 3.6660256111617214e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8468298763036728, + "num_tokens": 184823216.0, + "step": 153620 + }, + { + "entropy": 1.9154121845960617, + "epoch": 0.4762395581384715, + "grad_norm": 8.692216873168945, + "learning_rate": 3.6659062949756423e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8561125859618187, + "num_tokens": 184834303.0, + "step": 153630 + }, + { + "entropy": 1.7718016371130942, + "epoch": 0.47627055726352124, + "grad_norm": 7.937458038330078, + "learning_rate": 3.6657869904387666e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8643491208553314, + "num_tokens": 184846836.0, + "step": 153640 + }, + { + "entropy": 1.7790207624435426, + "epoch": 0.4763015563885709, + "grad_norm": 7.390011310577393, + "learning_rate": 3.6656676975491983e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8582101926207543, + "num_tokens": 184861251.0, + "step": 153650 + }, + { + "entropy": 1.8293533489108085, + "epoch": 0.47633255551362064, + "grad_norm": 8.291117668151855, + "learning_rate": 3.6655484163050426e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8656255662441253, + "num_tokens": 184873707.0, + "step": 153660 + }, + { + "entropy": 1.803611083328724, + "epoch": 0.4763635546386703, + "grad_norm": 3.697669267654419, + "learning_rate": 3.6654291467044046e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8656400561332702, + "num_tokens": 184886963.0, + "step": 153670 + }, + { + "entropy": 1.8712692141532898, + "epoch": 0.47639455376372003, + "grad_norm": 8.776789665222168, + "learning_rate": 3.6653098887453913e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8448523849248886, + "num_tokens": 184898369.0, + "step": 153680 + }, + { + "entropy": 1.781012015044689, + "epoch": 0.4764255528887697, + "grad_norm": 7.237161636352539, + "learning_rate": 3.665190642426108e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8646363571286202, + "num_tokens": 184910887.0, + "step": 153690 + }, + { + "entropy": 1.7416659876704217, + "epoch": 0.4764565520138194, + "grad_norm": 8.15397834777832, + "learning_rate": 3.6650714077446614e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8684957414865494, + "num_tokens": 184924099.0, + "step": 153700 + }, + { + "entropy": 1.8524023115634918, + "epoch": 0.4764875511388691, + "grad_norm": 8.428122520446777, + "learning_rate": 3.664952184699158e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.84611546844244, + "num_tokens": 184936005.0, + "step": 153710 + }, + { + "entropy": 1.8617612093687057, + "epoch": 0.4765185502639188, + "grad_norm": 7.877058982849121, + "learning_rate": 3.664832973287707e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8690141126513481, + "num_tokens": 184948169.0, + "step": 153720 + }, + { + "entropy": 1.74190554022789, + "epoch": 0.4765495493889685, + "grad_norm": 3.756366491317749, + "learning_rate": 3.6647137735084156e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8677965849637985, + "num_tokens": 184961545.0, + "step": 153730 + }, + { + "entropy": 1.841639220714569, + "epoch": 0.4765805485140182, + "grad_norm": 6.832626819610596, + "learning_rate": 3.6645945853593916e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8673912405967712, + "num_tokens": 184973489.0, + "step": 153740 + }, + { + "entropy": 1.9268560737371445, + "epoch": 0.4766115476390679, + "grad_norm": 7.91566801071167, + "learning_rate": 3.6644754088387447e-06, + "loss": 0.5311, + "mean_token_accuracy": 0.8405150607228279, + "num_tokens": 184984295.0, + "step": 153750 + }, + { + "entropy": 1.8099448829889297, + "epoch": 0.4766425467641176, + "grad_norm": 7.360180854797363, + "learning_rate": 3.664356243944584e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8619012326002121, + "num_tokens": 184997460.0, + "step": 153760 + }, + { + "entropy": 1.9135849446058273, + "epoch": 0.47667354588916727, + "grad_norm": 7.397768020629883, + "learning_rate": 3.6642370906750193e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8504505261778832, + "num_tokens": 185007719.0, + "step": 153770 + }, + { + "entropy": 1.9261243373155594, + "epoch": 0.476704545014217, + "grad_norm": 6.789693832397461, + "learning_rate": 3.6641179490281596e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8460416331887245, + "num_tokens": 185018707.0, + "step": 153780 + }, + { + "entropy": 1.8728522315621376, + "epoch": 0.47673554413926666, + "grad_norm": 3.7547261714935303, + "learning_rate": 3.6639988190021176e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8428987547755241, + "num_tokens": 185030141.0, + "step": 153790 + }, + { + "entropy": 1.8588514044880866, + "epoch": 0.4767665432643164, + "grad_norm": 3.039729356765747, + "learning_rate": 3.6638797005950024e-06, + "loss": 0.406, + "mean_token_accuracy": 0.858186562359333, + "num_tokens": 185042499.0, + "step": 153800 + }, + { + "entropy": 1.8474113151431084, + "epoch": 0.47679754238936606, + "grad_norm": 8.664191246032715, + "learning_rate": 3.6637605938049266e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8646103963255882, + "num_tokens": 185054646.0, + "step": 153810 + }, + { + "entropy": 1.7442696556448936, + "epoch": 0.4768285415144158, + "grad_norm": 8.061847686767578, + "learning_rate": 3.663641498630001e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8688895791769028, + "num_tokens": 185067799.0, + "step": 153820 + }, + { + "entropy": 1.9126453652977944, + "epoch": 0.47685954063946545, + "grad_norm": 7.419454097747803, + "learning_rate": 3.663522415068339e-06, + "loss": 0.5575, + "mean_token_accuracy": 0.8450031682848931, + "num_tokens": 185079637.0, + "step": 153830 + }, + { + "entropy": 1.841584986448288, + "epoch": 0.4768905397645151, + "grad_norm": 7.337125778198242, + "learning_rate": 3.6634033431180534e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8534199371933937, + "num_tokens": 185091402.0, + "step": 153840 + }, + { + "entropy": 1.8615821033716202, + "epoch": 0.47692153888956484, + "grad_norm": 8.566046714782715, + "learning_rate": 3.663284282777256e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8558877721428871, + "num_tokens": 185102679.0, + "step": 153850 + }, + { + "entropy": 1.9264022588729859, + "epoch": 0.4769525380146145, + "grad_norm": 9.237041473388672, + "learning_rate": 3.6631652340440625e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8412112295627594, + "num_tokens": 185113641.0, + "step": 153860 + }, + { + "entropy": 1.8670520842075349, + "epoch": 0.47698353713966424, + "grad_norm": 8.40865707397461, + "learning_rate": 3.6630461969165847e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8481194913387299, + "num_tokens": 185125200.0, + "step": 153870 + }, + { + "entropy": 1.9037254452705383, + "epoch": 0.4770145362647139, + "grad_norm": 8.676751136779785, + "learning_rate": 3.662927171392938e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8378460854291916, + "num_tokens": 185136508.0, + "step": 153880 + }, + { + "entropy": 1.8699619323015213, + "epoch": 0.47704553538976363, + "grad_norm": 4.6836323738098145, + "learning_rate": 3.6628081574712363e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8679074257612228, + "num_tokens": 185148833.0, + "step": 153890 + }, + { + "entropy": 1.8572426453232764, + "epoch": 0.4770765345148133, + "grad_norm": 9.239683151245117, + "learning_rate": 3.662689155149597e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8490281060338021, + "num_tokens": 185161322.0, + "step": 153900 + }, + { + "entropy": 1.8752911046147347, + "epoch": 0.477107533639863, + "grad_norm": 8.707393646240234, + "learning_rate": 3.662570164426135e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8525170043110848, + "num_tokens": 185173769.0, + "step": 153910 + }, + { + "entropy": 1.8172207593917846, + "epoch": 0.4771385327649127, + "grad_norm": 8.489155769348145, + "learning_rate": 3.6624511852989657e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.858148242533207, + "num_tokens": 185187124.0, + "step": 153920 + }, + { + "entropy": 1.8730401009321214, + "epoch": 0.4771695318899624, + "grad_norm": 9.239094734191895, + "learning_rate": 3.6623322177662056e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8503327935934066, + "num_tokens": 185199593.0, + "step": 153930 + }, + { + "entropy": 1.8695446357131005, + "epoch": 0.4772005310150121, + "grad_norm": 8.47158432006836, + "learning_rate": 3.662213261825973e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8578305006027221, + "num_tokens": 185212365.0, + "step": 153940 + }, + { + "entropy": 1.8985270693898202, + "epoch": 0.4772315301400618, + "grad_norm": 7.378330230712891, + "learning_rate": 3.6620943174763845e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8512399345636368, + "num_tokens": 185223754.0, + "step": 153950 + }, + { + "entropy": 1.8317578330636024, + "epoch": 0.4772625292651115, + "grad_norm": 8.441747665405273, + "learning_rate": 3.661975384715558e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8525681480765342, + "num_tokens": 185237068.0, + "step": 153960 + }, + { + "entropy": 1.92967329621315, + "epoch": 0.4772935283901612, + "grad_norm": 7.202940940856934, + "learning_rate": 3.6618564635416117e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8535278737545013, + "num_tokens": 185247787.0, + "step": 153970 + }, + { + "entropy": 1.8548561289906502, + "epoch": 0.4773245275152109, + "grad_norm": 8.233925819396973, + "learning_rate": 3.661737553952664e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8560821250081062, + "num_tokens": 185260706.0, + "step": 153980 + }, + { + "entropy": 1.928127257525921, + "epoch": 0.4773555266402606, + "grad_norm": 8.103163719177246, + "learning_rate": 3.661618655946835e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8430070072412491, + "num_tokens": 185272523.0, + "step": 153990 + }, + { + "entropy": 1.8638317108154296, + "epoch": 0.47738652576531027, + "grad_norm": 7.030045032501221, + "learning_rate": 3.6614997695222444e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8599402293562889, + "num_tokens": 185284942.0, + "step": 154000 + }, + { + "entropy": 1.8330623269081117, + "epoch": 0.47741752489036, + "grad_norm": 7.989560127258301, + "learning_rate": 3.6613808946770103e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8641909688711167, + "num_tokens": 185297882.0, + "step": 154010 + }, + { + "entropy": 1.8085048258304597, + "epoch": 0.47744852401540966, + "grad_norm": 7.296682834625244, + "learning_rate": 3.6612620314092554e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8560028299689293, + "num_tokens": 185310312.0, + "step": 154020 + }, + { + "entropy": 1.9709538519382477, + "epoch": 0.4774795231404594, + "grad_norm": 7.870602130889893, + "learning_rate": 3.6611431797170994e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8472713023424149, + "num_tokens": 185321671.0, + "step": 154030 + }, + { + "entropy": 1.9173092991113663, + "epoch": 0.47751052226550905, + "grad_norm": 9.225147247314453, + "learning_rate": 3.661024339598664e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8469387844204903, + "num_tokens": 185333128.0, + "step": 154040 + }, + { + "entropy": 1.9338498145341874, + "epoch": 0.4775415213905588, + "grad_norm": 8.254136085510254, + "learning_rate": 3.6609055110520702e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8532148212194443, + "num_tokens": 185344879.0, + "step": 154050 + }, + { + "entropy": 1.7882016241550445, + "epoch": 0.47757252051560845, + "grad_norm": 2.6473770141601562, + "learning_rate": 3.660786694075441e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8609866589307785, + "num_tokens": 185357701.0, + "step": 154060 + }, + { + "entropy": 1.87758369743824, + "epoch": 0.47760351964065817, + "grad_norm": 8.510712623596191, + "learning_rate": 3.660667888666899e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8468722432851792, + "num_tokens": 185369849.0, + "step": 154070 + }, + { + "entropy": 1.744155490398407, + "epoch": 0.47763451876570784, + "grad_norm": 9.347501754760742, + "learning_rate": 3.6605490948245658e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8634230136871338, + "num_tokens": 185383168.0, + "step": 154080 + }, + { + "entropy": 1.838800160586834, + "epoch": 0.4776655178907575, + "grad_norm": 4.383167743682861, + "learning_rate": 3.6604303125465666e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8602541759610176, + "num_tokens": 185395934.0, + "step": 154090 + }, + { + "entropy": 1.8426039129495622, + "epoch": 0.47769651701580723, + "grad_norm": 9.018194198608398, + "learning_rate": 3.6603115418310246e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.855328157544136, + "num_tokens": 185408078.0, + "step": 154100 + }, + { + "entropy": 1.811622078716755, + "epoch": 0.4777275161408569, + "grad_norm": 4.057611465454102, + "learning_rate": 3.6601927826760636e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8523658573627472, + "num_tokens": 185421402.0, + "step": 154110 + }, + { + "entropy": 1.7786233231425286, + "epoch": 0.4777585152659066, + "grad_norm": 3.694188356399536, + "learning_rate": 3.660074035079809e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8725257933139801, + "num_tokens": 185434169.0, + "step": 154120 + }, + { + "entropy": 1.873915046453476, + "epoch": 0.4777895143909563, + "grad_norm": 9.391448020935059, + "learning_rate": 3.659955299040385e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8509207114577293, + "num_tokens": 185445738.0, + "step": 154130 + }, + { + "entropy": 1.930639487504959, + "epoch": 0.477820513516006, + "grad_norm": 10.155179977416992, + "learning_rate": 3.6598365745559187e-06, + "loss": 0.5343, + "mean_token_accuracy": 0.8412887364625931, + "num_tokens": 185456538.0, + "step": 154140 + }, + { + "entropy": 1.9113746494054795, + "epoch": 0.4778515126410557, + "grad_norm": 8.49729061126709, + "learning_rate": 3.6597178616245345e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8552822053432465, + "num_tokens": 185468039.0, + "step": 154150 + }, + { + "entropy": 1.9362047135829925, + "epoch": 0.4778825117661054, + "grad_norm": 7.92914342880249, + "learning_rate": 3.6595991602443593e-06, + "loss": 0.477, + "mean_token_accuracy": 0.847518865764141, + "num_tokens": 185478541.0, + "step": 154160 + }, + { + "entropy": 1.8837508246302606, + "epoch": 0.4779135108911551, + "grad_norm": 8.445967674255371, + "learning_rate": 3.6594804704135206e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8552197054028511, + "num_tokens": 185489982.0, + "step": 154170 + }, + { + "entropy": 1.8416315570473671, + "epoch": 0.4779445100162048, + "grad_norm": 8.454257011413574, + "learning_rate": 3.659361792130145e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.859393647313118, + "num_tokens": 185502258.0, + "step": 154180 + }, + { + "entropy": 1.9041074529290198, + "epoch": 0.4779755091412545, + "grad_norm": 3.8692028522491455, + "learning_rate": 3.6592431253923597e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8625993952155113, + "num_tokens": 185513826.0, + "step": 154190 + }, + { + "entropy": 1.8795275837182999, + "epoch": 0.4780065082663042, + "grad_norm": 8.003114700317383, + "learning_rate": 3.659124470198294e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8551646754145622, + "num_tokens": 185525636.0, + "step": 154200 + }, + { + "entropy": 1.9021770521998405, + "epoch": 0.47803750739135387, + "grad_norm": 9.979130744934082, + "learning_rate": 3.659005826546076e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8435526043176651, + "num_tokens": 185537405.0, + "step": 154210 + }, + { + "entropy": 1.9014919973909854, + "epoch": 0.4780685065164036, + "grad_norm": 8.349839210510254, + "learning_rate": 3.6588871944338343e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8566196665167809, + "num_tokens": 185549006.0, + "step": 154220 + }, + { + "entropy": 1.928890497982502, + "epoch": 0.47809950564145326, + "grad_norm": 8.71741008758545, + "learning_rate": 3.6587685738596985e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8444687411189079, + "num_tokens": 185560063.0, + "step": 154230 + }, + { + "entropy": 1.8423243664205073, + "epoch": 0.478130504766503, + "grad_norm": 8.268646240234375, + "learning_rate": 3.6586499648217977e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8484700009226799, + "num_tokens": 185573162.0, + "step": 154240 + }, + { + "entropy": 1.9101153433322906, + "epoch": 0.47816150389155265, + "grad_norm": 10.118322372436523, + "learning_rate": 3.658531367318264e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8470887467265129, + "num_tokens": 185584820.0, + "step": 154250 + }, + { + "entropy": 1.9050511002540589, + "epoch": 0.4781925030166024, + "grad_norm": 7.960333824157715, + "learning_rate": 3.658412781347225e-06, + "loss": 0.5336, + "mean_token_accuracy": 0.8414354234933853, + "num_tokens": 185596298.0, + "step": 154260 + }, + { + "entropy": 1.8809094280004501, + "epoch": 0.47822350214165205, + "grad_norm": 4.4286885261535645, + "learning_rate": 3.6582942069068154e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8507549971342087, + "num_tokens": 185607751.0, + "step": 154270 + }, + { + "entropy": 1.7869101524353028, + "epoch": 0.47825450126670177, + "grad_norm": 7.874044418334961, + "learning_rate": 3.6581756439951644e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.862111645936966, + "num_tokens": 185620912.0, + "step": 154280 + }, + { + "entropy": 1.9244977489113808, + "epoch": 0.47828550039175144, + "grad_norm": 9.23994255065918, + "learning_rate": 3.658057092610404e-06, + "loss": 0.5345, + "mean_token_accuracy": 0.8415355905890465, + "num_tokens": 185632155.0, + "step": 154290 + }, + { + "entropy": 1.9335034400224687, + "epoch": 0.47831649951680116, + "grad_norm": 7.755003452301025, + "learning_rate": 3.657938552750668e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.8392974704504013, + "num_tokens": 185642986.0, + "step": 154300 + }, + { + "entropy": 1.9139926508069038, + "epoch": 0.47834749864185083, + "grad_norm": 8.397530555725098, + "learning_rate": 3.657820024414088e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8501160070300102, + "num_tokens": 185654713.0, + "step": 154310 + }, + { + "entropy": 1.8200761735439301, + "epoch": 0.47837849776690056, + "grad_norm": 8.106402397155762, + "learning_rate": 3.6577015075987958e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8671575516462326, + "num_tokens": 185667339.0, + "step": 154320 + }, + { + "entropy": 1.9082677319645882, + "epoch": 0.4784094968919502, + "grad_norm": 3.1612582206726074, + "learning_rate": 3.6575830023029285e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8523703336715698, + "num_tokens": 185678791.0, + "step": 154330 + }, + { + "entropy": 1.8255998715758324, + "epoch": 0.4784404960169999, + "grad_norm": 7.894652366638184, + "learning_rate": 3.6574645085246168e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8676185742020607, + "num_tokens": 185691476.0, + "step": 154340 + }, + { + "entropy": 1.894423645734787, + "epoch": 0.4784714951420496, + "grad_norm": 9.043898582458496, + "learning_rate": 3.6573460262619975e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8409060701727867, + "num_tokens": 185703246.0, + "step": 154350 + }, + { + "entropy": 1.9331441923975945, + "epoch": 0.4785024942670993, + "grad_norm": 9.189416885375977, + "learning_rate": 3.657227555513204e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8522034257650375, + "num_tokens": 185714266.0, + "step": 154360 + }, + { + "entropy": 1.8812373384833336, + "epoch": 0.478533493392149, + "grad_norm": 7.222550392150879, + "learning_rate": 3.657109096276372e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8448280677199363, + "num_tokens": 185726369.0, + "step": 154370 + }, + { + "entropy": 1.9059947580099106, + "epoch": 0.4785644925171987, + "grad_norm": 6.528245449066162, + "learning_rate": 3.6569906485496383e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8697101891040802, + "num_tokens": 185737440.0, + "step": 154380 + }, + { + "entropy": 1.9224574849009515, + "epoch": 0.4785954916422484, + "grad_norm": 10.537102699279785, + "learning_rate": 3.656872212331138e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.838177040219307, + "num_tokens": 185748538.0, + "step": 154390 + }, + { + "entropy": 1.894202496111393, + "epoch": 0.4786264907672981, + "grad_norm": 7.70367431640625, + "learning_rate": 3.6567537876190075e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8510572746396065, + "num_tokens": 185759523.0, + "step": 154400 + }, + { + "entropy": 1.8672723740339279, + "epoch": 0.4786574898923478, + "grad_norm": 6.750048637390137, + "learning_rate": 3.6566353744113836e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.857047687470913, + "num_tokens": 185772469.0, + "step": 154410 + }, + { + "entropy": 1.915586268901825, + "epoch": 0.47868848901739747, + "grad_norm": 8.35063648223877, + "learning_rate": 3.6565169727064054e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8574128776788712, + "num_tokens": 185784266.0, + "step": 154420 + }, + { + "entropy": 1.9031167924404144, + "epoch": 0.4787194881424472, + "grad_norm": 8.222635269165039, + "learning_rate": 3.656398582502209e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8537877663969994, + "num_tokens": 185795413.0, + "step": 154430 + }, + { + "entropy": 1.840549720823765, + "epoch": 0.47875048726749686, + "grad_norm": 7.137433052062988, + "learning_rate": 3.656280203796933e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8579451397061348, + "num_tokens": 185807798.0, + "step": 154440 + }, + { + "entropy": 1.861526158452034, + "epoch": 0.4787814863925466, + "grad_norm": 4.58693265914917, + "learning_rate": 3.656161836588717e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8540133357048034, + "num_tokens": 185820505.0, + "step": 154450 + }, + { + "entropy": 1.900732010602951, + "epoch": 0.47881248551759625, + "grad_norm": 3.4613146781921387, + "learning_rate": 3.6560434808756995e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8573839977383614, + "num_tokens": 185832232.0, + "step": 154460 + }, + { + "entropy": 1.8618409425020217, + "epoch": 0.478843484642646, + "grad_norm": 6.595524787902832, + "learning_rate": 3.6559251366560195e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8640586957335472, + "num_tokens": 185843445.0, + "step": 154470 + }, + { + "entropy": 1.9100439898669719, + "epoch": 0.47887448376769565, + "grad_norm": 9.08141040802002, + "learning_rate": 3.655806803927818e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8536060154438019, + "num_tokens": 185855042.0, + "step": 154480 + }, + { + "entropy": 1.8713151529431342, + "epoch": 0.47890548289274537, + "grad_norm": 3.9721601009368896, + "learning_rate": 3.655688482689234e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8604184344410897, + "num_tokens": 185866727.0, + "step": 154490 + }, + { + "entropy": 1.853001520037651, + "epoch": 0.47893648201779504, + "grad_norm": 4.289365768432617, + "learning_rate": 3.6555701729384096e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8596016809344291, + "num_tokens": 185878802.0, + "step": 154500 + }, + { + "entropy": 1.9377120748162269, + "epoch": 0.47896748114284476, + "grad_norm": 8.828749656677246, + "learning_rate": 3.6554518746734857e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8434145227074623, + "num_tokens": 185889992.0, + "step": 154510 + }, + { + "entropy": 1.8815474480390548, + "epoch": 0.47899848026789443, + "grad_norm": 7.912415027618408, + "learning_rate": 3.6553335878926037e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8570791929960251, + "num_tokens": 185901345.0, + "step": 154520 + }, + { + "entropy": 1.8100280150771142, + "epoch": 0.47902947939294416, + "grad_norm": 3.6402947902679443, + "learning_rate": 3.655215312593906e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8516449317336082, + "num_tokens": 185914045.0, + "step": 154530 + }, + { + "entropy": 1.905594563484192, + "epoch": 0.4790604785179938, + "grad_norm": 8.659533500671387, + "learning_rate": 3.6550970487755343e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8463260412216187, + "num_tokens": 185925597.0, + "step": 154540 + }, + { + "entropy": 1.9034359157085419, + "epoch": 0.47909147764304355, + "grad_norm": 7.604994297027588, + "learning_rate": 3.6549787964356328e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8514864310622215, + "num_tokens": 185936927.0, + "step": 154550 + }, + { + "entropy": 1.8217870756983756, + "epoch": 0.4791224767680932, + "grad_norm": 8.547018051147461, + "learning_rate": 3.6548605555723437e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8532523363828659, + "num_tokens": 185949713.0, + "step": 154560 + }, + { + "entropy": 1.86880983710289, + "epoch": 0.4791534758931429, + "grad_norm": 8.985418319702148, + "learning_rate": 3.6547423261838103e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8528073132038116, + "num_tokens": 185961110.0, + "step": 154570 + }, + { + "entropy": 1.873310026526451, + "epoch": 0.4791844750181926, + "grad_norm": 4.159888744354248, + "learning_rate": 3.654624108268179e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.856771194934845, + "num_tokens": 185972911.0, + "step": 154580 + }, + { + "entropy": 1.809093876183033, + "epoch": 0.4792154741432423, + "grad_norm": 4.302996635437012, + "learning_rate": 3.6545059018235918e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8638216108083725, + "num_tokens": 185985591.0, + "step": 154590 + }, + { + "entropy": 1.8471155345439911, + "epoch": 0.479246473268292, + "grad_norm": 7.965666770935059, + "learning_rate": 3.654387706848195e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8619759008288383, + "num_tokens": 185998248.0, + "step": 154600 + }, + { + "entropy": 1.7570737972855568, + "epoch": 0.4792774723933417, + "grad_norm": 5.8140764236450195, + "learning_rate": 3.654269523340134e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8703777492046356, + "num_tokens": 186011817.0, + "step": 154610 + }, + { + "entropy": 1.7383157536387444, + "epoch": 0.4793084715183914, + "grad_norm": 3.960681200027466, + "learning_rate": 3.654151351297555e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8690917834639549, + "num_tokens": 186025532.0, + "step": 154620 + }, + { + "entropy": 1.9056962199509144, + "epoch": 0.47933947064344107, + "grad_norm": 6.778663158416748, + "learning_rate": 3.6540331907186033e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8545536309480667, + "num_tokens": 186037066.0, + "step": 154630 + }, + { + "entropy": 1.9246233195066451, + "epoch": 0.4793704697684908, + "grad_norm": 8.319918632507324, + "learning_rate": 3.653915041601426e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8444161772727966, + "num_tokens": 186047654.0, + "step": 154640 + }, + { + "entropy": 1.9099677562713624, + "epoch": 0.47940146889354046, + "grad_norm": 3.9827780723571777, + "learning_rate": 3.6537969039441702e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8582230001688004, + "num_tokens": 186059198.0, + "step": 154650 + }, + { + "entropy": 1.8121528401970863, + "epoch": 0.4794324680185902, + "grad_norm": 7.723804950714111, + "learning_rate": 3.653678777744984e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8730279862880707, + "num_tokens": 186072057.0, + "step": 154660 + }, + { + "entropy": 1.9339038461446763, + "epoch": 0.47946346714363985, + "grad_norm": 7.433446884155273, + "learning_rate": 3.6535606630020144e-06, + "loss": 0.519, + "mean_token_accuracy": 0.8461462393403053, + "num_tokens": 186083085.0, + "step": 154670 + }, + { + "entropy": 1.8623194843530655, + "epoch": 0.4794944662686896, + "grad_norm": 7.5080461502075195, + "learning_rate": 3.65344255971341e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8542502373456955, + "num_tokens": 186095411.0, + "step": 154680 + }, + { + "entropy": 1.8670690104365348, + "epoch": 0.47952546539373925, + "grad_norm": 5.185728073120117, + "learning_rate": 3.65332446787732e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8545142263174057, + "num_tokens": 186107457.0, + "step": 154690 + }, + { + "entropy": 1.907190305739641, + "epoch": 0.47955646451878897, + "grad_norm": 7.39229679107666, + "learning_rate": 3.6532063874918936e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8454447597265243, + "num_tokens": 186119514.0, + "step": 154700 + }, + { + "entropy": 1.809042975306511, + "epoch": 0.47958746364383864, + "grad_norm": 6.491355895996094, + "learning_rate": 3.65308831855528e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8723908111453056, + "num_tokens": 186132317.0, + "step": 154710 + }, + { + "entropy": 1.9088991969823836, + "epoch": 0.47961846276888837, + "grad_norm": 8.895363807678223, + "learning_rate": 3.6529702610656294e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8540124624967576, + "num_tokens": 186143250.0, + "step": 154720 + }, + { + "entropy": 1.8378096505999566, + "epoch": 0.47964946189393803, + "grad_norm": 5.12782096862793, + "learning_rate": 3.652852215021092e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8618904039263725, + "num_tokens": 186154946.0, + "step": 154730 + }, + { + "entropy": 1.8013930171728134, + "epoch": 0.47968046101898776, + "grad_norm": 8.057185173034668, + "learning_rate": 3.6527341804198193e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8602392926812172, + "num_tokens": 186167660.0, + "step": 154740 + }, + { + "entropy": 1.8815608084201814, + "epoch": 0.4797114601440374, + "grad_norm": 8.834275245666504, + "learning_rate": 3.6526161572599616e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8500603690743447, + "num_tokens": 186178956.0, + "step": 154750 + }, + { + "entropy": 1.9094288021326065, + "epoch": 0.47974245926908715, + "grad_norm": 8.339888572692871, + "learning_rate": 3.6524981455396715e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.8476811647415161, + "num_tokens": 186190800.0, + "step": 154760 + }, + { + "entropy": 1.8156311631202697, + "epoch": 0.4797734583941368, + "grad_norm": 3.8487701416015625, + "learning_rate": 3.6523801452571006e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8534475967288018, + "num_tokens": 186203466.0, + "step": 154770 + }, + { + "entropy": 1.8079561397433281, + "epoch": 0.47980445751918654, + "grad_norm": 3.5231499671936035, + "learning_rate": 3.652262156410402e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8429020419716835, + "num_tokens": 186215727.0, + "step": 154780 + }, + { + "entropy": 1.7731987565755845, + "epoch": 0.4798354566442362, + "grad_norm": 8.213481903076172, + "learning_rate": 3.6521441789977287e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.868919475376606, + "num_tokens": 186228292.0, + "step": 154790 + }, + { + "entropy": 1.8571752443909646, + "epoch": 0.47986645576928594, + "grad_norm": 4.771162033081055, + "learning_rate": 3.6520262130172334e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8500853896141052, + "num_tokens": 186239711.0, + "step": 154800 + }, + { + "entropy": 1.8680015295743941, + "epoch": 0.4798974548943356, + "grad_norm": 9.111491203308105, + "learning_rate": 3.6519082584670694e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8505641296505928, + "num_tokens": 186251539.0, + "step": 154810 + }, + { + "entropy": 1.7599168375134469, + "epoch": 0.4799284540193853, + "grad_norm": 7.46876335144043, + "learning_rate": 3.6517903153453934e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8639022320508957, + "num_tokens": 186265063.0, + "step": 154820 + }, + { + "entropy": 1.8380054078996182, + "epoch": 0.479959453144435, + "grad_norm": 2.27538800239563, + "learning_rate": 3.651672383650357e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8478976964950562, + "num_tokens": 186277300.0, + "step": 154830 + }, + { + "entropy": 1.91184870749712, + "epoch": 0.47999045226948467, + "grad_norm": 4.427165985107422, + "learning_rate": 3.651554463380117e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8442239791154862, + "num_tokens": 186288782.0, + "step": 154840 + }, + { + "entropy": 1.7352715209126472, + "epoch": 0.4800214513945344, + "grad_norm": 4.620881080627441, + "learning_rate": 3.6514365545328283e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8682469353079796, + "num_tokens": 186302145.0, + "step": 154850 + }, + { + "entropy": 1.8953940749168396, + "epoch": 0.48005245051958406, + "grad_norm": 8.379203796386719, + "learning_rate": 3.6513186571066473e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.851770706474781, + "num_tokens": 186313454.0, + "step": 154860 + }, + { + "entropy": 1.8896978572010994, + "epoch": 0.4800834496446338, + "grad_norm": 9.247343063354492, + "learning_rate": 3.6512007710997295e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8484896406531334, + "num_tokens": 186325023.0, + "step": 154870 + }, + { + "entropy": 1.900222858786583, + "epoch": 0.48011444876968346, + "grad_norm": 9.006083488464355, + "learning_rate": 3.6510828965102326e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8547819867730141, + "num_tokens": 186336470.0, + "step": 154880 + }, + { + "entropy": 1.805259671807289, + "epoch": 0.4801454478947332, + "grad_norm": 7.851848125457764, + "learning_rate": 3.6509650333363135e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8576851665973664, + "num_tokens": 186348769.0, + "step": 154890 + }, + { + "entropy": 1.9134155437350273, + "epoch": 0.48017644701978285, + "grad_norm": 8.607007026672363, + "learning_rate": 3.6508471815761283e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.845287999510765, + "num_tokens": 186359818.0, + "step": 154900 + }, + { + "entropy": 1.8226548954844475, + "epoch": 0.4802074461448326, + "grad_norm": 7.391361236572266, + "learning_rate": 3.6507293412278367e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8539162278175354, + "num_tokens": 186372552.0, + "step": 154910 + }, + { + "entropy": 1.8565132528543473, + "epoch": 0.48023844526988224, + "grad_norm": 8.47097110748291, + "learning_rate": 3.650611512289597e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8450149267911911, + "num_tokens": 186384399.0, + "step": 154920 + }, + { + "entropy": 1.8987506821751594, + "epoch": 0.48026944439493197, + "grad_norm": 9.051301002502441, + "learning_rate": 3.6504936947595664e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8581790804862977, + "num_tokens": 186395705.0, + "step": 154930 + }, + { + "entropy": 1.7860086098313332, + "epoch": 0.48030044351998163, + "grad_norm": 7.646950721740723, + "learning_rate": 3.6503758886359053e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8590388312935829, + "num_tokens": 186408158.0, + "step": 154940 + }, + { + "entropy": 1.835461364686489, + "epoch": 0.48033144264503136, + "grad_norm": 6.986077308654785, + "learning_rate": 3.650258093916774e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8536029830574989, + "num_tokens": 186420083.0, + "step": 154950 + }, + { + "entropy": 1.7754562705755235, + "epoch": 0.48036244177008103, + "grad_norm": 7.720706462860107, + "learning_rate": 3.6501403106003315e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8717953890562058, + "num_tokens": 186432804.0, + "step": 154960 + }, + { + "entropy": 1.8868480697274208, + "epoch": 0.48039344089513075, + "grad_norm": 8.321026802062988, + "learning_rate": 3.650022538684739e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8510920956730843, + "num_tokens": 186444348.0, + "step": 154970 + }, + { + "entropy": 1.8335092812776566, + "epoch": 0.4804244400201804, + "grad_norm": 7.6114325523376465, + "learning_rate": 3.6499047781681557e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8667181313037873, + "num_tokens": 186456211.0, + "step": 154980 + }, + { + "entropy": 1.83046106249094, + "epoch": 0.48045543914523015, + "grad_norm": 7.575179576873779, + "learning_rate": 3.649787029048745e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8602567866444588, + "num_tokens": 186468895.0, + "step": 154990 + }, + { + "entropy": 1.7130945861339568, + "epoch": 0.4804864382702798, + "grad_norm": 2.742889404296875, + "learning_rate": 3.6496692913246674e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.876962573826313, + "num_tokens": 186482798.0, + "step": 155000 + }, + { + "entropy": 1.9094618245959283, + "epoch": 0.48051743739532954, + "grad_norm": 8.246910095214844, + "learning_rate": 3.649551564994085e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8591118276119232, + "num_tokens": 186494431.0, + "step": 155010 + }, + { + "entropy": 1.8281339153647422, + "epoch": 0.4805484365203792, + "grad_norm": 9.279541969299316, + "learning_rate": 3.6494338500551612e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8599943906068802, + "num_tokens": 186506615.0, + "step": 155020 + }, + { + "entropy": 1.7933892510831355, + "epoch": 0.48057943564542893, + "grad_norm": 7.063535690307617, + "learning_rate": 3.6493161465060584e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8706391632556916, + "num_tokens": 186520167.0, + "step": 155030 + }, + { + "entropy": 1.8618095114827156, + "epoch": 0.4806104347704786, + "grad_norm": 8.329252243041992, + "learning_rate": 3.649198454344939e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.851315900683403, + "num_tokens": 186531387.0, + "step": 155040 + }, + { + "entropy": 1.9018240422010422, + "epoch": 0.4806414338955283, + "grad_norm": 7.958313941955566, + "learning_rate": 3.649080773569969e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8383594647049903, + "num_tokens": 186543471.0, + "step": 155050 + }, + { + "entropy": 1.8148971542716026, + "epoch": 0.480672433020578, + "grad_norm": 8.183568954467773, + "learning_rate": 3.648963104179311e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8573694407939911, + "num_tokens": 186555739.0, + "step": 155060 + }, + { + "entropy": 1.898486042022705, + "epoch": 0.48070343214562766, + "grad_norm": 6.402829170227051, + "learning_rate": 3.648845446171129e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8548410072922706, + "num_tokens": 186566808.0, + "step": 155070 + }, + { + "entropy": 1.853443591296673, + "epoch": 0.4807344312706774, + "grad_norm": 9.244396209716797, + "learning_rate": 3.6487277995435906e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8503358826041222, + "num_tokens": 186579310.0, + "step": 155080 + }, + { + "entropy": 1.8664480939507484, + "epoch": 0.48076543039572706, + "grad_norm": 4.412349700927734, + "learning_rate": 3.6486101642948586e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8586249127984047, + "num_tokens": 186591176.0, + "step": 155090 + }, + { + "entropy": 1.8777339145541192, + "epoch": 0.4807964295207768, + "grad_norm": 6.988900184631348, + "learning_rate": 3.6484925404230997e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.863787741959095, + "num_tokens": 186602357.0, + "step": 155100 + }, + { + "entropy": 1.8698883458971978, + "epoch": 0.48082742864582645, + "grad_norm": 9.025288581848145, + "learning_rate": 3.6483749279264807e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8576495185494423, + "num_tokens": 186614146.0, + "step": 155110 + }, + { + "entropy": 1.8872638180851937, + "epoch": 0.4808584277708762, + "grad_norm": 8.719307899475098, + "learning_rate": 3.6482573268031675e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8448175966739655, + "num_tokens": 186626569.0, + "step": 155120 + }, + { + "entropy": 1.8500750228762626, + "epoch": 0.48088942689592584, + "grad_norm": 6.002628326416016, + "learning_rate": 3.6481397370513276e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8516894370317459, + "num_tokens": 186639402.0, + "step": 155130 + }, + { + "entropy": 1.9110220953822137, + "epoch": 0.48092042602097557, + "grad_norm": 6.769970417022705, + "learning_rate": 3.6480221586691295e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8488235414028168, + "num_tokens": 186651695.0, + "step": 155140 + }, + { + "entropy": 1.8391446053981781, + "epoch": 0.48095142514602524, + "grad_norm": 7.097575664520264, + "learning_rate": 3.647904591654739e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8646885514259338, + "num_tokens": 186663994.0, + "step": 155150 + }, + { + "entropy": 1.9141424641013145, + "epoch": 0.48098242427107496, + "grad_norm": 7.152270317077637, + "learning_rate": 3.6477870360063257e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8485850319266319, + "num_tokens": 186675453.0, + "step": 155160 + }, + { + "entropy": 1.9391047358512878, + "epoch": 0.48101342339612463, + "grad_norm": 8.798613548278809, + "learning_rate": 3.6476694917220577e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8436856657266617, + "num_tokens": 186686372.0, + "step": 155170 + }, + { + "entropy": 1.8622770234942436, + "epoch": 0.48104442252117435, + "grad_norm": 8.492561340332031, + "learning_rate": 3.647551958800106e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8657533332705498, + "num_tokens": 186697676.0, + "step": 155180 + }, + { + "entropy": 1.9052131861448287, + "epoch": 0.481075421646224, + "grad_norm": 6.784976959228516, + "learning_rate": 3.6474344372386383e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8484751611948014, + "num_tokens": 186708270.0, + "step": 155190 + }, + { + "entropy": 1.8822030156850815, + "epoch": 0.48110642077127375, + "grad_norm": 7.78303337097168, + "learning_rate": 3.647316927035825e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8471454083919525, + "num_tokens": 186719190.0, + "step": 155200 + }, + { + "entropy": 1.8744702443480492, + "epoch": 0.4811374198963234, + "grad_norm": 9.0418701171875, + "learning_rate": 3.6471994281898366e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8550082817673683, + "num_tokens": 186730876.0, + "step": 155210 + }, + { + "entropy": 1.905518215894699, + "epoch": 0.48116841902137314, + "grad_norm": 7.297862529754639, + "learning_rate": 3.647081940698843e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8591875582933426, + "num_tokens": 186741370.0, + "step": 155220 + }, + { + "entropy": 1.940250787138939, + "epoch": 0.4811994181464228, + "grad_norm": 8.297772407531738, + "learning_rate": 3.6469644645610177e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.8442102998495102, + "num_tokens": 186752169.0, + "step": 155230 + }, + { + "entropy": 1.8991802141070366, + "epoch": 0.48123041727147253, + "grad_norm": 8.18435001373291, + "learning_rate": 3.64684699977453e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8534949243068695, + "num_tokens": 186763406.0, + "step": 155240 + }, + { + "entropy": 1.8912201315164565, + "epoch": 0.4812614163965222, + "grad_norm": 8.27580738067627, + "learning_rate": 3.6467295463375536e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8461031407117844, + "num_tokens": 186774770.0, + "step": 155250 + }, + { + "entropy": 1.9139169782400132, + "epoch": 0.4812924155215719, + "grad_norm": 7.336427688598633, + "learning_rate": 3.6466121042482605e-06, + "loss": 0.5342, + "mean_token_accuracy": 0.8405420258641243, + "num_tokens": 186786162.0, + "step": 155260 + }, + { + "entropy": 1.7847980469465257, + "epoch": 0.4813234146466216, + "grad_norm": 9.743826866149902, + "learning_rate": 3.6464946735048225e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8628578379750251, + "num_tokens": 186798530.0, + "step": 155270 + }, + { + "entropy": 1.729624892771244, + "epoch": 0.4813544137716713, + "grad_norm": 3.902719497680664, + "learning_rate": 3.6463772541054143e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8717215061187744, + "num_tokens": 186811845.0, + "step": 155280 + }, + { + "entropy": 1.8367839485406876, + "epoch": 0.481385412896721, + "grad_norm": 7.629048824310303, + "learning_rate": 3.6462598460482084e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8575905844569206, + "num_tokens": 186823908.0, + "step": 155290 + }, + { + "entropy": 1.8470647171139718, + "epoch": 0.4814164120217707, + "grad_norm": 7.2646403312683105, + "learning_rate": 3.6461424493313795e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8454138725996018, + "num_tokens": 186835758.0, + "step": 155300 + }, + { + "entropy": 1.843580712378025, + "epoch": 0.4814474111468204, + "grad_norm": 4.490911483764648, + "learning_rate": 3.646025063953103e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8536460235714912, + "num_tokens": 186848093.0, + "step": 155310 + }, + { + "entropy": 1.8249020904302597, + "epoch": 0.48147841027187005, + "grad_norm": 7.88754415512085, + "learning_rate": 3.645907689911552e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8577623814344406, + "num_tokens": 186860208.0, + "step": 155320 + }, + { + "entropy": 1.8020548969507217, + "epoch": 0.4815094093969198, + "grad_norm": 3.8696632385253906, + "learning_rate": 3.6457903272049033e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8550468429923057, + "num_tokens": 186872705.0, + "step": 155330 + }, + { + "entropy": 1.8387231886386872, + "epoch": 0.48154040852196944, + "grad_norm": 9.060702323913574, + "learning_rate": 3.645672975831332e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.857013863325119, + "num_tokens": 186884728.0, + "step": 155340 + }, + { + "entropy": 1.7827875435352325, + "epoch": 0.48157140764701917, + "grad_norm": 2.826335906982422, + "learning_rate": 3.645555635789015e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8733478873968125, + "num_tokens": 186897871.0, + "step": 155350 + }, + { + "entropy": 1.844759140908718, + "epoch": 0.48160240677206884, + "grad_norm": 7.48664665222168, + "learning_rate": 3.6454383070761275e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8576426088809967, + "num_tokens": 186909772.0, + "step": 155360 + }, + { + "entropy": 1.8383795037865638, + "epoch": 0.48163340589711856, + "grad_norm": 12.733421325683594, + "learning_rate": 3.6453209896908476e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8490657106041908, + "num_tokens": 186921500.0, + "step": 155370 + }, + { + "entropy": 1.7978422671556473, + "epoch": 0.48166440502216823, + "grad_norm": 4.8395233154296875, + "learning_rate": 3.645203683631352e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8567419424653053, + "num_tokens": 186934087.0, + "step": 155380 + }, + { + "entropy": 1.8284786969423295, + "epoch": 0.48169540414721795, + "grad_norm": 3.7645609378814697, + "learning_rate": 3.6450863888958197e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8614578813314437, + "num_tokens": 186945729.0, + "step": 155390 + }, + { + "entropy": 1.7969519719481468, + "epoch": 0.4817264032722676, + "grad_norm": 3.680454730987549, + "learning_rate": 3.6449691054824275e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.872539047896862, + "num_tokens": 186957718.0, + "step": 155400 + }, + { + "entropy": 1.8333932682871819, + "epoch": 0.48175740239731735, + "grad_norm": 7.701864242553711, + "learning_rate": 3.644851833389354e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8568485960364342, + "num_tokens": 186969432.0, + "step": 155410 + }, + { + "entropy": 1.7963881850242616, + "epoch": 0.481788401522367, + "grad_norm": 7.959238529205322, + "learning_rate": 3.64473457261478e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8592301860451699, + "num_tokens": 186981898.0, + "step": 155420 + }, + { + "entropy": 1.8788066983222962, + "epoch": 0.48181940064741674, + "grad_norm": 8.316329002380371, + "learning_rate": 3.6446173231568833e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8591387689113616, + "num_tokens": 186993009.0, + "step": 155430 + }, + { + "entropy": 1.7975234732031822, + "epoch": 0.4818503997724664, + "grad_norm": 4.090932846069336, + "learning_rate": 3.644500085013844e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8513005197048187, + "num_tokens": 187005781.0, + "step": 155440 + }, + { + "entropy": 1.8089564740657806, + "epoch": 0.48188139889751613, + "grad_norm": 6.5081000328063965, + "learning_rate": 3.644382858183843e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8633261471986771, + "num_tokens": 187018552.0, + "step": 155450 + }, + { + "entropy": 1.7889938607811928, + "epoch": 0.4819123980225658, + "grad_norm": 3.51275372505188, + "learning_rate": 3.6442656426650603e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8629370719194412, + "num_tokens": 187032150.0, + "step": 155460 + }, + { + "entropy": 1.8642136842012405, + "epoch": 0.4819433971476155, + "grad_norm": 7.637637615203857, + "learning_rate": 3.6441484384556776e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8616564303636551, + "num_tokens": 187043436.0, + "step": 155470 + }, + { + "entropy": 1.8972400605678559, + "epoch": 0.4819743962726652, + "grad_norm": 9.601285934448242, + "learning_rate": 3.6440312455538756e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8566853240132332, + "num_tokens": 187053461.0, + "step": 155480 + }, + { + "entropy": 1.905197212100029, + "epoch": 0.4820053953977149, + "grad_norm": 6.226647853851318, + "learning_rate": 3.643914063957837e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.849284790456295, + "num_tokens": 187064477.0, + "step": 155490 + }, + { + "entropy": 1.848924145102501, + "epoch": 0.4820363945227646, + "grad_norm": 8.730347633361816, + "learning_rate": 3.643796893665743e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8531228393316269, + "num_tokens": 187076141.0, + "step": 155500 + }, + { + "entropy": 1.9209964841604232, + "epoch": 0.4820673936478143, + "grad_norm": 10.016907691955566, + "learning_rate": 3.643679734675778e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8399276688694954, + "num_tokens": 187087179.0, + "step": 155510 + }, + { + "entropy": 1.811716277897358, + "epoch": 0.482098392772864, + "grad_norm": 10.810657501220703, + "learning_rate": 3.643562586986124e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8532059162855148, + "num_tokens": 187099880.0, + "step": 155520 + }, + { + "entropy": 1.8917767360806466, + "epoch": 0.4821293918979137, + "grad_norm": 9.377852439880371, + "learning_rate": 3.6434454505949645e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8542043223977089, + "num_tokens": 187111103.0, + "step": 155530 + }, + { + "entropy": 1.751031818985939, + "epoch": 0.4821603910229634, + "grad_norm": 11.62239933013916, + "learning_rate": 3.6433283255004835e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8702298268675804, + "num_tokens": 187124247.0, + "step": 155540 + }, + { + "entropy": 1.7074541047215461, + "epoch": 0.4821913901480131, + "grad_norm": 4.805656909942627, + "learning_rate": 3.643211211700866e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8729520246386528, + "num_tokens": 187138198.0, + "step": 155550 + }, + { + "entropy": 1.82494997382164, + "epoch": 0.48222238927306277, + "grad_norm": 4.265288352966309, + "learning_rate": 3.6430941091942967e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8578064441680908, + "num_tokens": 187150394.0, + "step": 155560 + }, + { + "entropy": 1.8820818334817886, + "epoch": 0.48225338839811244, + "grad_norm": 8.813608169555664, + "learning_rate": 3.6429770179789605e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8515245079994201, + "num_tokens": 187161557.0, + "step": 155570 + }, + { + "entropy": 1.867897354066372, + "epoch": 0.48228438752316216, + "grad_norm": 7.266292572021484, + "learning_rate": 3.6428599380530417e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8602555438876152, + "num_tokens": 187173114.0, + "step": 155580 + }, + { + "entropy": 1.8383618667721748, + "epoch": 0.48231538664821183, + "grad_norm": 3.8756728172302246, + "learning_rate": 3.6427428694147288e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8568738460540771, + "num_tokens": 187184447.0, + "step": 155590 + }, + { + "entropy": 1.8275836020708085, + "epoch": 0.48234638577326155, + "grad_norm": 5.545556545257568, + "learning_rate": 3.642625812062206e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8490733534097672, + "num_tokens": 187196783.0, + "step": 155600 + }, + { + "entropy": 1.8141174018383026, + "epoch": 0.4823773848983112, + "grad_norm": 3.5136075019836426, + "learning_rate": 3.642508765993661e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8561397686600685, + "num_tokens": 187209895.0, + "step": 155610 + }, + { + "entropy": 1.8496269896626472, + "epoch": 0.48240838402336095, + "grad_norm": 7.60167932510376, + "learning_rate": 3.6423917312072815e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.862434770166874, + "num_tokens": 187221932.0, + "step": 155620 + }, + { + "entropy": 1.7664272099733354, + "epoch": 0.4824393831484106, + "grad_norm": 7.53615140914917, + "learning_rate": 3.6422747077012544e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8608724012970924, + "num_tokens": 187234329.0, + "step": 155630 + }, + { + "entropy": 1.8685143813490868, + "epoch": 0.48247038227346034, + "grad_norm": 8.27384090423584, + "learning_rate": 3.6421576954737676e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8536564335227013, + "num_tokens": 187245703.0, + "step": 155640 + }, + { + "entropy": 1.715440958738327, + "epoch": 0.48250138139851, + "grad_norm": 6.6035637855529785, + "learning_rate": 3.6420406945230103e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8742220461368561, + "num_tokens": 187258474.0, + "step": 155650 + }, + { + "entropy": 1.8504155680537224, + "epoch": 0.48253238052355973, + "grad_norm": 9.480717658996582, + "learning_rate": 3.6419237048471704e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8578575730323792, + "num_tokens": 187270524.0, + "step": 155660 + }, + { + "entropy": 1.7659525617957115, + "epoch": 0.4825633796486094, + "grad_norm": 8.356760025024414, + "learning_rate": 3.6418067264444382e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8561761811375618, + "num_tokens": 187283451.0, + "step": 155670 + }, + { + "entropy": 1.8657870590686798, + "epoch": 0.4825943787736591, + "grad_norm": 9.368040084838867, + "learning_rate": 3.641689759313003e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8581396222114563, + "num_tokens": 187295040.0, + "step": 155680 + }, + { + "entropy": 1.8435973614454269, + "epoch": 0.4826253778987088, + "grad_norm": 8.829280853271484, + "learning_rate": 3.6415728034510545e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8657978147268295, + "num_tokens": 187306210.0, + "step": 155690 + }, + { + "entropy": 1.9003707140684127, + "epoch": 0.4826563770237585, + "grad_norm": 10.332893371582031, + "learning_rate": 3.6414558588567827e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8597208350896836, + "num_tokens": 187316273.0, + "step": 155700 + }, + { + "entropy": 1.8557246506214142, + "epoch": 0.4826873761488082, + "grad_norm": 7.9659247398376465, + "learning_rate": 3.64133892552838e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8631693422794342, + "num_tokens": 187328277.0, + "step": 155710 + }, + { + "entropy": 1.909783835709095, + "epoch": 0.4827183752738579, + "grad_norm": 4.126440525054932, + "learning_rate": 3.6412220034640367e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8437858536839485, + "num_tokens": 187339540.0, + "step": 155720 + }, + { + "entropy": 1.8335693284869194, + "epoch": 0.4827493743989076, + "grad_norm": 8.314302444458008, + "learning_rate": 3.6411050926619444e-06, + "loss": 0.434, + "mean_token_accuracy": 0.85353202521801, + "num_tokens": 187351930.0, + "step": 155730 + }, + { + "entropy": 1.7986718460917472, + "epoch": 0.4827803735239573, + "grad_norm": 6.746932506561279, + "learning_rate": 3.6409881931202954e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8710423827171325, + "num_tokens": 187364243.0, + "step": 155740 + }, + { + "entropy": 1.92829280346632, + "epoch": 0.482811372649007, + "grad_norm": 9.083883285522461, + "learning_rate": 3.640871304837283e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8497691377997398, + "num_tokens": 187375649.0, + "step": 155750 + }, + { + "entropy": 1.866778840124607, + "epoch": 0.4828423717740567, + "grad_norm": 4.199884414672852, + "learning_rate": 3.640754427811098e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8422569170594215, + "num_tokens": 187387617.0, + "step": 155760 + }, + { + "entropy": 1.8624927997589111, + "epoch": 0.48287337089910637, + "grad_norm": 8.201692581176758, + "learning_rate": 3.640637562039936e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8347114294767379, + "num_tokens": 187399916.0, + "step": 155770 + }, + { + "entropy": 1.7571727007627487, + "epoch": 0.4829043700241561, + "grad_norm": 8.598112106323242, + "learning_rate": 3.6405207075219895e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8643165245652199, + "num_tokens": 187413052.0, + "step": 155780 + }, + { + "entropy": 1.8955672517418862, + "epoch": 0.48293536914920576, + "grad_norm": 13.206428527832031, + "learning_rate": 3.640403864255453e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8522917911410332, + "num_tokens": 187423801.0, + "step": 155790 + }, + { + "entropy": 1.862819616496563, + "epoch": 0.4829663682742555, + "grad_norm": 3.849029064178467, + "learning_rate": 3.6402870322385207e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8516810446977615, + "num_tokens": 187435559.0, + "step": 155800 + }, + { + "entropy": 1.9115412473678588, + "epoch": 0.48299736739930516, + "grad_norm": 8.526803016662598, + "learning_rate": 3.6401702114693883e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8415917620062828, + "num_tokens": 187447277.0, + "step": 155810 + }, + { + "entropy": 1.7353981606662274, + "epoch": 0.4830283665243548, + "grad_norm": 10.263956069946289, + "learning_rate": 3.6400534019462497e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8638710469007492, + "num_tokens": 187461097.0, + "step": 155820 + }, + { + "entropy": 1.8627897635102273, + "epoch": 0.48305936564940455, + "grad_norm": 8.649343490600586, + "learning_rate": 3.6399366036673023e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8539001986384391, + "num_tokens": 187473687.0, + "step": 155830 + }, + { + "entropy": 1.839612002670765, + "epoch": 0.4830903647744542, + "grad_norm": 8.778943061828613, + "learning_rate": 3.639819816630742e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8621722176671028, + "num_tokens": 187486542.0, + "step": 155840 + }, + { + "entropy": 1.8706245437264442, + "epoch": 0.48312136389950394, + "grad_norm": 7.706948757171631, + "learning_rate": 3.6397030408347638e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8378767624497414, + "num_tokens": 187498647.0, + "step": 155850 + }, + { + "entropy": 1.8668534964323045, + "epoch": 0.4831523630245536, + "grad_norm": 7.943578243255615, + "learning_rate": 3.639586276277567e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8449800446629524, + "num_tokens": 187510115.0, + "step": 155860 + }, + { + "entropy": 1.9097621962428093, + "epoch": 0.48318336214960333, + "grad_norm": 9.294798851013184, + "learning_rate": 3.639469522957346e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8440002262592315, + "num_tokens": 187521520.0, + "step": 155870 + }, + { + "entropy": 1.7814342930912972, + "epoch": 0.483214361274653, + "grad_norm": 4.55609655380249, + "learning_rate": 3.639352780872302e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8625999093055725, + "num_tokens": 187534109.0, + "step": 155880 + }, + { + "entropy": 1.8384138897061348, + "epoch": 0.48324536039970273, + "grad_norm": 8.546332359313965, + "learning_rate": 3.6392360500206303e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8485594004392624, + "num_tokens": 187545835.0, + "step": 155890 + }, + { + "entropy": 1.8684273108839988, + "epoch": 0.4832763595247524, + "grad_norm": 8.705604553222656, + "learning_rate": 3.639119330400532e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8522221609950066, + "num_tokens": 187557414.0, + "step": 155900 + }, + { + "entropy": 1.8951046496629715, + "epoch": 0.4833073586498021, + "grad_norm": 7.8505377769470215, + "learning_rate": 3.6390026220102036e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8535401314496994, + "num_tokens": 187568187.0, + "step": 155910 + }, + { + "entropy": 1.8770427122712134, + "epoch": 0.4833383577748518, + "grad_norm": 7.7342915534973145, + "learning_rate": 3.6388859248478454e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8514578476548195, + "num_tokens": 187580732.0, + "step": 155920 + }, + { + "entropy": 1.8046485051512717, + "epoch": 0.4833693568999015, + "grad_norm": 7.7772369384765625, + "learning_rate": 3.6387692389116584e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.853971840441227, + "num_tokens": 187592862.0, + "step": 155930 + }, + { + "entropy": 1.8091088235378265, + "epoch": 0.4834003560249512, + "grad_norm": 4.339217662811279, + "learning_rate": 3.638652564199841e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8683247849345207, + "num_tokens": 187605469.0, + "step": 155940 + }, + { + "entropy": 1.8805622160434723, + "epoch": 0.4834313551500009, + "grad_norm": 8.848437309265137, + "learning_rate": 3.6385359007105956e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8534131139516831, + "num_tokens": 187617262.0, + "step": 155950 + }, + { + "entropy": 1.8057363733649254, + "epoch": 0.4834623542750506, + "grad_norm": 3.9022932052612305, + "learning_rate": 3.638419248442122e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8567571699619293, + "num_tokens": 187629721.0, + "step": 155960 + }, + { + "entropy": 1.764707237482071, + "epoch": 0.4834933534001003, + "grad_norm": 2.7294106483459473, + "learning_rate": 3.638302607392622e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8628931626677513, + "num_tokens": 187643292.0, + "step": 155970 + }, + { + "entropy": 1.804534560441971, + "epoch": 0.48352435252514997, + "grad_norm": 7.46668815612793, + "learning_rate": 3.6381859775602966e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8666567876935005, + "num_tokens": 187656242.0, + "step": 155980 + }, + { + "entropy": 1.9103981226682663, + "epoch": 0.4835553516501997, + "grad_norm": 7.5417070388793945, + "learning_rate": 3.638069358943349e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8543941050767898, + "num_tokens": 187667038.0, + "step": 155990 + }, + { + "entropy": 1.8362280517816543, + "epoch": 0.48358635077524936, + "grad_norm": 7.783841133117676, + "learning_rate": 3.637952751539982e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8486904546618461, + "num_tokens": 187679620.0, + "step": 156000 + }, + { + "entropy": 1.8745223417878152, + "epoch": 0.4836173499002991, + "grad_norm": 10.0465087890625, + "learning_rate": 3.6378361553483975e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8512848377227783, + "num_tokens": 187691256.0, + "step": 156010 + }, + { + "entropy": 1.9242184340953827, + "epoch": 0.48364834902534876, + "grad_norm": 8.397578239440918, + "learning_rate": 3.6377195703668004e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8493492394685745, + "num_tokens": 187701903.0, + "step": 156020 + }, + { + "entropy": 1.8979373127222061, + "epoch": 0.4836793481503985, + "grad_norm": 7.949219226837158, + "learning_rate": 3.6376029965933936e-06, + "loss": 0.5207, + "mean_token_accuracy": 0.8401141837239265, + "num_tokens": 187713014.0, + "step": 156030 + }, + { + "entropy": 1.8131269350647927, + "epoch": 0.48371034727544815, + "grad_norm": 3.8186416625976562, + "learning_rate": 3.637486434026381e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.848060154914856, + "num_tokens": 187726114.0, + "step": 156040 + }, + { + "entropy": 1.7898790180683135, + "epoch": 0.4837413464004979, + "grad_norm": 4.015059947967529, + "learning_rate": 3.6373698826639674e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8551013439893722, + "num_tokens": 187739024.0, + "step": 156050 + }, + { + "entropy": 1.7694806531071663, + "epoch": 0.48377234552554754, + "grad_norm": 8.411758422851562, + "learning_rate": 3.6372533425043592e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8681419178843498, + "num_tokens": 187750917.0, + "step": 156060 + }, + { + "entropy": 1.8274760872125626, + "epoch": 0.4838033446505972, + "grad_norm": 9.502041816711426, + "learning_rate": 3.6371368135457597e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8524838626384735, + "num_tokens": 187763130.0, + "step": 156070 + }, + { + "entropy": 1.8318697839975357, + "epoch": 0.48383434377564694, + "grad_norm": 9.341939926147461, + "learning_rate": 3.637020295786377e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8428703010082245, + "num_tokens": 187775702.0, + "step": 156080 + }, + { + "entropy": 1.9090040355920792, + "epoch": 0.4838653429006966, + "grad_norm": 9.399673461914062, + "learning_rate": 3.6369037892244154e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8548586055636406, + "num_tokens": 187786390.0, + "step": 156090 + }, + { + "entropy": 1.8724202007055282, + "epoch": 0.48389634202574633, + "grad_norm": 9.181824684143066, + "learning_rate": 3.6367872938580817e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8572488963603974, + "num_tokens": 187798294.0, + "step": 156100 + }, + { + "entropy": 1.7142846815288066, + "epoch": 0.483927341150796, + "grad_norm": 7.860523223876953, + "learning_rate": 3.6366708096855852e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8716006681323052, + "num_tokens": 187812016.0, + "step": 156110 + }, + { + "entropy": 1.8110566526651382, + "epoch": 0.4839583402758457, + "grad_norm": 7.496535778045654, + "learning_rate": 3.6365543367051304e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8591806992888451, + "num_tokens": 187824148.0, + "step": 156120 + }, + { + "entropy": 1.7441804528236389, + "epoch": 0.4839893394008954, + "grad_norm": 8.527477264404297, + "learning_rate": 3.636437874914927e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8575242191553116, + "num_tokens": 187837951.0, + "step": 156130 + }, + { + "entropy": 1.7979930967092514, + "epoch": 0.4840203385259451, + "grad_norm": 10.310881614685059, + "learning_rate": 3.636321424313183e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8589319199323654, + "num_tokens": 187850444.0, + "step": 156140 + }, + { + "entropy": 1.8642814740538598, + "epoch": 0.4840513376509948, + "grad_norm": 7.190332889556885, + "learning_rate": 3.6362049848981064e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8498084455728531, + "num_tokens": 187862015.0, + "step": 156150 + }, + { + "entropy": 1.8755100816488266, + "epoch": 0.4840823367760445, + "grad_norm": 9.06082820892334, + "learning_rate": 3.6360885566679073e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8422882303595542, + "num_tokens": 187873861.0, + "step": 156160 + }, + { + "entropy": 1.8259334176778794, + "epoch": 0.4841133359010942, + "grad_norm": 7.07421875, + "learning_rate": 3.635972139620794e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8559049963951111, + "num_tokens": 187885749.0, + "step": 156170 + }, + { + "entropy": 1.933459311723709, + "epoch": 0.4841443350261439, + "grad_norm": 9.747174263000488, + "learning_rate": 3.635855733754977e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8511974141001701, + "num_tokens": 187896265.0, + "step": 156180 + }, + { + "entropy": 1.8441163718700408, + "epoch": 0.48417533415119357, + "grad_norm": 8.249030113220215, + "learning_rate": 3.6357393390686667e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8691418990492821, + "num_tokens": 187907705.0, + "step": 156190 + }, + { + "entropy": 1.819663205742836, + "epoch": 0.4842063332762433, + "grad_norm": 9.237526893615723, + "learning_rate": 3.635622955560073e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.856706628203392, + "num_tokens": 187920290.0, + "step": 156200 + }, + { + "entropy": 1.8128308981657029, + "epoch": 0.48423733240129296, + "grad_norm": 7.997885704040527, + "learning_rate": 3.6355065832274085e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8497570842504502, + "num_tokens": 187933859.0, + "step": 156210 + }, + { + "entropy": 1.8780751511454583, + "epoch": 0.4842683315263427, + "grad_norm": 3.8760132789611816, + "learning_rate": 3.6353902220688827e-06, + "loss": 0.447, + "mean_token_accuracy": 0.849544820189476, + "num_tokens": 187945493.0, + "step": 156220 + }, + { + "entropy": 1.897641871869564, + "epoch": 0.48429933065139236, + "grad_norm": 7.152022361755371, + "learning_rate": 3.635273872082708e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8556644603610039, + "num_tokens": 187957061.0, + "step": 156230 + }, + { + "entropy": 1.8795344799757003, + "epoch": 0.4843303297764421, + "grad_norm": 3.9362809658050537, + "learning_rate": 3.6351575332670987e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8543588429689407, + "num_tokens": 187968642.0, + "step": 156240 + }, + { + "entropy": 1.8843262925744058, + "epoch": 0.48436132890149175, + "grad_norm": 7.554871082305908, + "learning_rate": 3.635041205620265e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8527411460876465, + "num_tokens": 187980875.0, + "step": 156250 + }, + { + "entropy": 1.8457639068365097, + "epoch": 0.4843923280265415, + "grad_norm": 8.525203704833984, + "learning_rate": 3.6349248891404204e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8586451008915901, + "num_tokens": 187992538.0, + "step": 156260 + }, + { + "entropy": 1.882865457236767, + "epoch": 0.48442332715159114, + "grad_norm": 9.19686222076416, + "learning_rate": 3.6348085838257794e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8533548563718796, + "num_tokens": 188003720.0, + "step": 156270 + }, + { + "entropy": 1.8969163656234742, + "epoch": 0.48445432627664087, + "grad_norm": 7.925225734710693, + "learning_rate": 3.634692289674556e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.8379020988941193, + "num_tokens": 188015486.0, + "step": 156280 + }, + { + "entropy": 1.8043647065758706, + "epoch": 0.48448532540169054, + "grad_norm": 10.64844799041748, + "learning_rate": 3.6345760066849624e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8532945722341537, + "num_tokens": 188027781.0, + "step": 156290 + }, + { + "entropy": 1.7452705442905425, + "epoch": 0.4845163245267402, + "grad_norm": 4.376309871673584, + "learning_rate": 3.6344597348552156e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8649339213967323, + "num_tokens": 188041959.0, + "step": 156300 + }, + { + "entropy": 1.867486347258091, + "epoch": 0.48454732365178993, + "grad_norm": 7.461634159088135, + "learning_rate": 3.6343434741835288e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8641291365027428, + "num_tokens": 188054346.0, + "step": 156310 + }, + { + "entropy": 1.8779701933264732, + "epoch": 0.4845783227768396, + "grad_norm": 12.157458305358887, + "learning_rate": 3.634227224668119e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8633387625217438, + "num_tokens": 188065860.0, + "step": 156320 + }, + { + "entropy": 1.8805889412760735, + "epoch": 0.4846093219018893, + "grad_norm": 8.761711120605469, + "learning_rate": 3.634110986307202e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8404549643397331, + "num_tokens": 188077040.0, + "step": 156330 + }, + { + "entropy": 1.917221650481224, + "epoch": 0.484640321026939, + "grad_norm": 7.180342674255371, + "learning_rate": 3.6339947590989927e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8567236632108688, + "num_tokens": 188088847.0, + "step": 156340 + }, + { + "entropy": 1.758651900291443, + "epoch": 0.4846713201519887, + "grad_norm": 7.375646591186523, + "learning_rate": 3.633878543041709e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8604704290628433, + "num_tokens": 188102559.0, + "step": 156350 + }, + { + "entropy": 1.8245538994669914, + "epoch": 0.4847023192770384, + "grad_norm": 9.896560668945312, + "learning_rate": 3.6337623381335667e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8480986952781677, + "num_tokens": 188116273.0, + "step": 156360 + }, + { + "entropy": 1.7007096633315086, + "epoch": 0.4847333184020881, + "grad_norm": 6.117918014526367, + "learning_rate": 3.633646144372785e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8717444121837616, + "num_tokens": 188130208.0, + "step": 156370 + }, + { + "entropy": 1.8957871600985527, + "epoch": 0.4847643175271378, + "grad_norm": 8.124541282653809, + "learning_rate": 3.6335299617575805e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.853635823726654, + "num_tokens": 188141683.0, + "step": 156380 + }, + { + "entropy": 1.871186774969101, + "epoch": 0.4847953166521875, + "grad_norm": 8.871430397033691, + "learning_rate": 3.6334137902861723e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.856009703874588, + "num_tokens": 188153756.0, + "step": 156390 + }, + { + "entropy": 1.8725305289030074, + "epoch": 0.48482631577723717, + "grad_norm": 8.753509521484375, + "learning_rate": 3.6332976299567785e-06, + "loss": 0.51, + "mean_token_accuracy": 0.8379634752869606, + "num_tokens": 188166036.0, + "step": 156400 + }, + { + "entropy": 1.8789052292704582, + "epoch": 0.4848573149022869, + "grad_norm": 8.494888305664062, + "learning_rate": 3.6331814807676176e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8496663123369217, + "num_tokens": 188177126.0, + "step": 156410 + }, + { + "entropy": 1.8479280322790146, + "epoch": 0.48488831402733656, + "grad_norm": 7.020564079284668, + "learning_rate": 3.6330653427169103e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8580684915184975, + "num_tokens": 188189522.0, + "step": 156420 + }, + { + "entropy": 1.849296373128891, + "epoch": 0.4849193131523863, + "grad_norm": 10.006750106811523, + "learning_rate": 3.6329492158028757e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8454330176115036, + "num_tokens": 188200987.0, + "step": 156430 + }, + { + "entropy": 1.8682186782360077, + "epoch": 0.48495031227743596, + "grad_norm": 8.694777488708496, + "learning_rate": 3.6328331000237343e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8542024463415145, + "num_tokens": 188213291.0, + "step": 156440 + }, + { + "entropy": 1.8008888192474841, + "epoch": 0.4849813114024857, + "grad_norm": 8.048171043395996, + "learning_rate": 3.6327169953777063e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8668388247489929, + "num_tokens": 188225876.0, + "step": 156450 + }, + { + "entropy": 1.799254597723484, + "epoch": 0.48501231052753535, + "grad_norm": 8.080785751342773, + "learning_rate": 3.632600901863014e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8576643332839012, + "num_tokens": 188238714.0, + "step": 156460 + }, + { + "entropy": 1.639521201699972, + "epoch": 0.4850433096525851, + "grad_norm": 3.9683926105499268, + "learning_rate": 3.6324848194778774e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.869882382452488, + "num_tokens": 188253725.0, + "step": 156470 + }, + { + "entropy": 1.776284283399582, + "epoch": 0.48507430877763474, + "grad_norm": 4.678526401519775, + "learning_rate": 3.632368748220519e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8620742172002792, + "num_tokens": 188266283.0, + "step": 156480 + }, + { + "entropy": 1.8657296374440193, + "epoch": 0.48510530790268447, + "grad_norm": 7.574404239654541, + "learning_rate": 3.632252688089161e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.863825935125351, + "num_tokens": 188277756.0, + "step": 156490 + }, + { + "entropy": 1.8865803450345993, + "epoch": 0.48513630702773414, + "grad_norm": 8.333385467529297, + "learning_rate": 3.6321366390820266e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8613347887992859, + "num_tokens": 188288700.0, + "step": 156500 + }, + { + "entropy": 1.9244218185544013, + "epoch": 0.48516730615278386, + "grad_norm": 7.839199066162109, + "learning_rate": 3.6320206011973373e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8391366973519325, + "num_tokens": 188300479.0, + "step": 156510 + }, + { + "entropy": 1.8372875064611436, + "epoch": 0.48519830527783353, + "grad_norm": 7.72141170501709, + "learning_rate": 3.6319045744333185e-06, + "loss": 0.423, + "mean_token_accuracy": 0.860981197655201, + "num_tokens": 188312684.0, + "step": 156520 + }, + { + "entropy": 1.7935153931379317, + "epoch": 0.48522930440288325, + "grad_norm": 4.041905879974365, + "learning_rate": 3.631788558788193e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8546757638454437, + "num_tokens": 188325289.0, + "step": 156530 + }, + { + "entropy": 1.744502691924572, + "epoch": 0.4852603035279329, + "grad_norm": 7.012277603149414, + "learning_rate": 3.631672554260184e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8671909630298614, + "num_tokens": 188338884.0, + "step": 156540 + }, + { + "entropy": 1.786800318956375, + "epoch": 0.4852913026529826, + "grad_norm": 8.924103736877441, + "learning_rate": 3.6315565608475184e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8652423739433288, + "num_tokens": 188351601.0, + "step": 156550 + }, + { + "entropy": 1.798016108572483, + "epoch": 0.4853223017780323, + "grad_norm": 9.73193645477295, + "learning_rate": 3.631440578548419e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.861903102695942, + "num_tokens": 188364272.0, + "step": 156560 + }, + { + "entropy": 1.7860716104507446, + "epoch": 0.485353300903082, + "grad_norm": 5.464559078216553, + "learning_rate": 3.631324607361113e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8641039207577705, + "num_tokens": 188377196.0, + "step": 156570 + }, + { + "entropy": 1.9405299216508864, + "epoch": 0.4853843000281317, + "grad_norm": 9.91318130493164, + "learning_rate": 3.6312086472838253e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.8417086794972419, + "num_tokens": 188388093.0, + "step": 156580 + }, + { + "entropy": 1.8181221313774585, + "epoch": 0.4854152991531814, + "grad_norm": 11.408985137939453, + "learning_rate": 3.6310926983147823e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8606801643967629, + "num_tokens": 188400330.0, + "step": 156590 + }, + { + "entropy": 1.8644197478890419, + "epoch": 0.4854462982782311, + "grad_norm": 9.163566589355469, + "learning_rate": 3.630976760452211e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8528912767767907, + "num_tokens": 188411951.0, + "step": 156600 + }, + { + "entropy": 1.8152849718928337, + "epoch": 0.48547729740328077, + "grad_norm": 8.839181900024414, + "learning_rate": 3.6308608336943374e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8583212316036224, + "num_tokens": 188423655.0, + "step": 156610 + }, + { + "entropy": 1.8252029418945312, + "epoch": 0.4855082965283305, + "grad_norm": 8.845681190490723, + "learning_rate": 3.6307449180393895e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8624794527888298, + "num_tokens": 188435618.0, + "step": 156620 + }, + { + "entropy": 1.7919634029269218, + "epoch": 0.48553929565338017, + "grad_norm": 7.7746429443359375, + "learning_rate": 3.630629013485596e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8691168889403343, + "num_tokens": 188447995.0, + "step": 156630 + }, + { + "entropy": 1.7863436222076416, + "epoch": 0.4855702947784299, + "grad_norm": 8.812021255493164, + "learning_rate": 3.6305131200311835e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8654252752661705, + "num_tokens": 188460962.0, + "step": 156640 + }, + { + "entropy": 1.8167147532105445, + "epoch": 0.48560129390347956, + "grad_norm": 3.7734971046447754, + "learning_rate": 3.6303972376743814e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8662436470389366, + "num_tokens": 188472440.0, + "step": 156650 + }, + { + "entropy": 1.864917366206646, + "epoch": 0.4856322930285293, + "grad_norm": 8.78889274597168, + "learning_rate": 3.630281366413419e-06, + "loss": 0.5209, + "mean_token_accuracy": 0.8403189569711685, + "num_tokens": 188484784.0, + "step": 156660 + }, + { + "entropy": 1.859625655412674, + "epoch": 0.48566329215357895, + "grad_norm": 7.059370040893555, + "learning_rate": 3.630165506246525e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8460963040590286, + "num_tokens": 188497245.0, + "step": 156670 + }, + { + "entropy": 1.7957852184772491, + "epoch": 0.4856942912786287, + "grad_norm": 8.198931694030762, + "learning_rate": 3.6300496571719295e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8644311785697937, + "num_tokens": 188509887.0, + "step": 156680 + }, + { + "entropy": 1.9186707973480224, + "epoch": 0.48572529040367834, + "grad_norm": 9.6982421875, + "learning_rate": 3.6299338191878623e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8487430840730668, + "num_tokens": 188520517.0, + "step": 156690 + }, + { + "entropy": 1.9093660950660705, + "epoch": 0.48575628952872807, + "grad_norm": 7.849754333496094, + "learning_rate": 3.6298179922925554e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8508066058158874, + "num_tokens": 188531905.0, + "step": 156700 + }, + { + "entropy": 1.8367055609822274, + "epoch": 0.48578728865377774, + "grad_norm": 7.576959609985352, + "learning_rate": 3.6297021764842377e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8587787270545959, + "num_tokens": 188543901.0, + "step": 156710 + }, + { + "entropy": 1.805798091739416, + "epoch": 0.48581828777882746, + "grad_norm": 8.443595886230469, + "learning_rate": 3.6295863717611423e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8576732441782952, + "num_tokens": 188556121.0, + "step": 156720 + }, + { + "entropy": 1.8692028522491455, + "epoch": 0.48584928690387713, + "grad_norm": 3.9699697494506836, + "learning_rate": 3.6294705781214996e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8564115181565285, + "num_tokens": 188567950.0, + "step": 156730 + }, + { + "entropy": 1.7551136761903763, + "epoch": 0.48588028602892686, + "grad_norm": 2.822622299194336, + "learning_rate": 3.629354795563543e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8498562499880791, + "num_tokens": 188581754.0, + "step": 156740 + }, + { + "entropy": 1.8290818929672241, + "epoch": 0.4859112851539765, + "grad_norm": 6.920858860015869, + "learning_rate": 3.6292390240855038e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8647098124027253, + "num_tokens": 188593324.0, + "step": 156750 + }, + { + "entropy": 1.851450626552105, + "epoch": 0.48594228427902625, + "grad_norm": 7.4507904052734375, + "learning_rate": 3.629123263685616e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8521499082446098, + "num_tokens": 188604866.0, + "step": 156760 + }, + { + "entropy": 1.8714505270123483, + "epoch": 0.4859732834040759, + "grad_norm": 8.083022117614746, + "learning_rate": 3.629007514362112e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8554137036204338, + "num_tokens": 188617023.0, + "step": 156770 + }, + { + "entropy": 1.8139952212572097, + "epoch": 0.48600428252912564, + "grad_norm": 7.504046440124512, + "learning_rate": 3.628891776113226e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8558562770485878, + "num_tokens": 188629125.0, + "step": 156780 + }, + { + "entropy": 1.7777962386608124, + "epoch": 0.4860352816541753, + "grad_norm": 7.869353294372559, + "learning_rate": 3.6287760489371926e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8635659471154213, + "num_tokens": 188642471.0, + "step": 156790 + }, + { + "entropy": 1.893085741996765, + "epoch": 0.486066280779225, + "grad_norm": 6.712622165679932, + "learning_rate": 3.628660332832246e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8632133215665817, + "num_tokens": 188654009.0, + "step": 156800 + }, + { + "entropy": 1.846302431821823, + "epoch": 0.4860972799042747, + "grad_norm": 3.914888858795166, + "learning_rate": 3.628544627796621e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8565996348857879, + "num_tokens": 188666288.0, + "step": 156810 + }, + { + "entropy": 1.9017745479941368, + "epoch": 0.4861282790293244, + "grad_norm": 8.8294677734375, + "learning_rate": 3.628428933828553e-06, + "loss": 0.5231, + "mean_token_accuracy": 0.8494993954896927, + "num_tokens": 188677652.0, + "step": 156820 + }, + { + "entropy": 1.8907178774476052, + "epoch": 0.4861592781543741, + "grad_norm": 7.541580677032471, + "learning_rate": 3.6283132509262763e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8456685811281204, + "num_tokens": 188688888.0, + "step": 156830 + }, + { + "entropy": 1.844075907766819, + "epoch": 0.48619027727942377, + "grad_norm": 4.210703372955322, + "learning_rate": 3.6281975790880292e-06, + "loss": 0.421, + "mean_token_accuracy": 0.857151634991169, + "num_tokens": 188700221.0, + "step": 156840 + }, + { + "entropy": 1.828725065290928, + "epoch": 0.4862212764044735, + "grad_norm": 8.287814140319824, + "learning_rate": 3.6280819183120477e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8587014749646187, + "num_tokens": 188712585.0, + "step": 156850 + }, + { + "entropy": 1.8757966220378877, + "epoch": 0.48625227552952316, + "grad_norm": 7.026792526245117, + "learning_rate": 3.6279662685965677e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8533299207687378, + "num_tokens": 188723844.0, + "step": 156860 + }, + { + "entropy": 1.818382728099823, + "epoch": 0.4862832746545729, + "grad_norm": 3.8120429515838623, + "learning_rate": 3.627850629939827e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8576777949929237, + "num_tokens": 188736591.0, + "step": 156870 + }, + { + "entropy": 1.8777550026774406, + "epoch": 0.48631427377962255, + "grad_norm": 7.84219217300415, + "learning_rate": 3.6277350023400635e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8507702320814132, + "num_tokens": 188748864.0, + "step": 156880 + }, + { + "entropy": 1.745957624912262, + "epoch": 0.4863452729046723, + "grad_norm": 3.141312599182129, + "learning_rate": 3.627619385795515e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8645585060119629, + "num_tokens": 188762327.0, + "step": 156890 + }, + { + "entropy": 1.8983976244926453, + "epoch": 0.48637627202972195, + "grad_norm": 9.90230655670166, + "learning_rate": 3.62750378030442e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8513541236519814, + "num_tokens": 188773931.0, + "step": 156900 + }, + { + "entropy": 1.7605302453041076, + "epoch": 0.48640727115477167, + "grad_norm": 6.981546401977539, + "learning_rate": 3.627388185865018e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8662245452404023, + "num_tokens": 188788065.0, + "step": 156910 + }, + { + "entropy": 1.7625210091471673, + "epoch": 0.48643827027982134, + "grad_norm": 2.2922286987304688, + "learning_rate": 3.6272726024755462e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8693696141242981, + "num_tokens": 188802039.0, + "step": 156920 + }, + { + "entropy": 1.8891747072339058, + "epoch": 0.48646926940487106, + "grad_norm": 4.775572776794434, + "learning_rate": 3.627157030134246e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8509297117590904, + "num_tokens": 188814295.0, + "step": 156930 + }, + { + "entropy": 1.763446855545044, + "epoch": 0.48650026852992073, + "grad_norm": 3.432339668273926, + "learning_rate": 3.627041468839358e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8640241846442223, + "num_tokens": 188827785.0, + "step": 156940 + }, + { + "entropy": 1.8804999768733979, + "epoch": 0.48653126765497046, + "grad_norm": 8.55935287475586, + "learning_rate": 3.626925918589121e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8539211377501488, + "num_tokens": 188840111.0, + "step": 156950 + }, + { + "entropy": 1.8551584213972092, + "epoch": 0.4865622667800201, + "grad_norm": 8.160202980041504, + "learning_rate": 3.6268103793817766e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8460486233234406, + "num_tokens": 188852260.0, + "step": 156960 + }, + { + "entropy": 1.8928451895713807, + "epoch": 0.48659326590506985, + "grad_norm": 8.368380546569824, + "learning_rate": 3.6266948512155656e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8604690045118332, + "num_tokens": 188863606.0, + "step": 156970 + }, + { + "entropy": 1.817128673195839, + "epoch": 0.4866242650301195, + "grad_norm": 8.995201110839844, + "learning_rate": 3.6265793340887304e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.860697340965271, + "num_tokens": 188876385.0, + "step": 156980 + }, + { + "entropy": 1.8426421731710434, + "epoch": 0.48665526415516924, + "grad_norm": 4.008335590362549, + "learning_rate": 3.626463827999513e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.854708181321621, + "num_tokens": 188889014.0, + "step": 156990 + }, + { + "entropy": 1.8129928693175317, + "epoch": 0.4866862632802189, + "grad_norm": 4.042017936706543, + "learning_rate": 3.626348332946154e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8559390410780907, + "num_tokens": 188901539.0, + "step": 157000 + }, + { + "entropy": 1.8829196870326996, + "epoch": 0.48671726240526864, + "grad_norm": 9.24323844909668, + "learning_rate": 3.6262328489268982e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8524382159113884, + "num_tokens": 188913282.0, + "step": 157010 + }, + { + "entropy": 1.7663740821182727, + "epoch": 0.4867482615303183, + "grad_norm": 3.5652692317962646, + "learning_rate": 3.6261173759399875e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8695008844137192, + "num_tokens": 188926799.0, + "step": 157020 + }, + { + "entropy": 1.8988188937306405, + "epoch": 0.48677926065536803, + "grad_norm": 7.9664201736450195, + "learning_rate": 3.6260019139836667e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8581131368875503, + "num_tokens": 188938377.0, + "step": 157030 + }, + { + "entropy": 1.9046687543392182, + "epoch": 0.4868102597804177, + "grad_norm": 9.607617378234863, + "learning_rate": 3.6258864630561778e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.8482811197638511, + "num_tokens": 188949656.0, + "step": 157040 + }, + { + "entropy": 1.8872241079807281, + "epoch": 0.48684125890546737, + "grad_norm": 8.410953521728516, + "learning_rate": 3.625771023155767e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8558803468942642, + "num_tokens": 188961681.0, + "step": 157050 + }, + { + "entropy": 1.8674045950174332, + "epoch": 0.4868722580305171, + "grad_norm": 8.313755989074707, + "learning_rate": 3.6256555942806783e-06, + "loss": 0.5321, + "mean_token_accuracy": 0.8421355858445168, + "num_tokens": 188973938.0, + "step": 157060 + }, + { + "entropy": 1.77282817363739, + "epoch": 0.48690325715556676, + "grad_norm": 7.947534561157227, + "learning_rate": 3.625540176429157e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.856605575978756, + "num_tokens": 188987056.0, + "step": 157070 + }, + { + "entropy": 1.7749268785119057, + "epoch": 0.4869342562806165, + "grad_norm": 3.8374805450439453, + "learning_rate": 3.625424769599448e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8576187625527382, + "num_tokens": 189000210.0, + "step": 157080 + }, + { + "entropy": 1.8151443317532538, + "epoch": 0.48696525540566615, + "grad_norm": 8.406780242919922, + "learning_rate": 3.6253093737897984e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8566974952816964, + "num_tokens": 189013181.0, + "step": 157090 + }, + { + "entropy": 1.8080456346273421, + "epoch": 0.4869962545307159, + "grad_norm": 10.127240180969238, + "learning_rate": 3.6251939889984526e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8613529846072197, + "num_tokens": 189025238.0, + "step": 157100 + }, + { + "entropy": 1.9063942030072212, + "epoch": 0.48702725365576555, + "grad_norm": 8.239479064941406, + "learning_rate": 3.6250786152236593e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8503356859087944, + "num_tokens": 189036393.0, + "step": 157110 + }, + { + "entropy": 1.9125859394669533, + "epoch": 0.48705825278081527, + "grad_norm": 4.242093563079834, + "learning_rate": 3.624963252463665e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.846269004046917, + "num_tokens": 189047901.0, + "step": 157120 + }, + { + "entropy": 1.9060107335448264, + "epoch": 0.48708925190586494, + "grad_norm": 5.090417385101318, + "learning_rate": 3.6248479007167166e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8466105028986931, + "num_tokens": 189059652.0, + "step": 157130 + }, + { + "entropy": 1.8610600799322128, + "epoch": 0.48712025103091466, + "grad_norm": 6.971331596374512, + "learning_rate": 3.6247325599810618e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8637379705905914, + "num_tokens": 189072096.0, + "step": 157140 + }, + { + "entropy": 1.8987021505832673, + "epoch": 0.48715125015596433, + "grad_norm": 3.920376777648926, + "learning_rate": 3.6246172302549497e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8554188758134842, + "num_tokens": 189083311.0, + "step": 157150 + }, + { + "entropy": 1.8860539883375167, + "epoch": 0.48718224928101406, + "grad_norm": 6.914085388183594, + "learning_rate": 3.624501911536628e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8497070074081421, + "num_tokens": 189095521.0, + "step": 157160 + }, + { + "entropy": 1.879089817404747, + "epoch": 0.4872132484060637, + "grad_norm": 8.246174812316895, + "learning_rate": 3.624386603824347e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8566564947366715, + "num_tokens": 189107507.0, + "step": 157170 + }, + { + "entropy": 1.8245720505714416, + "epoch": 0.48724424753111345, + "grad_norm": 7.060524940490723, + "learning_rate": 3.6242713071163548e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8609587267041207, + "num_tokens": 189119955.0, + "step": 157180 + }, + { + "entropy": 1.8623187392950058, + "epoch": 0.4872752466561631, + "grad_norm": 6.407861232757568, + "learning_rate": 3.6241560214109024e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8532092794775963, + "num_tokens": 189131790.0, + "step": 157190 + }, + { + "entropy": 1.8516870602965354, + "epoch": 0.48730624578121284, + "grad_norm": 6.912207126617432, + "learning_rate": 3.624040746706239e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8642320290207863, + "num_tokens": 189143723.0, + "step": 157200 + }, + { + "entropy": 1.9268372386693955, + "epoch": 0.4873372449062625, + "grad_norm": 10.234281539916992, + "learning_rate": 3.623925483000615e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.8470379471778869, + "num_tokens": 189154544.0, + "step": 157210 + }, + { + "entropy": 1.8641832515597343, + "epoch": 0.48736824403131224, + "grad_norm": 8.448433876037598, + "learning_rate": 3.6238102302922827e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.857720798254013, + "num_tokens": 189166226.0, + "step": 157220 + }, + { + "entropy": 1.9346581190824508, + "epoch": 0.4873992431563619, + "grad_norm": 8.481154441833496, + "learning_rate": 3.623694988579492e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.852697542309761, + "num_tokens": 189177197.0, + "step": 157230 + }, + { + "entropy": 1.8736652597784995, + "epoch": 0.48743024228141163, + "grad_norm": 8.590533256530762, + "learning_rate": 3.6235797578604958e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8409170970320702, + "num_tokens": 189189425.0, + "step": 157240 + }, + { + "entropy": 1.8822925239801407, + "epoch": 0.4874612414064613, + "grad_norm": 8.005229949951172, + "learning_rate": 3.6234645381335458e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8517946019768715, + "num_tokens": 189200841.0, + "step": 157250 + }, + { + "entropy": 1.868522433936596, + "epoch": 0.487492240531511, + "grad_norm": 7.923999786376953, + "learning_rate": 3.6233493293968936e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.851598097383976, + "num_tokens": 189213006.0, + "step": 157260 + }, + { + "entropy": 1.8525910183787346, + "epoch": 0.4875232396565607, + "grad_norm": 7.764453887939453, + "learning_rate": 3.623234131648794e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8464570224285126, + "num_tokens": 189225217.0, + "step": 157270 + }, + { + "entropy": 1.8424304701387881, + "epoch": 0.4875542387816104, + "grad_norm": 9.055932998657227, + "learning_rate": 3.6231189448874993e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8600788041949272, + "num_tokens": 189237890.0, + "step": 157280 + }, + { + "entropy": 1.8687356740236283, + "epoch": 0.4875852379066601, + "grad_norm": 7.5734663009643555, + "learning_rate": 3.6230037691112623e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8601713046431542, + "num_tokens": 189248774.0, + "step": 157290 + }, + { + "entropy": 1.887175776064396, + "epoch": 0.48761623703170975, + "grad_norm": 8.036844253540039, + "learning_rate": 3.6228886043183385e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8500497087836265, + "num_tokens": 189260259.0, + "step": 157300 + }, + { + "entropy": 1.8767434313893319, + "epoch": 0.4876472361567595, + "grad_norm": 9.874613761901855, + "learning_rate": 3.622773450506982e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8547291472554207, + "num_tokens": 189271853.0, + "step": 157310 + }, + { + "entropy": 1.8897757351398468, + "epoch": 0.48767823528180915, + "grad_norm": 8.708046913146973, + "learning_rate": 3.622658307675448e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8521436333656311, + "num_tokens": 189284384.0, + "step": 157320 + }, + { + "entropy": 1.7800422132015228, + "epoch": 0.48770923440685887, + "grad_norm": 3.693558931350708, + "learning_rate": 3.6225431758219905e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8698929205536843, + "num_tokens": 189297579.0, + "step": 157330 + }, + { + "entropy": 1.9262080565094948, + "epoch": 0.48774023353190854, + "grad_norm": 3.791533946990967, + "learning_rate": 3.6224280549448654e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8444644317030907, + "num_tokens": 189309341.0, + "step": 157340 + }, + { + "entropy": 1.8786733433604241, + "epoch": 0.48777123265695826, + "grad_norm": 7.771004676818848, + "learning_rate": 3.62231294504233e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8481441915035248, + "num_tokens": 189321372.0, + "step": 157350 + }, + { + "entropy": 1.9163564920425415, + "epoch": 0.48780223178200793, + "grad_norm": 14.17982006072998, + "learning_rate": 3.62219784611264e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.854206183552742, + "num_tokens": 189332771.0, + "step": 157360 + }, + { + "entropy": 1.841642838716507, + "epoch": 0.48783323090705766, + "grad_norm": 9.067546844482422, + "learning_rate": 3.6220827581540524e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8519814401865006, + "num_tokens": 189345612.0, + "step": 157370 + }, + { + "entropy": 1.9254460856318474, + "epoch": 0.4878642300321073, + "grad_norm": 10.410828590393066, + "learning_rate": 3.6219676811648235e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8462668195366859, + "num_tokens": 189356913.0, + "step": 157380 + }, + { + "entropy": 1.8930987566709518, + "epoch": 0.48789522915715705, + "grad_norm": 4.706137657165527, + "learning_rate": 3.6218526151432123e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8493619441986084, + "num_tokens": 189368660.0, + "step": 157390 + }, + { + "entropy": 1.86765104085207, + "epoch": 0.4879262282822067, + "grad_norm": 2.7685275077819824, + "learning_rate": 3.621737560087475e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8525311022996902, + "num_tokens": 189382038.0, + "step": 157400 + }, + { + "entropy": 1.844710259139538, + "epoch": 0.48795722740725644, + "grad_norm": 2.638852119445801, + "learning_rate": 3.6216225159958713e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8614204362034797, + "num_tokens": 189394683.0, + "step": 157410 + }, + { + "entropy": 1.8290350809693336, + "epoch": 0.4879882265323061, + "grad_norm": 4.062201976776123, + "learning_rate": 3.6215074828666598e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8629499360918998, + "num_tokens": 189407590.0, + "step": 157420 + }, + { + "entropy": 1.80876744389534, + "epoch": 0.48801922565735584, + "grad_norm": 7.40885066986084, + "learning_rate": 3.6213924606980995e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8544455721974373, + "num_tokens": 189420366.0, + "step": 157430 + }, + { + "entropy": 1.8963041082024574, + "epoch": 0.4880502247824055, + "grad_norm": 8.370992660522461, + "learning_rate": 3.6212774494884494e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8585180237889289, + "num_tokens": 189431992.0, + "step": 157440 + }, + { + "entropy": 1.8512851744890213, + "epoch": 0.48808122390745523, + "grad_norm": 6.397646427154541, + "learning_rate": 3.6211624492359696e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8509098574519157, + "num_tokens": 189444932.0, + "step": 157450 + }, + { + "entropy": 1.8019349545240402, + "epoch": 0.4881122230325049, + "grad_norm": 3.6263394355773926, + "learning_rate": 3.6210474599389212e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8706066176295281, + "num_tokens": 189458119.0, + "step": 157460 + }, + { + "entropy": 1.9306983739137649, + "epoch": 0.4881432221575546, + "grad_norm": 7.022060394287109, + "learning_rate": 3.6209324815955636e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8524512380361557, + "num_tokens": 189469847.0, + "step": 157470 + }, + { + "entropy": 1.8647955879569054, + "epoch": 0.4881742212826043, + "grad_norm": 7.962439060211182, + "learning_rate": 3.620817514204159e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8580728963017463, + "num_tokens": 189482607.0, + "step": 157480 + }, + { + "entropy": 1.8655321180820466, + "epoch": 0.488205220407654, + "grad_norm": 6.718905448913574, + "learning_rate": 3.6207025577629685e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8578371599316597, + "num_tokens": 189494789.0, + "step": 157490 + }, + { + "entropy": 1.782193235307932, + "epoch": 0.4882362195327037, + "grad_norm": 8.388843536376953, + "learning_rate": 3.620587612270253e-06, + "loss": 0.382, + "mean_token_accuracy": 0.867868272960186, + "num_tokens": 189508905.0, + "step": 157500 + }, + { + "entropy": 1.924613358080387, + "epoch": 0.4882672186577534, + "grad_norm": 7.749386787414551, + "learning_rate": 3.6204726777242765e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8526110947132111, + "num_tokens": 189520596.0, + "step": 157510 + }, + { + "entropy": 1.8958618491888046, + "epoch": 0.4882982177828031, + "grad_norm": 8.092024803161621, + "learning_rate": 3.6203577541233004e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8603635936975479, + "num_tokens": 189532260.0, + "step": 157520 + }, + { + "entropy": 1.9047956779599189, + "epoch": 0.4883292169078528, + "grad_norm": 4.471070766448975, + "learning_rate": 3.6202428414655877e-06, + "loss": 0.461, + "mean_token_accuracy": 0.842148295044899, + "num_tokens": 189544420.0, + "step": 157530 + }, + { + "entropy": 1.934245301783085, + "epoch": 0.48836021603290247, + "grad_norm": 10.193523406982422, + "learning_rate": 3.6201279397494023e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8412486299872398, + "num_tokens": 189555444.0, + "step": 157540 + }, + { + "entropy": 1.8730113923549652, + "epoch": 0.48839121515795214, + "grad_norm": 4.940889358520508, + "learning_rate": 3.6200130489730075e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.859997783601284, + "num_tokens": 189568007.0, + "step": 157550 + }, + { + "entropy": 1.8695228844881058, + "epoch": 0.48842221428300187, + "grad_norm": 7.891258239746094, + "learning_rate": 3.6198981691346667e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8547190606594086, + "num_tokens": 189580842.0, + "step": 157560 + }, + { + "entropy": 1.9357234045863152, + "epoch": 0.48845321340805153, + "grad_norm": 8.230269432067871, + "learning_rate": 3.619783300232646e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8404277488589287, + "num_tokens": 189592624.0, + "step": 157570 + }, + { + "entropy": 1.9686369627714158, + "epoch": 0.48848421253310126, + "grad_norm": 8.409232139587402, + "learning_rate": 3.6196684422652106e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8537262305617332, + "num_tokens": 189603599.0, + "step": 157580 + }, + { + "entropy": 1.8572678551077844, + "epoch": 0.4885152116581509, + "grad_norm": 8.065033912658691, + "learning_rate": 3.6195535952306233e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8580617591738701, + "num_tokens": 189616217.0, + "step": 157590 + }, + { + "entropy": 1.854029330611229, + "epoch": 0.48854621078320065, + "grad_norm": 8.96259593963623, + "learning_rate": 3.6194387591271525e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8569090217351913, + "num_tokens": 189628302.0, + "step": 157600 + }, + { + "entropy": 1.836168546974659, + "epoch": 0.4885772099082503, + "grad_norm": 8.88343334197998, + "learning_rate": 3.619323933953063e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8561067417263984, + "num_tokens": 189640866.0, + "step": 157610 + }, + { + "entropy": 1.8437625631690024, + "epoch": 0.48860820903330004, + "grad_norm": 7.693849086761475, + "learning_rate": 3.6192091197066205e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8549635618925094, + "num_tokens": 189653230.0, + "step": 157620 + }, + { + "entropy": 1.971674808859825, + "epoch": 0.4886392081583497, + "grad_norm": 8.87653923034668, + "learning_rate": 3.619094316386093e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8423131659626961, + "num_tokens": 189663808.0, + "step": 157630 + }, + { + "entropy": 1.9198501527309417, + "epoch": 0.48867020728339944, + "grad_norm": 8.140296936035156, + "learning_rate": 3.6189795239897478e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8481401056051254, + "num_tokens": 189675324.0, + "step": 157640 + }, + { + "entropy": 1.93048115670681, + "epoch": 0.4887012064084491, + "grad_norm": 6.999414443969727, + "learning_rate": 3.618864742515852e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8548283234238625, + "num_tokens": 189686430.0, + "step": 157650 + }, + { + "entropy": 1.856081487238407, + "epoch": 0.48873220553349883, + "grad_norm": 9.344239234924316, + "learning_rate": 3.6187499719626744e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8584906741976738, + "num_tokens": 189698360.0, + "step": 157660 + }, + { + "entropy": 1.8952415823936462, + "epoch": 0.4887632046585485, + "grad_norm": 7.345611572265625, + "learning_rate": 3.6186352123284817e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8519857943058013, + "num_tokens": 189710084.0, + "step": 157670 + }, + { + "entropy": 1.7388610973954202, + "epoch": 0.4887942037835982, + "grad_norm": 3.976619243621826, + "learning_rate": 3.6185204636115446e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8673770546913147, + "num_tokens": 189724014.0, + "step": 157680 + }, + { + "entropy": 1.8221586227416993, + "epoch": 0.4888252029086479, + "grad_norm": 8.260506629943848, + "learning_rate": 3.6184057258101305e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8662657171487809, + "num_tokens": 189736367.0, + "step": 157690 + }, + { + "entropy": 1.8475705727934837, + "epoch": 0.4888562020336976, + "grad_norm": 7.417000770568848, + "learning_rate": 3.6182909989225107e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8679051205515862, + "num_tokens": 189748764.0, + "step": 157700 + }, + { + "entropy": 2.0135974794626237, + "epoch": 0.4888872011587473, + "grad_norm": 8.551652908325195, + "learning_rate": 3.618176282946954e-06, + "loss": 0.5405, + "mean_token_accuracy": 0.8359439983963967, + "num_tokens": 189759508.0, + "step": 157710 + }, + { + "entropy": 1.8791854158043861, + "epoch": 0.488918200283797, + "grad_norm": 8.100190162658691, + "learning_rate": 3.6180615778817304e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8628863275051117, + "num_tokens": 189771238.0, + "step": 157720 + }, + { + "entropy": 1.8653140723705293, + "epoch": 0.4889491994088467, + "grad_norm": 8.358512878417969, + "learning_rate": 3.6179468837251115e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8619488105177879, + "num_tokens": 189783364.0, + "step": 157730 + }, + { + "entropy": 1.8888757541775703, + "epoch": 0.4889801985338964, + "grad_norm": 3.8749780654907227, + "learning_rate": 3.617832200475368e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8521002560853959, + "num_tokens": 189795088.0, + "step": 157740 + }, + { + "entropy": 1.8624941438436509, + "epoch": 0.4890111976589461, + "grad_norm": 8.596406936645508, + "learning_rate": 3.6177175281307715e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8576769649982452, + "num_tokens": 189807667.0, + "step": 157750 + }, + { + "entropy": 1.950662373006344, + "epoch": 0.4890421967839958, + "grad_norm": 7.65726375579834, + "learning_rate": 3.6176028666895936e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8424899771809577, + "num_tokens": 189819412.0, + "step": 157760 + }, + { + "entropy": 1.9022864386439324, + "epoch": 0.48907319590904547, + "grad_norm": 8.7039794921875, + "learning_rate": 3.6174882161501066e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8537297427654267, + "num_tokens": 189830506.0, + "step": 157770 + }, + { + "entropy": 1.892046569287777, + "epoch": 0.48910419503409513, + "grad_norm": 3.2481043338775635, + "learning_rate": 3.6173735765105827e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8614903211593627, + "num_tokens": 189841759.0, + "step": 157780 + }, + { + "entropy": 1.9165176048874855, + "epoch": 0.48913519415914486, + "grad_norm": 5.342856407165527, + "learning_rate": 3.6172589477692956e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8441179916262627, + "num_tokens": 189853777.0, + "step": 157790 + }, + { + "entropy": 1.8486308708786965, + "epoch": 0.48916619328419453, + "grad_norm": 7.395596504211426, + "learning_rate": 3.6171443299245185e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8635935798287392, + "num_tokens": 189865936.0, + "step": 157800 + }, + { + "entropy": 1.8705967336893081, + "epoch": 0.48919719240924425, + "grad_norm": 9.73678207397461, + "learning_rate": 3.617029722974525e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8559140339493752, + "num_tokens": 189878261.0, + "step": 157810 + }, + { + "entropy": 1.8054387748241425, + "epoch": 0.4892281915342939, + "grad_norm": 2.276423931121826, + "learning_rate": 3.6169151269175887e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.847235806286335, + "num_tokens": 189890974.0, + "step": 157820 + }, + { + "entropy": 1.8155124217271805, + "epoch": 0.48925919065934365, + "grad_norm": 8.386927604675293, + "learning_rate": 3.6168005417519857e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8562206014990806, + "num_tokens": 189903505.0, + "step": 157830 + }, + { + "entropy": 1.8468881100416183, + "epoch": 0.4892901897843933, + "grad_norm": 8.004000663757324, + "learning_rate": 3.616685967475989e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8538503974676133, + "num_tokens": 189916014.0, + "step": 157840 + }, + { + "entropy": 1.8874759256839753, + "epoch": 0.48932118890944304, + "grad_norm": 8.225050926208496, + "learning_rate": 3.6165714040878753e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8494992300868034, + "num_tokens": 189927102.0, + "step": 157850 + }, + { + "entropy": 1.9252758368849754, + "epoch": 0.4893521880344927, + "grad_norm": 9.809006690979004, + "learning_rate": 3.61645685158592e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8438394486904144, + "num_tokens": 189938653.0, + "step": 157860 + }, + { + "entropy": 1.86532269269228, + "epoch": 0.48938318715954243, + "grad_norm": 7.953715801239014, + "learning_rate": 3.616342309968398e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8573437079787254, + "num_tokens": 189950573.0, + "step": 157870 + }, + { + "entropy": 1.8941395804286003, + "epoch": 0.4894141862845921, + "grad_norm": 8.05754566192627, + "learning_rate": 3.616227779233587e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8579764619469643, + "num_tokens": 189962373.0, + "step": 157880 + }, + { + "entropy": 1.8559586256742477, + "epoch": 0.4894451854096418, + "grad_norm": 7.713656902313232, + "learning_rate": 3.6161132593797637e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.852620604634285, + "num_tokens": 189975314.0, + "step": 157890 + }, + { + "entropy": 1.875124379992485, + "epoch": 0.4894761845346915, + "grad_norm": 8.812604904174805, + "learning_rate": 3.615998750405205e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8633681252598763, + "num_tokens": 189987539.0, + "step": 157900 + }, + { + "entropy": 1.8811095029115676, + "epoch": 0.4895071836597412, + "grad_norm": 10.135832786560059, + "learning_rate": 3.6158842523081883e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8540236935019493, + "num_tokens": 189999109.0, + "step": 157910 + }, + { + "entropy": 1.8668017193675042, + "epoch": 0.4895381827847909, + "grad_norm": 8.14980411529541, + "learning_rate": 3.6157697650869916e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8578397080302238, + "num_tokens": 190011487.0, + "step": 157920 + }, + { + "entropy": 1.847634233534336, + "epoch": 0.4895691819098406, + "grad_norm": 7.752749919891357, + "learning_rate": 3.6156552887398934e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8659601092338562, + "num_tokens": 190023692.0, + "step": 157930 + }, + { + "entropy": 1.9245047122240067, + "epoch": 0.4896001810348903, + "grad_norm": 8.915438652038574, + "learning_rate": 3.615540823265173e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8451450273394585, + "num_tokens": 190035059.0, + "step": 157940 + }, + { + "entropy": 1.9289063662290573, + "epoch": 0.48963118015994, + "grad_norm": 8.771639823913574, + "learning_rate": 3.615426368661109e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8454332157969475, + "num_tokens": 190046188.0, + "step": 157950 + }, + { + "entropy": 1.8621390245854854, + "epoch": 0.4896621792849897, + "grad_norm": 8.388298034667969, + "learning_rate": 3.61531192492598e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8460786879062653, + "num_tokens": 190058636.0, + "step": 157960 + }, + { + "entropy": 1.9229322642087936, + "epoch": 0.4896931784100394, + "grad_norm": 9.573465347290039, + "learning_rate": 3.6151974920580674e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8481347262859344, + "num_tokens": 190069764.0, + "step": 157970 + }, + { + "entropy": 1.7791677549481393, + "epoch": 0.48972417753508907, + "grad_norm": 3.663013458251953, + "learning_rate": 3.6150830700556498e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.874565900862217, + "num_tokens": 190082703.0, + "step": 157980 + }, + { + "entropy": 1.8474309265613555, + "epoch": 0.4897551766601388, + "grad_norm": 4.272090435028076, + "learning_rate": 3.61496865891701e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8537533566355705, + "num_tokens": 190095209.0, + "step": 157990 + }, + { + "entropy": 1.8715117886662482, + "epoch": 0.48978617578518846, + "grad_norm": 9.850628852844238, + "learning_rate": 3.6148542586404274e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8550691947340965, + "num_tokens": 190107364.0, + "step": 158000 + }, + { + "entropy": 1.8813581451773644, + "epoch": 0.4898171749102382, + "grad_norm": 8.418951988220215, + "learning_rate": 3.614739869224183e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8511038661003113, + "num_tokens": 190118495.0, + "step": 158010 + }, + { + "entropy": 1.8656797528266906, + "epoch": 0.48984817403528785, + "grad_norm": 8.324191093444824, + "learning_rate": 3.6146254906665607e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8558537855744361, + "num_tokens": 190130949.0, + "step": 158020 + }, + { + "entropy": 1.7613300889730454, + "epoch": 0.4898791731603375, + "grad_norm": 8.842034339904785, + "learning_rate": 3.61451112296584e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8550317272543907, + "num_tokens": 190144475.0, + "step": 158030 + }, + { + "entropy": 1.9174160197377206, + "epoch": 0.48991017228538725, + "grad_norm": 9.014627456665039, + "learning_rate": 3.614396766120305e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8600229039788246, + "num_tokens": 190155849.0, + "step": 158040 + }, + { + "entropy": 1.884463222324848, + "epoch": 0.4899411714104369, + "grad_norm": 9.524755477905273, + "learning_rate": 3.614282420128239e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.838143216073513, + "num_tokens": 190166700.0, + "step": 158050 + }, + { + "entropy": 1.7631948590278625, + "epoch": 0.48997217053548664, + "grad_norm": 2.52717661857605, + "learning_rate": 3.614168084987924e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8702189907431602, + "num_tokens": 190180515.0, + "step": 158060 + }, + { + "entropy": 1.871630634367466, + "epoch": 0.4900031696605363, + "grad_norm": 7.53773832321167, + "learning_rate": 3.6140537606976445e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8584669470787049, + "num_tokens": 190192676.0, + "step": 158070 + }, + { + "entropy": 1.9180895671248437, + "epoch": 0.49003416878558603, + "grad_norm": 8.001392364501953, + "learning_rate": 3.613939447255685e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8476955905556679, + "num_tokens": 190203722.0, + "step": 158080 + }, + { + "entropy": 1.8444015264511109, + "epoch": 0.4900651679106357, + "grad_norm": 8.631937980651855, + "learning_rate": 3.6138251446603284e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8547645956277847, + "num_tokens": 190216666.0, + "step": 158090 + }, + { + "entropy": 1.8547025889158248, + "epoch": 0.4900961670356854, + "grad_norm": 8.481095314025879, + "learning_rate": 3.61371085290986e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8580074161291122, + "num_tokens": 190228229.0, + "step": 158100 + }, + { + "entropy": 1.889371046423912, + "epoch": 0.4901271661607351, + "grad_norm": 7.122347831726074, + "learning_rate": 3.613596572002567e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.854512770473957, + "num_tokens": 190239419.0, + "step": 158110 + }, + { + "entropy": 1.7553760528564453, + "epoch": 0.4901581652857848, + "grad_norm": 8.698734283447266, + "learning_rate": 3.6134823019367315e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.864093753695488, + "num_tokens": 190252901.0, + "step": 158120 + }, + { + "entropy": 1.912713533639908, + "epoch": 0.4901891644108345, + "grad_norm": 12.8290376663208, + "learning_rate": 3.613368042710643e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8492612764239311, + "num_tokens": 190264212.0, + "step": 158130 + }, + { + "entropy": 1.8984272867441176, + "epoch": 0.4902201635358842, + "grad_norm": 8.531185150146484, + "learning_rate": 3.6132537943225854e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8525654658675194, + "num_tokens": 190275101.0, + "step": 158140 + }, + { + "entropy": 1.875417274236679, + "epoch": 0.4902511626609339, + "grad_norm": 7.956252574920654, + "learning_rate": 3.6131395567708462e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8634052559733391, + "num_tokens": 190286182.0, + "step": 158150 + }, + { + "entropy": 1.8667438074946403, + "epoch": 0.4902821617859836, + "grad_norm": 8.726470947265625, + "learning_rate": 3.6130253300537122e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.853116650879383, + "num_tokens": 190298040.0, + "step": 158160 + }, + { + "entropy": 1.8496040746569633, + "epoch": 0.4903131609110333, + "grad_norm": 9.257894515991211, + "learning_rate": 3.612911114169471e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8475693762302399, + "num_tokens": 190310368.0, + "step": 158170 + }, + { + "entropy": 1.726231387257576, + "epoch": 0.490344160036083, + "grad_norm": 8.153938293457031, + "learning_rate": 3.612796909116411e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.861776416003704, + "num_tokens": 190324421.0, + "step": 158180 + }, + { + "entropy": 1.91248170286417, + "epoch": 0.49037515916113267, + "grad_norm": 8.336747169494629, + "learning_rate": 3.61268271489282e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8475732728838921, + "num_tokens": 190335616.0, + "step": 158190 + }, + { + "entropy": 1.8628337591886521, + "epoch": 0.4904061582861824, + "grad_norm": 3.811713933944702, + "learning_rate": 3.6125685314969867e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8399553775787354, + "num_tokens": 190348026.0, + "step": 158200 + }, + { + "entropy": 1.8320413410663605, + "epoch": 0.49043715741123206, + "grad_norm": 8.665163040161133, + "learning_rate": 3.6124543589272e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8562450557947159, + "num_tokens": 190360694.0, + "step": 158210 + }, + { + "entropy": 1.8738995507359504, + "epoch": 0.4904681565362818, + "grad_norm": 7.839025497436523, + "learning_rate": 3.6123401971817484e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8596608370542527, + "num_tokens": 190372445.0, + "step": 158220 + }, + { + "entropy": 1.9184532716870308, + "epoch": 0.49049915566133145, + "grad_norm": 7.668222904205322, + "learning_rate": 3.6122260462589237e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8530950948596001, + "num_tokens": 190383789.0, + "step": 158230 + }, + { + "entropy": 1.9229084342718124, + "epoch": 0.4905301547863812, + "grad_norm": 7.734839916229248, + "learning_rate": 3.6121119061570144e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8534473240375519, + "num_tokens": 190395480.0, + "step": 158240 + }, + { + "entropy": 1.8685783818364143, + "epoch": 0.49056115391143085, + "grad_norm": 3.483649253845215, + "learning_rate": 3.6119977768743107e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8567217275500297, + "num_tokens": 190407662.0, + "step": 158250 + }, + { + "entropy": 1.8338368773460387, + "epoch": 0.49059215303648057, + "grad_norm": 8.351194381713867, + "learning_rate": 3.611883658409105e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8560110285878182, + "num_tokens": 190419921.0, + "step": 158260 + }, + { + "entropy": 1.948527753353119, + "epoch": 0.49062315216153024, + "grad_norm": 9.00690746307373, + "learning_rate": 3.6117695507596867e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.8417193755507469, + "num_tokens": 190430689.0, + "step": 158270 + }, + { + "entropy": 1.8747548520565034, + "epoch": 0.4906541512865799, + "grad_norm": 9.07733154296875, + "learning_rate": 3.61165545392435e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8512477323412895, + "num_tokens": 190443343.0, + "step": 158280 + }, + { + "entropy": 1.8400466367602348, + "epoch": 0.49068515041162963, + "grad_norm": 4.46486759185791, + "learning_rate": 3.6115413679013844e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8513605386018753, + "num_tokens": 190456514.0, + "step": 158290 + }, + { + "entropy": 1.8381875991821288, + "epoch": 0.4907161495366793, + "grad_norm": 9.451220512390137, + "learning_rate": 3.611427292689083e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8568148076534271, + "num_tokens": 190468151.0, + "step": 158300 + }, + { + "entropy": 1.8779310151934623, + "epoch": 0.490747148661729, + "grad_norm": 8.845046043395996, + "learning_rate": 3.6113132282857395e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8532233744859695, + "num_tokens": 190479605.0, + "step": 158310 + }, + { + "entropy": 1.8384259521961213, + "epoch": 0.4907781477867787, + "grad_norm": 4.648056507110596, + "learning_rate": 3.6111991746896458e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8502117037773133, + "num_tokens": 190491381.0, + "step": 158320 + }, + { + "entropy": 1.8655261859297751, + "epoch": 0.4908091469118284, + "grad_norm": 6.474485397338867, + "learning_rate": 3.6110851318990962e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8507389679551125, + "num_tokens": 190503415.0, + "step": 158330 + }, + { + "entropy": 1.8652947336435317, + "epoch": 0.4908401460368781, + "grad_norm": 7.278781890869141, + "learning_rate": 3.610971099912384e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8620616480708122, + "num_tokens": 190514775.0, + "step": 158340 + }, + { + "entropy": 1.745438788831234, + "epoch": 0.4908711451619278, + "grad_norm": 7.849375247955322, + "learning_rate": 3.6108570787278046e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8658480405807495, + "num_tokens": 190527699.0, + "step": 158350 + }, + { + "entropy": 1.9015355363488198, + "epoch": 0.4909021442869775, + "grad_norm": 7.331711292266846, + "learning_rate": 3.6107430683436514e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8570471405982971, + "num_tokens": 190539052.0, + "step": 158360 + }, + { + "entropy": 1.8853423982858657, + "epoch": 0.4909331434120272, + "grad_norm": 8.293661117553711, + "learning_rate": 3.6106290687582196e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8404641300439835, + "num_tokens": 190550584.0, + "step": 158370 + }, + { + "entropy": 1.9187031179666518, + "epoch": 0.4909641425370769, + "grad_norm": 7.615777969360352, + "learning_rate": 3.6105150799698063e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8540566086769104, + "num_tokens": 190561826.0, + "step": 158380 + }, + { + "entropy": 1.9027689948678017, + "epoch": 0.4909951416621266, + "grad_norm": 8.56459903717041, + "learning_rate": 3.6104011019767042e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8480705738067627, + "num_tokens": 190573607.0, + "step": 158390 + }, + { + "entropy": 1.8521524950861932, + "epoch": 0.49102614078717627, + "grad_norm": 8.306175231933594, + "learning_rate": 3.610287134777212e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8617270559072494, + "num_tokens": 190585491.0, + "step": 158400 + }, + { + "entropy": 1.883074052631855, + "epoch": 0.491057139912226, + "grad_norm": 7.961977958679199, + "learning_rate": 3.6101731783696254e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8651751860976219, + "num_tokens": 190596588.0, + "step": 158410 + }, + { + "entropy": 1.9026632726192474, + "epoch": 0.49108813903727566, + "grad_norm": 7.978363037109375, + "learning_rate": 3.6100592327522414e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8567787855863571, + "num_tokens": 190607394.0, + "step": 158420 + }, + { + "entropy": 1.856230580806732, + "epoch": 0.4911191381623254, + "grad_norm": 8.092080116271973, + "learning_rate": 3.609945297923357e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8433807224035264, + "num_tokens": 190620235.0, + "step": 158430 + }, + { + "entropy": 1.8610840275883676, + "epoch": 0.49115013728737505, + "grad_norm": 6.581269264221191, + "learning_rate": 3.60983137388127e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8450413525104523, + "num_tokens": 190632374.0, + "step": 158440 + }, + { + "entropy": 1.8135110765695572, + "epoch": 0.4911811364124248, + "grad_norm": 8.886683464050293, + "learning_rate": 3.6097174606242787e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8638947665691376, + "num_tokens": 190645439.0, + "step": 158450 + }, + { + "entropy": 1.9266662746667862, + "epoch": 0.49121213553747445, + "grad_norm": 8.657486915588379, + "learning_rate": 3.609603558150681e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8563274934887886, + "num_tokens": 190656388.0, + "step": 158460 + }, + { + "entropy": 1.8276669576764106, + "epoch": 0.4912431346625242, + "grad_norm": 2.386725664138794, + "learning_rate": 3.6094896664587762e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8620262265205383, + "num_tokens": 190669516.0, + "step": 158470 + }, + { + "entropy": 1.89359792470932, + "epoch": 0.49127413378757384, + "grad_norm": 4.238652229309082, + "learning_rate": 3.6093757855468625e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8525636330246925, + "num_tokens": 190681421.0, + "step": 158480 + }, + { + "entropy": 1.914654791355133, + "epoch": 0.49130513291262357, + "grad_norm": 6.82390022277832, + "learning_rate": 3.6092619154132415e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8525402382016182, + "num_tokens": 190692959.0, + "step": 158490 + }, + { + "entropy": 1.8836501479148864, + "epoch": 0.49133613203767323, + "grad_norm": 4.361353397369385, + "learning_rate": 3.6091480560562114e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8494673162698746, + "num_tokens": 190704902.0, + "step": 158500 + }, + { + "entropy": 1.901443050801754, + "epoch": 0.49136713116272296, + "grad_norm": 3.428273916244507, + "learning_rate": 3.6090342074740727e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8393049791455269, + "num_tokens": 190717320.0, + "step": 158510 + }, + { + "entropy": 1.9237724661827087, + "epoch": 0.4913981302877726, + "grad_norm": 8.968462944030762, + "learning_rate": 3.608920369665126e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8518886238336563, + "num_tokens": 190728541.0, + "step": 158520 + }, + { + "entropy": 1.9531280279159546, + "epoch": 0.4914291294128223, + "grad_norm": 9.573519706726074, + "learning_rate": 3.6088065426276725e-06, + "loss": 0.5737, + "mean_token_accuracy": 0.8390479683876038, + "num_tokens": 190740022.0, + "step": 158530 + }, + { + "entropy": 1.9052818953990935, + "epoch": 0.491460128537872, + "grad_norm": 9.276761054992676, + "learning_rate": 3.6086927263600148e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8530030116438866, + "num_tokens": 190751019.0, + "step": 158540 + }, + { + "entropy": 1.8869039386510849, + "epoch": 0.4914911276629217, + "grad_norm": 4.146981716156006, + "learning_rate": 3.6085789208604527e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8596585467457771, + "num_tokens": 190763800.0, + "step": 158550 + }, + { + "entropy": 1.7681370228528976, + "epoch": 0.4915221267879714, + "grad_norm": 3.943372964859009, + "learning_rate": 3.6084651261272897e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8705621913075448, + "num_tokens": 190777433.0, + "step": 158560 + }, + { + "entropy": 1.9185601070523262, + "epoch": 0.4915531259130211, + "grad_norm": 7.802610397338867, + "learning_rate": 3.6083513421588275e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8546137645840645, + "num_tokens": 190789867.0, + "step": 158570 + }, + { + "entropy": 1.9287995994091034, + "epoch": 0.4915841250380708, + "grad_norm": 8.18673324584961, + "learning_rate": 3.6082375689533694e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8485715165734291, + "num_tokens": 190801836.0, + "step": 158580 + }, + { + "entropy": 1.7540825635194779, + "epoch": 0.4916151241631205, + "grad_norm": 4.475728511810303, + "learning_rate": 3.6081238065092192e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8646261185407639, + "num_tokens": 190816330.0, + "step": 158590 + }, + { + "entropy": 1.9391558408737182, + "epoch": 0.4916461232881702, + "grad_norm": 4.104739189147949, + "learning_rate": 3.60801005482468e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.8371262863278389, + "num_tokens": 190828313.0, + "step": 158600 + }, + { + "entropy": 1.9185767412185668, + "epoch": 0.49167712241321987, + "grad_norm": 8.502923011779785, + "learning_rate": 3.607896313898056e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8552430674433709, + "num_tokens": 190840723.0, + "step": 158610 + }, + { + "entropy": 1.9190486401319504, + "epoch": 0.4917081215382696, + "grad_norm": 3.9274349212646484, + "learning_rate": 3.6077825837276513e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8470717743039131, + "num_tokens": 190852704.0, + "step": 158620 + }, + { + "entropy": 1.9390907883644104, + "epoch": 0.49173912066331926, + "grad_norm": 8.89169979095459, + "learning_rate": 3.607668864311771e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8512443661689758, + "num_tokens": 190863472.0, + "step": 158630 + }, + { + "entropy": 1.9003855600953101, + "epoch": 0.491770119788369, + "grad_norm": 6.889798641204834, + "learning_rate": 3.607555155648721e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8536323562264443, + "num_tokens": 190875506.0, + "step": 158640 + }, + { + "entropy": 1.9173786222934723, + "epoch": 0.49180111891341866, + "grad_norm": 8.534137725830078, + "learning_rate": 3.6074414577368046e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8548526600003242, + "num_tokens": 190887431.0, + "step": 158650 + }, + { + "entropy": 1.9805822908878326, + "epoch": 0.4918321180384684, + "grad_norm": 8.78432846069336, + "learning_rate": 3.60732777057433e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8482834473252296, + "num_tokens": 190898042.0, + "step": 158660 + }, + { + "entropy": 1.9581329673528671, + "epoch": 0.49186311716351805, + "grad_norm": 7.826303958892822, + "learning_rate": 3.607214094159603e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8571975663304329, + "num_tokens": 190909409.0, + "step": 158670 + }, + { + "entropy": 1.906209309399128, + "epoch": 0.4918941162885678, + "grad_norm": 8.253519058227539, + "learning_rate": 3.6071004284909293e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8512861356139183, + "num_tokens": 190921294.0, + "step": 158680 + }, + { + "entropy": 1.9099845930933952, + "epoch": 0.49192511541361744, + "grad_norm": 3.3048346042633057, + "learning_rate": 3.606986773566617e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.851481918990612, + "num_tokens": 190933707.0, + "step": 158690 + }, + { + "entropy": 1.8390526965260505, + "epoch": 0.49195611453866717, + "grad_norm": 3.801232099533081, + "learning_rate": 3.606873129384972e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8585761964321137, + "num_tokens": 190946405.0, + "step": 158700 + }, + { + "entropy": 1.8441692113876342, + "epoch": 0.49198711366371684, + "grad_norm": 8.876874923706055, + "learning_rate": 3.606759495944304e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8560978010296821, + "num_tokens": 190959067.0, + "step": 158710 + }, + { + "entropy": 1.8029930278658868, + "epoch": 0.49201811278876656, + "grad_norm": 8.655598640441895, + "learning_rate": 3.6066458732429203e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.857164989411831, + "num_tokens": 190972623.0, + "step": 158720 + }, + { + "entropy": 1.9350174590945244, + "epoch": 0.49204911191381623, + "grad_norm": 9.002110481262207, + "learning_rate": 3.6065322612791293e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.848691463470459, + "num_tokens": 190984331.0, + "step": 158730 + }, + { + "entropy": 1.8599534809589386, + "epoch": 0.49208011103886595, + "grad_norm": 3.756072759628296, + "learning_rate": 3.60641866005124e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.846306300163269, + "num_tokens": 190997036.0, + "step": 158740 + }, + { + "entropy": 1.8874867737293244, + "epoch": 0.4921111101639156, + "grad_norm": 8.43781852722168, + "learning_rate": 3.6063050695575613e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8569997996091843, + "num_tokens": 191009039.0, + "step": 158750 + }, + { + "entropy": 1.9412825867533683, + "epoch": 0.49214210928896535, + "grad_norm": 8.520960807800293, + "learning_rate": 3.6061914897964035e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.8366198286414146, + "num_tokens": 191020334.0, + "step": 158760 + }, + { + "entropy": 1.9006411105394363, + "epoch": 0.492173108414015, + "grad_norm": 10.549320220947266, + "learning_rate": 3.606077920766076e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.845074562728405, + "num_tokens": 191032363.0, + "step": 158770 + }, + { + "entropy": 1.9263049215078354, + "epoch": 0.4922041075390647, + "grad_norm": 3.9291138648986816, + "learning_rate": 3.6059643624648898e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8536166071891784, + "num_tokens": 191044148.0, + "step": 158780 + }, + { + "entropy": 1.9111416772007943, + "epoch": 0.4922351066641144, + "grad_norm": 3.9041879177093506, + "learning_rate": 3.6058508148911555e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8473462641239167, + "num_tokens": 191055737.0, + "step": 158790 + }, + { + "entropy": 1.8865627378225327, + "epoch": 0.4922661057891641, + "grad_norm": 4.586888790130615, + "learning_rate": 3.605737278043184e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8481022253632545, + "num_tokens": 191069166.0, + "step": 158800 + }, + { + "entropy": 1.872072483599186, + "epoch": 0.4922971049142138, + "grad_norm": 8.540482521057129, + "learning_rate": 3.6056237519192867e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8634131327271461, + "num_tokens": 191082087.0, + "step": 158810 + }, + { + "entropy": 1.9465556621551514, + "epoch": 0.49232810403926347, + "grad_norm": 8.96761417388916, + "learning_rate": 3.605510236517776e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8466716349124909, + "num_tokens": 191093735.0, + "step": 158820 + }, + { + "entropy": 1.8635354220867157, + "epoch": 0.4923591031643132, + "grad_norm": 7.013364791870117, + "learning_rate": 3.6053967318369633e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8506390795111656, + "num_tokens": 191106221.0, + "step": 158830 + }, + { + "entropy": 1.8185190051794051, + "epoch": 0.49239010228936286, + "grad_norm": 9.036824226379395, + "learning_rate": 3.6052832378751617e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8497332945466042, + "num_tokens": 191119668.0, + "step": 158840 + }, + { + "entropy": 1.915633998811245, + "epoch": 0.4924211014144126, + "grad_norm": 4.1883673667907715, + "learning_rate": 3.6051697546306846e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8417811617255211, + "num_tokens": 191132314.0, + "step": 158850 + }, + { + "entropy": 1.9364984586834908, + "epoch": 0.49245210053946226, + "grad_norm": 7.76389217376709, + "learning_rate": 3.6050562821018447e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8470098197460174, + "num_tokens": 191143993.0, + "step": 158860 + }, + { + "entropy": 1.8327162981033325, + "epoch": 0.492483099664512, + "grad_norm": 7.964986324310303, + "learning_rate": 3.604942820286957e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8613449767231941, + "num_tokens": 191156864.0, + "step": 158870 + }, + { + "entropy": 1.8690617755055428, + "epoch": 0.49251409878956165, + "grad_norm": 4.35117244720459, + "learning_rate": 3.6048293691843333e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8491967663168907, + "num_tokens": 191169304.0, + "step": 158880 + }, + { + "entropy": 1.8968679785728455, + "epoch": 0.4925450979146114, + "grad_norm": 4.414698123931885, + "learning_rate": 3.6047159287922902e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8521355882287025, + "num_tokens": 191181207.0, + "step": 158890 + }, + { + "entropy": 1.9356569901108742, + "epoch": 0.49257609703966104, + "grad_norm": 8.631135940551758, + "learning_rate": 3.6046024991091415e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.8320743992924691, + "num_tokens": 191192832.0, + "step": 158900 + }, + { + "entropy": 1.8540920540690422, + "epoch": 0.49260709616471077, + "grad_norm": 7.090327262878418, + "learning_rate": 3.604489080133203e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8626075640320778, + "num_tokens": 191205382.0, + "step": 158910 + }, + { + "entropy": 1.8661150723695754, + "epoch": 0.49263809528976044, + "grad_norm": 4.442000389099121, + "learning_rate": 3.604375671862789e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8547174111008644, + "num_tokens": 191218404.0, + "step": 158920 + }, + { + "entropy": 1.9430969834327698, + "epoch": 0.49266909441481016, + "grad_norm": 7.606647968292236, + "learning_rate": 3.604262274296218e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8519912898540497, + "num_tokens": 191230222.0, + "step": 158930 + }, + { + "entropy": 1.8457999877631663, + "epoch": 0.49270009353985983, + "grad_norm": 7.863997936248779, + "learning_rate": 3.6041488874318038e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8659952461719513, + "num_tokens": 191242348.0, + "step": 158940 + }, + { + "entropy": 1.9445855423808098, + "epoch": 0.49273109266490955, + "grad_norm": 10.894515037536621, + "learning_rate": 3.6040355112678643e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8433070674538612, + "num_tokens": 191254529.0, + "step": 158950 + }, + { + "entropy": 1.8986426308751105, + "epoch": 0.4927620917899592, + "grad_norm": 9.425984382629395, + "learning_rate": 3.6039221458027167e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.84424237459898, + "num_tokens": 191267476.0, + "step": 158960 + }, + { + "entropy": 1.8787411868572235, + "epoch": 0.49279309091500895, + "grad_norm": 7.693811416625977, + "learning_rate": 3.6038087910346775e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8551173999905586, + "num_tokens": 191279350.0, + "step": 158970 + }, + { + "entropy": 1.8206491246819496, + "epoch": 0.4928240900400586, + "grad_norm": 5.349733829498291, + "learning_rate": 3.6036954469620656e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.858145822584629, + "num_tokens": 191292641.0, + "step": 158980 + }, + { + "entropy": 1.9382214292883873, + "epoch": 0.49285508916510834, + "grad_norm": 8.99599838256836, + "learning_rate": 3.603582113583198e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8653192713856697, + "num_tokens": 191303702.0, + "step": 158990 + }, + { + "entropy": 1.9484039723873139, + "epoch": 0.492886088290158, + "grad_norm": 9.494302749633789, + "learning_rate": 3.6034687908963946e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8453142330050468, + "num_tokens": 191315049.0, + "step": 159000 + }, + { + "entropy": 1.9259200036525725, + "epoch": 0.49291708741520773, + "grad_norm": 8.478474617004395, + "learning_rate": 3.6033554788999735e-06, + "loss": 0.5419, + "mean_token_accuracy": 0.8397252514958382, + "num_tokens": 191326984.0, + "step": 159010 + }, + { + "entropy": 1.7619906887412071, + "epoch": 0.4929480865402574, + "grad_norm": 7.3137407302856445, + "learning_rate": 3.603242177592254e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8664394572377205, + "num_tokens": 191340485.0, + "step": 159020 + }, + { + "entropy": 1.9478530764579773, + "epoch": 0.49297908566530707, + "grad_norm": 8.351778984069824, + "learning_rate": 3.6031288869715564e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8485710576176644, + "num_tokens": 191351634.0, + "step": 159030 + }, + { + "entropy": 1.8213827714323998, + "epoch": 0.4930100847903568, + "grad_norm": 7.855036735534668, + "learning_rate": 3.6030156070362e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8727815434336662, + "num_tokens": 191364782.0, + "step": 159040 + }, + { + "entropy": 1.7960595518350602, + "epoch": 0.49304108391540646, + "grad_norm": 2.6937222480773926, + "learning_rate": 3.6029023377845047e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8672909617424012, + "num_tokens": 191377748.0, + "step": 159050 + }, + { + "entropy": 1.8690377235412599, + "epoch": 0.4930720830404562, + "grad_norm": 4.590978622436523, + "learning_rate": 3.6027890792147935e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8542490854859353, + "num_tokens": 191390080.0, + "step": 159060 + }, + { + "entropy": 1.7864416763186455, + "epoch": 0.49310308216550586, + "grad_norm": 8.749615669250488, + "learning_rate": 3.602675831325385e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8751874417066574, + "num_tokens": 191403446.0, + "step": 159070 + }, + { + "entropy": 1.9172361060976981, + "epoch": 0.4931340812905556, + "grad_norm": 8.16201114654541, + "learning_rate": 3.6025625941146024e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8378383472561837, + "num_tokens": 191415168.0, + "step": 159080 + }, + { + "entropy": 1.8703493565320968, + "epoch": 0.49316508041560525, + "grad_norm": 8.217545509338379, + "learning_rate": 3.602449367580767e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.852349478006363, + "num_tokens": 191426771.0, + "step": 159090 + }, + { + "entropy": 1.9466371893882752, + "epoch": 0.493196079540655, + "grad_norm": 8.871587753295898, + "learning_rate": 3.6023361517222004e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8483422711491585, + "num_tokens": 191437750.0, + "step": 159100 + }, + { + "entropy": 1.9135088309645654, + "epoch": 0.49322707866570464, + "grad_norm": 8.855616569519043, + "learning_rate": 3.6022229465372273e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.8475657269358635, + "num_tokens": 191449796.0, + "step": 159110 + }, + { + "entropy": 1.7874897986650466, + "epoch": 0.49325807779075437, + "grad_norm": 2.3957784175872803, + "learning_rate": 3.6021097520241676e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8699712991714478, + "num_tokens": 191462458.0, + "step": 159120 + }, + { + "entropy": 1.9435395896434784, + "epoch": 0.49328907691580404, + "grad_norm": 7.960422515869141, + "learning_rate": 3.6019965681813475e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8521071627736092, + "num_tokens": 191473434.0, + "step": 159130 + }, + { + "entropy": 1.9279766619205474, + "epoch": 0.49332007604085376, + "grad_norm": 3.857520341873169, + "learning_rate": 3.601883395007089e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.8442885830998421, + "num_tokens": 191484903.0, + "step": 159140 + }, + { + "entropy": 1.8475008338689805, + "epoch": 0.49335107516590343, + "grad_norm": 6.955073356628418, + "learning_rate": 3.6017702324997172e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8446203991770744, + "num_tokens": 191498120.0, + "step": 159150 + }, + { + "entropy": 1.9266953021287918, + "epoch": 0.49338207429095315, + "grad_norm": 8.705105781555176, + "learning_rate": 3.6016570806575563e-06, + "loss": 0.5382, + "mean_token_accuracy": 0.8330066472291946, + "num_tokens": 191509278.0, + "step": 159160 + }, + { + "entropy": 1.8248368367552756, + "epoch": 0.4934130734160028, + "grad_norm": 7.812545299530029, + "learning_rate": 3.601543939478931e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8588651672005654, + "num_tokens": 191521592.0, + "step": 159170 + }, + { + "entropy": 1.8829329013824463, + "epoch": 0.49344407254105255, + "grad_norm": 8.274620056152344, + "learning_rate": 3.6014308089621654e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8415772512555122, + "num_tokens": 191533098.0, + "step": 159180 + }, + { + "entropy": 1.8726583898067475, + "epoch": 0.4934750716661022, + "grad_norm": 9.604278564453125, + "learning_rate": 3.601317689105587e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8552197560667991, + "num_tokens": 191544887.0, + "step": 159190 + }, + { + "entropy": 1.859221415221691, + "epoch": 0.49350607079115194, + "grad_norm": 6.692994117736816, + "learning_rate": 3.6012045799075205e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8668141156435013, + "num_tokens": 191556853.0, + "step": 159200 + }, + { + "entropy": 1.9711533397436143, + "epoch": 0.4935370699162016, + "grad_norm": 9.214414596557617, + "learning_rate": 3.6010914813662927e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8427471235394478, + "num_tokens": 191568177.0, + "step": 159210 + }, + { + "entropy": 1.8187393069267273, + "epoch": 0.49356806904125133, + "grad_norm": 8.627388954162598, + "learning_rate": 3.60097839348023e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8565979853272438, + "num_tokens": 191580641.0, + "step": 159220 + }, + { + "entropy": 1.8383538708090783, + "epoch": 0.493599068166301, + "grad_norm": 3.850691795349121, + "learning_rate": 3.6008653162476602e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8547204375267029, + "num_tokens": 191593431.0, + "step": 159230 + }, + { + "entropy": 1.8895878791809082, + "epoch": 0.4936300672913507, + "grad_norm": 7.909356117248535, + "learning_rate": 3.6007522496669095e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8444205358624458, + "num_tokens": 191605046.0, + "step": 159240 + }, + { + "entropy": 1.9098277121782303, + "epoch": 0.4936610664164004, + "grad_norm": 6.846156597137451, + "learning_rate": 3.6006391937363068e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8568753972649574, + "num_tokens": 191616417.0, + "step": 159250 + }, + { + "entropy": 1.8749052241444588, + "epoch": 0.4936920655414501, + "grad_norm": 8.896378517150879, + "learning_rate": 3.600526148454179e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.847206848859787, + "num_tokens": 191628583.0, + "step": 159260 + }, + { + "entropy": 1.843507680296898, + "epoch": 0.4937230646664998, + "grad_norm": 8.739633560180664, + "learning_rate": 3.6004131138188563e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8644677430391312, + "num_tokens": 191641180.0, + "step": 159270 + }, + { + "entropy": 1.928541123867035, + "epoch": 0.49375406379154946, + "grad_norm": 7.948415279388428, + "learning_rate": 3.6003000898286653e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8485148876905442, + "num_tokens": 191652064.0, + "step": 159280 + }, + { + "entropy": 1.8564437612891198, + "epoch": 0.4937850629165992, + "grad_norm": 3.7542104721069336, + "learning_rate": 3.6001870764819376e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8547224655747414, + "num_tokens": 191664242.0, + "step": 159290 + }, + { + "entropy": 1.886579157412052, + "epoch": 0.49381606204164885, + "grad_norm": 7.148205757141113, + "learning_rate": 3.6000740737770015e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8600714549422264, + "num_tokens": 191676071.0, + "step": 159300 + }, + { + "entropy": 1.9193453639745712, + "epoch": 0.4938470611666986, + "grad_norm": 7.308450222015381, + "learning_rate": 3.599961081712188e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8472814872860909, + "num_tokens": 191687806.0, + "step": 159310 + }, + { + "entropy": 1.9151620626449586, + "epoch": 0.49387806029174824, + "grad_norm": 3.6540892124176025, + "learning_rate": 3.5998481002858256e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8533779740333557, + "num_tokens": 191698520.0, + "step": 159320 + }, + { + "entropy": 1.8681719586253167, + "epoch": 0.49390905941679797, + "grad_norm": 9.010276794433594, + "learning_rate": 3.5997351294962464e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8576828598976135, + "num_tokens": 191711297.0, + "step": 159330 + }, + { + "entropy": 1.8712261497974396, + "epoch": 0.49394005854184764, + "grad_norm": 7.325274467468262, + "learning_rate": 3.5996221693417817e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8589169010519981, + "num_tokens": 191723406.0, + "step": 159340 + }, + { + "entropy": 1.8066833555698394, + "epoch": 0.49397105766689736, + "grad_norm": 3.5466883182525635, + "learning_rate": 3.599509219820762e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8577698558568955, + "num_tokens": 191736967.0, + "step": 159350 + }, + { + "entropy": 1.7822127223014832, + "epoch": 0.49400205679194703, + "grad_norm": 4.0035576820373535, + "learning_rate": 3.5993962809315197e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8697702825069428, + "num_tokens": 191750056.0, + "step": 159360 + }, + { + "entropy": 1.936877228319645, + "epoch": 0.49403305591699676, + "grad_norm": 9.158493041992188, + "learning_rate": 3.5992833526723876e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8464233413338661, + "num_tokens": 191761365.0, + "step": 159370 + }, + { + "entropy": 1.8716009959578515, + "epoch": 0.4940640550420464, + "grad_norm": 9.602849006652832, + "learning_rate": 3.5991704350416963e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8516163632273674, + "num_tokens": 191773293.0, + "step": 159380 + }, + { + "entropy": 1.8163740314543246, + "epoch": 0.49409505416709615, + "grad_norm": 1.8862295150756836, + "learning_rate": 3.5990575280377803e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8576122790575027, + "num_tokens": 191786131.0, + "step": 159390 + }, + { + "entropy": 1.9228911444544792, + "epoch": 0.4941260532921458, + "grad_norm": 6.864531993865967, + "learning_rate": 3.5989446316589728e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8586970806121826, + "num_tokens": 191797537.0, + "step": 159400 + }, + { + "entropy": 1.8249765798449515, + "epoch": 0.49415705241719554, + "grad_norm": 9.325008392333984, + "learning_rate": 3.5988317459036063e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8683119297027588, + "num_tokens": 191810309.0, + "step": 159410 + }, + { + "entropy": 1.8564854457974433, + "epoch": 0.4941880515422452, + "grad_norm": 9.13007926940918, + "learning_rate": 3.5987188707700173e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8559637442231178, + "num_tokens": 191821949.0, + "step": 159420 + }, + { + "entropy": 1.9013855665922166, + "epoch": 0.49421905066729493, + "grad_norm": 7.891731262207031, + "learning_rate": 3.598606006256537e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8630088746547699, + "num_tokens": 191833045.0, + "step": 159430 + }, + { + "entropy": 1.8180122032761574, + "epoch": 0.4942500497923446, + "grad_norm": 7.655247688293457, + "learning_rate": 3.5984931523615023e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8637914076447487, + "num_tokens": 191845436.0, + "step": 159440 + }, + { + "entropy": 1.949680632352829, + "epoch": 0.49428104891739433, + "grad_norm": 8.30689525604248, + "learning_rate": 3.5983803090832483e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8543712347745895, + "num_tokens": 191856433.0, + "step": 159450 + }, + { + "entropy": 1.9027078568935394, + "epoch": 0.494312048042444, + "grad_norm": 9.063887596130371, + "learning_rate": 3.5982674764201085e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8519957110285759, + "num_tokens": 191867610.0, + "step": 159460 + }, + { + "entropy": 1.9152158468961715, + "epoch": 0.4943430471674937, + "grad_norm": 7.915401935577393, + "learning_rate": 3.598154654370421e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8566673934459687, + "num_tokens": 191878868.0, + "step": 159470 + }, + { + "entropy": 1.9054709061980248, + "epoch": 0.4943740462925434, + "grad_norm": 7.502203941345215, + "learning_rate": 3.598041842932521e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.863150903582573, + "num_tokens": 191890203.0, + "step": 159480 + }, + { + "entropy": 1.8717289566993713, + "epoch": 0.4944050454175931, + "grad_norm": 8.5286283493042, + "learning_rate": 3.5979290421047445e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.851759298145771, + "num_tokens": 191901799.0, + "step": 159490 + }, + { + "entropy": 1.8034158065915107, + "epoch": 0.4944360445426428, + "grad_norm": 7.359280109405518, + "learning_rate": 3.5978162518854305e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8661789894104004, + "num_tokens": 191914715.0, + "step": 159500 + }, + { + "entropy": 1.900420169532299, + "epoch": 0.49446704366769245, + "grad_norm": 7.568366527557373, + "learning_rate": 3.5977034722729138e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8520032197237015, + "num_tokens": 191926711.0, + "step": 159510 + }, + { + "entropy": 1.8336208701133727, + "epoch": 0.4944980427927422, + "grad_norm": 3.925819158554077, + "learning_rate": 3.5975907032655337e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8600850090384483, + "num_tokens": 191939407.0, + "step": 159520 + }, + { + "entropy": 1.8698050826787949, + "epoch": 0.49452904191779185, + "grad_norm": 8.200296401977539, + "learning_rate": 3.5974779448616272e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8596655920147895, + "num_tokens": 191951027.0, + "step": 159530 + }, + { + "entropy": 1.9370542958378791, + "epoch": 0.49456004104284157, + "grad_norm": 8.19310474395752, + "learning_rate": 3.5973651970595336e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8577198967337608, + "num_tokens": 191962661.0, + "step": 159540 + }, + { + "entropy": 1.9390544414520263, + "epoch": 0.49459104016789124, + "grad_norm": 8.382871627807617, + "learning_rate": 3.5972524598575907e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.855408425629139, + "num_tokens": 191973798.0, + "step": 159550 + }, + { + "entropy": 1.8777232959866523, + "epoch": 0.49462203929294096, + "grad_norm": 8.35791015625, + "learning_rate": 3.597139733254139e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.860966557264328, + "num_tokens": 191985741.0, + "step": 159560 + }, + { + "entropy": 1.9498206496238708, + "epoch": 0.49465303841799063, + "grad_norm": 8.334501266479492, + "learning_rate": 3.597027017247517e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8421608254313468, + "num_tokens": 191997083.0, + "step": 159570 + }, + { + "entropy": 1.9712836146354675, + "epoch": 0.49468403754304036, + "grad_norm": 8.104325294494629, + "learning_rate": 3.5969143118360645e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.850074777007103, + "num_tokens": 192008075.0, + "step": 159580 + }, + { + "entropy": 1.9014674112200738, + "epoch": 0.49471503666809, + "grad_norm": 8.756881713867188, + "learning_rate": 3.596801617018122e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8585664972662925, + "num_tokens": 192020574.0, + "step": 159590 + }, + { + "entropy": 1.9716274067759514, + "epoch": 0.49474603579313975, + "grad_norm": 9.641308784484863, + "learning_rate": 3.5966889327920303e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.8372088670730591, + "num_tokens": 192032036.0, + "step": 159600 + }, + { + "entropy": 1.9579774558544158, + "epoch": 0.4947770349181894, + "grad_norm": 8.46997356414795, + "learning_rate": 3.596576259156129e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8514121487736702, + "num_tokens": 192043288.0, + "step": 159610 + }, + { + "entropy": 1.9294979020953178, + "epoch": 0.49480803404323914, + "grad_norm": 8.299081802368164, + "learning_rate": 3.5964635961087614e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8471070304512978, + "num_tokens": 192055247.0, + "step": 159620 + }, + { + "entropy": 1.9172394633293153, + "epoch": 0.4948390331682888, + "grad_norm": 7.675030708312988, + "learning_rate": 3.596350943648268e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8571576774120331, + "num_tokens": 192066190.0, + "step": 159630 + }, + { + "entropy": 1.9542252331972123, + "epoch": 0.49487003229333854, + "grad_norm": 7.442762851715088, + "learning_rate": 3.5962383017729907e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.854285454750061, + "num_tokens": 192077133.0, + "step": 159640 + }, + { + "entropy": 1.9150119140744208, + "epoch": 0.4949010314183882, + "grad_norm": 7.354382514953613, + "learning_rate": 3.596125670481273e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8513759151101112, + "num_tokens": 192088338.0, + "step": 159650 + }, + { + "entropy": 1.9242964759469032, + "epoch": 0.49493203054343793, + "grad_norm": 8.360685348510742, + "learning_rate": 3.596013049771456e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8601465240120888, + "num_tokens": 192099903.0, + "step": 159660 + }, + { + "entropy": 1.8404909834265708, + "epoch": 0.4949630296684876, + "grad_norm": 6.913203239440918, + "learning_rate": 3.5959004396418847e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8578110337257385, + "num_tokens": 192112770.0, + "step": 159670 + }, + { + "entropy": 1.922998534142971, + "epoch": 0.4949940287935373, + "grad_norm": 4.097351551055908, + "learning_rate": 3.595787840090901e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8465961366891861, + "num_tokens": 192124121.0, + "step": 159680 + }, + { + "entropy": 1.921201765537262, + "epoch": 0.495025027918587, + "grad_norm": 8.46877384185791, + "learning_rate": 3.5956752511168493e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8489266246557235, + "num_tokens": 192135725.0, + "step": 159690 + }, + { + "entropy": 1.795041662454605, + "epoch": 0.4950560270436367, + "grad_norm": 2.492420196533203, + "learning_rate": 3.5955626727180743e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.865375991165638, + "num_tokens": 192149338.0, + "step": 159700 + }, + { + "entropy": 1.9332377836108208, + "epoch": 0.4950870261686864, + "grad_norm": 8.110081672668457, + "learning_rate": 3.59545010489292e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.8450874507427215, + "num_tokens": 192160823.0, + "step": 159710 + }, + { + "entropy": 1.8410456269979476, + "epoch": 0.4951180252937361, + "grad_norm": 6.63314151763916, + "learning_rate": 3.595337547639732e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.855519250035286, + "num_tokens": 192174110.0, + "step": 159720 + }, + { + "entropy": 1.9845334231853484, + "epoch": 0.4951490244187858, + "grad_norm": 9.71750545501709, + "learning_rate": 3.5952250009568545e-06, + "loss": 0.494, + "mean_token_accuracy": 0.85206418633461, + "num_tokens": 192185155.0, + "step": 159730 + }, + { + "entropy": 1.8927864357829094, + "epoch": 0.4951800235438355, + "grad_norm": 3.551086187362671, + "learning_rate": 3.595112464842634e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8464046537876129, + "num_tokens": 192197012.0, + "step": 159740 + }, + { + "entropy": 1.8689477637410163, + "epoch": 0.49521102266888517, + "grad_norm": 8.574982643127441, + "learning_rate": 3.594999939295416e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8616869121789932, + "num_tokens": 192209129.0, + "step": 159750 + }, + { + "entropy": 1.9469447121024133, + "epoch": 0.49524202179393484, + "grad_norm": 8.655628204345703, + "learning_rate": 3.5948874243135472e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8474073946475983, + "num_tokens": 192220445.0, + "step": 159760 + }, + { + "entropy": 1.8619813948869706, + "epoch": 0.49527302091898456, + "grad_norm": 7.8713297843933105, + "learning_rate": 3.5947749198953753e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8450932338833809, + "num_tokens": 192232925.0, + "step": 159770 + }, + { + "entropy": 1.7789973124861718, + "epoch": 0.49530402004403423, + "grad_norm": 2.4723801612854004, + "learning_rate": 3.5946624260392455e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8597326517105103, + "num_tokens": 192246932.0, + "step": 159780 + }, + { + "entropy": 1.9105781406164168, + "epoch": 0.49533501916908396, + "grad_norm": 5.2691850662231445, + "learning_rate": 3.5945499427435066e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8449198469519615, + "num_tokens": 192258019.0, + "step": 159790 + }, + { + "entropy": 1.8505356892943383, + "epoch": 0.4953660182941336, + "grad_norm": 8.916192054748535, + "learning_rate": 3.5944374700065052e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.855469511449337, + "num_tokens": 192270524.0, + "step": 159800 + }, + { + "entropy": 1.896497841179371, + "epoch": 0.49539701741918335, + "grad_norm": 3.9803407192230225, + "learning_rate": 3.5943250078265917e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8532499849796296, + "num_tokens": 192281947.0, + "step": 159810 + }, + { + "entropy": 1.8115594327449798, + "epoch": 0.495428016544233, + "grad_norm": 9.716015815734863, + "learning_rate": 3.594212556202113e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8568425461649894, + "num_tokens": 192295045.0, + "step": 159820 + }, + { + "entropy": 1.8505840301513672, + "epoch": 0.49545901566928274, + "grad_norm": 4.513120174407959, + "learning_rate": 3.594100115131418e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8524532780051232, + "num_tokens": 192307333.0, + "step": 159830 + }, + { + "entropy": 1.889463298022747, + "epoch": 0.4954900147943324, + "grad_norm": 4.289556503295898, + "learning_rate": 3.5939876846128567e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8482072561979294, + "num_tokens": 192319247.0, + "step": 159840 + }, + { + "entropy": 1.8135507375001907, + "epoch": 0.49552101391938214, + "grad_norm": 3.4432010650634766, + "learning_rate": 3.5938752646447785e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8619725123047829, + "num_tokens": 192332304.0, + "step": 159850 + }, + { + "entropy": 1.7961521610617637, + "epoch": 0.4955520130444318, + "grad_norm": 3.9302399158477783, + "learning_rate": 3.5937628552255325e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8562786504626274, + "num_tokens": 192345314.0, + "step": 159860 + }, + { + "entropy": 1.8858400031924247, + "epoch": 0.49558301216948153, + "grad_norm": 8.590622901916504, + "learning_rate": 3.593650456353471e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8526899054646492, + "num_tokens": 192357612.0, + "step": 159870 + }, + { + "entropy": 1.9095351725816727, + "epoch": 0.4956140112945312, + "grad_norm": 7.664645671844482, + "learning_rate": 3.593538068026943e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8507430016994476, + "num_tokens": 192369306.0, + "step": 159880 + }, + { + "entropy": 1.8387876883149148, + "epoch": 0.4956450104195809, + "grad_norm": 9.930180549621582, + "learning_rate": 3.593425690244299e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8595908388495446, + "num_tokens": 192381626.0, + "step": 159890 + }, + { + "entropy": 1.9600819304585457, + "epoch": 0.4956760095446306, + "grad_norm": 9.37524127960205, + "learning_rate": 3.5933133230038935e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8463097050786018, + "num_tokens": 192393259.0, + "step": 159900 + }, + { + "entropy": 1.9440770626068116, + "epoch": 0.4957070086696803, + "grad_norm": 7.7554755210876465, + "learning_rate": 3.5932009663040756e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8595362901687622, + "num_tokens": 192404060.0, + "step": 159910 + }, + { + "entropy": 1.6832211181521415, + "epoch": 0.49573800779473, + "grad_norm": 3.3963851928710938, + "learning_rate": 3.593088620143198e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8765401244163513, + "num_tokens": 192418579.0, + "step": 159920 + }, + { + "entropy": 1.7505709946155548, + "epoch": 0.4957690069197797, + "grad_norm": 7.246809959411621, + "learning_rate": 3.5929762845196146e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8670257017016411, + "num_tokens": 192432317.0, + "step": 159930 + }, + { + "entropy": 1.8624306321144104, + "epoch": 0.4958000060448294, + "grad_norm": 4.187715530395508, + "learning_rate": 3.5928639594316765e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8575484842061997, + "num_tokens": 192444479.0, + "step": 159940 + }, + { + "entropy": 1.8551115036010741, + "epoch": 0.4958310051698791, + "grad_norm": 3.9937570095062256, + "learning_rate": 3.592751644877738e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8715003669261933, + "num_tokens": 192456677.0, + "step": 159950 + }, + { + "entropy": 1.9346601396799088, + "epoch": 0.49586200429492877, + "grad_norm": 8.055962562561035, + "learning_rate": 3.5926393408561522e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8497431352734566, + "num_tokens": 192468137.0, + "step": 159960 + }, + { + "entropy": 1.8983276531100273, + "epoch": 0.4958930034199785, + "grad_norm": 8.280396461486816, + "learning_rate": 3.5925270473652735e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8556265756487846, + "num_tokens": 192479694.0, + "step": 159970 + }, + { + "entropy": 1.882783156633377, + "epoch": 0.49592400254502816, + "grad_norm": 9.567574501037598, + "learning_rate": 3.5924147644034557e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8511046752333641, + "num_tokens": 192491376.0, + "step": 159980 + }, + { + "entropy": 1.7969225481152535, + "epoch": 0.4959550016700779, + "grad_norm": 2.3194515705108643, + "learning_rate": 3.592302491969054e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.866172443330288, + "num_tokens": 192504201.0, + "step": 159990 + }, + { + "entropy": 1.9083019599318505, + "epoch": 0.49598600079512756, + "grad_norm": 7.948028564453125, + "learning_rate": 3.5921902300604235e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8478529721498489, + "num_tokens": 192516145.0, + "step": 160000 + }, + { + "entropy": 1.8109319642186166, + "epoch": 0.4960169999201772, + "grad_norm": 8.787992477416992, + "learning_rate": 3.59207797867592e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8696772411465645, + "num_tokens": 192528786.0, + "step": 160010 + }, + { + "entropy": 1.8585580334067344, + "epoch": 0.49604799904522695, + "grad_norm": 3.6390159130096436, + "learning_rate": 3.591965737813897e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8561258107423783, + "num_tokens": 192541016.0, + "step": 160020 + }, + { + "entropy": 1.827420374751091, + "epoch": 0.4960789981702766, + "grad_norm": 7.844151020050049, + "learning_rate": 3.591853507472713e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8646694928407669, + "num_tokens": 192553566.0, + "step": 160030 + }, + { + "entropy": 1.9029534503817558, + "epoch": 0.49610999729532634, + "grad_norm": 8.42615795135498, + "learning_rate": 3.591741287650724e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8528878018260002, + "num_tokens": 192564840.0, + "step": 160040 + }, + { + "entropy": 1.9213525876402855, + "epoch": 0.496140996420376, + "grad_norm": 8.478250503540039, + "learning_rate": 3.5916290783462864e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8403630703687668, + "num_tokens": 192577087.0, + "step": 160050 + }, + { + "entropy": 1.8662138119339944, + "epoch": 0.49617199554542574, + "grad_norm": 8.441492080688477, + "learning_rate": 3.5915168795577587e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8532093212008476, + "num_tokens": 192589686.0, + "step": 160060 + }, + { + "entropy": 1.8910415381193162, + "epoch": 0.4962029946704754, + "grad_norm": 6.746820449829102, + "learning_rate": 3.5914046912834966e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.856334288418293, + "num_tokens": 192602111.0, + "step": 160070 + }, + { + "entropy": 1.9242399752140045, + "epoch": 0.49623399379552513, + "grad_norm": 8.525853157043457, + "learning_rate": 3.5912925135218584e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8545285657048225, + "num_tokens": 192613246.0, + "step": 160080 + }, + { + "entropy": 1.835014469921589, + "epoch": 0.4962649929205748, + "grad_norm": 3.318446159362793, + "learning_rate": 3.5911803462712035e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8679602593183517, + "num_tokens": 192625964.0, + "step": 160090 + }, + { + "entropy": 1.8478663966059685, + "epoch": 0.4962959920456245, + "grad_norm": 7.670246601104736, + "learning_rate": 3.5910681895298898e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8541574910283088, + "num_tokens": 192637795.0, + "step": 160100 + }, + { + "entropy": 1.901483315229416, + "epoch": 0.4963269911706742, + "grad_norm": 3.5240871906280518, + "learning_rate": 3.5909560432962764e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8594571843743324, + "num_tokens": 192649967.0, + "step": 160110 + }, + { + "entropy": 1.8992531165480613, + "epoch": 0.4963579902957239, + "grad_norm": 8.688932418823242, + "learning_rate": 3.590843907568723e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8527524411678314, + "num_tokens": 192661724.0, + "step": 160120 + }, + { + "entropy": 1.8817158490419388, + "epoch": 0.4963889894207736, + "grad_norm": 6.7711992263793945, + "learning_rate": 3.5907317823455882e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.86692134141922, + "num_tokens": 192673692.0, + "step": 160130 + }, + { + "entropy": 1.8810399681329728, + "epoch": 0.4964199885458233, + "grad_norm": 3.6192641258239746, + "learning_rate": 3.5906196676252334e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8557695388793946, + "num_tokens": 192685719.0, + "step": 160140 + }, + { + "entropy": 1.763352060317993, + "epoch": 0.496450987670873, + "grad_norm": 8.72749137878418, + "learning_rate": 3.5905075634060187e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8585641667246818, + "num_tokens": 192699439.0, + "step": 160150 + }, + { + "entropy": 1.9361596137285233, + "epoch": 0.4964819867959227, + "grad_norm": 6.907578945159912, + "learning_rate": 3.590395469686304e-06, + "loss": 0.5622, + "mean_token_accuracy": 0.8400822654366493, + "num_tokens": 192710481.0, + "step": 160160 + }, + { + "entropy": 1.978668710589409, + "epoch": 0.49651298592097237, + "grad_norm": 8.254986763000488, + "learning_rate": 3.590283386464452e-06, + "loss": 0.5487, + "mean_token_accuracy": 0.8343606054782867, + "num_tokens": 192721151.0, + "step": 160170 + }, + { + "entropy": 1.8602470502257347, + "epoch": 0.4965439850460221, + "grad_norm": 6.900646686553955, + "learning_rate": 3.590171313738823e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8702385917305946, + "num_tokens": 192733018.0, + "step": 160180 + }, + { + "entropy": 1.8422987774014472, + "epoch": 0.49657498417107176, + "grad_norm": 8.578625679016113, + "learning_rate": 3.59005925150778e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8609873950481415, + "num_tokens": 192745197.0, + "step": 160190 + }, + { + "entropy": 1.944932959973812, + "epoch": 0.4966059832961215, + "grad_norm": 9.156232833862305, + "learning_rate": 3.589947199769683e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8525719374418259, + "num_tokens": 192756468.0, + "step": 160200 + }, + { + "entropy": 1.8785189107060432, + "epoch": 0.49663698242117116, + "grad_norm": 7.663604259490967, + "learning_rate": 3.5898351585228973e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8553120478987694, + "num_tokens": 192769114.0, + "step": 160210 + }, + { + "entropy": 1.8660555586218834, + "epoch": 0.4966679815462209, + "grad_norm": 7.651570796966553, + "learning_rate": 3.589723127765784e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8518196612596511, + "num_tokens": 192782159.0, + "step": 160220 + }, + { + "entropy": 1.8391312196850778, + "epoch": 0.49669898067127055, + "grad_norm": 4.205702304840088, + "learning_rate": 3.5896111074967082e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8440369963645935, + "num_tokens": 192795497.0, + "step": 160230 + }, + { + "entropy": 1.9263321340084076, + "epoch": 0.4967299797963203, + "grad_norm": 9.188881874084473, + "learning_rate": 3.589499097714031e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8481300055980683, + "num_tokens": 192807445.0, + "step": 160240 + }, + { + "entropy": 1.9688881516456604, + "epoch": 0.49676097892136994, + "grad_norm": 7.7191481590271, + "learning_rate": 3.589387098416119e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8576457172632217, + "num_tokens": 192818000.0, + "step": 160250 + }, + { + "entropy": 1.8553527995944024, + "epoch": 0.4967919780464196, + "grad_norm": 8.173221588134766, + "learning_rate": 3.5892751096013353e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8690877959132195, + "num_tokens": 192830549.0, + "step": 160260 + }, + { + "entropy": 1.9142266526818275, + "epoch": 0.49682297717146934, + "grad_norm": 3.9052417278289795, + "learning_rate": 3.5891631312680436e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8526837721467018, + "num_tokens": 192843309.0, + "step": 160270 + }, + { + "entropy": 1.9387848749756813, + "epoch": 0.496853976296519, + "grad_norm": 9.477102279663086, + "learning_rate": 3.5890511634146112e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8555059671401978, + "num_tokens": 192854568.0, + "step": 160280 + }, + { + "entropy": 1.8374897107481956, + "epoch": 0.49688497542156873, + "grad_norm": 7.315760135650635, + "learning_rate": 3.5889392060394016e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8532007947564125, + "num_tokens": 192867609.0, + "step": 160290 + }, + { + "entropy": 1.952652522921562, + "epoch": 0.4969159745466184, + "grad_norm": 8.343222618103027, + "learning_rate": 3.588827259140783e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8470484256744385, + "num_tokens": 192878254.0, + "step": 160300 + }, + { + "entropy": 2.024555891752243, + "epoch": 0.4969469736716681, + "grad_norm": 7.191964149475098, + "learning_rate": 3.5887153227171184e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8383811950683594, + "num_tokens": 192889067.0, + "step": 160310 + }, + { + "entropy": 1.8601422876119613, + "epoch": 0.4969779727967178, + "grad_norm": 7.714452266693115, + "learning_rate": 3.5886033967667773e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8559423238039017, + "num_tokens": 192901287.0, + "step": 160320 + }, + { + "entropy": 1.9518428400158883, + "epoch": 0.4970089719217675, + "grad_norm": 4.6776275634765625, + "learning_rate": 3.5884914812881245e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8465476736426354, + "num_tokens": 192912823.0, + "step": 160330 + }, + { + "entropy": 1.8527782797813415, + "epoch": 0.4970399710468172, + "grad_norm": 7.401676177978516, + "learning_rate": 3.5883795762795283e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8594189703464508, + "num_tokens": 192926217.0, + "step": 160340 + }, + { + "entropy": 1.8976733207702636, + "epoch": 0.4970709701718669, + "grad_norm": 6.59383487701416, + "learning_rate": 3.588267681739356e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8672205924987793, + "num_tokens": 192937314.0, + "step": 160350 + }, + { + "entropy": 1.874462467432022, + "epoch": 0.4971019692969166, + "grad_norm": 6.723745822906494, + "learning_rate": 3.588155797665975e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8523491159081459, + "num_tokens": 192949521.0, + "step": 160360 + }, + { + "entropy": 1.9262579411268235, + "epoch": 0.4971329684219663, + "grad_norm": 9.00383472442627, + "learning_rate": 3.588043924057755e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8581419870257377, + "num_tokens": 192960939.0, + "step": 160370 + }, + { + "entropy": 1.86645487844944, + "epoch": 0.497163967547016, + "grad_norm": 10.316017150878906, + "learning_rate": 3.5879320609130632e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8479255259037017, + "num_tokens": 192973919.0, + "step": 160380 + }, + { + "entropy": 1.894903513789177, + "epoch": 0.4971949666720657, + "grad_norm": 8.902325630187988, + "learning_rate": 3.5878202082302687e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8499386295676231, + "num_tokens": 192985411.0, + "step": 160390 + }, + { + "entropy": 1.8823218569159508, + "epoch": 0.49722596579711537, + "grad_norm": 5.8220014572143555, + "learning_rate": 3.5877083660077423e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8693734228610992, + "num_tokens": 192997183.0, + "step": 160400 + }, + { + "entropy": 1.8421266853809357, + "epoch": 0.4972569649221651, + "grad_norm": 3.864579439163208, + "learning_rate": 3.5875965342438524e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8598260506987572, + "num_tokens": 193010293.0, + "step": 160410 + }, + { + "entropy": 1.836855800449848, + "epoch": 0.49728796404721476, + "grad_norm": 3.868010997772217, + "learning_rate": 3.5874847129369696e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8634832099080085, + "num_tokens": 193022967.0, + "step": 160420 + }, + { + "entropy": 1.716268916428089, + "epoch": 0.4973189631722645, + "grad_norm": 8.21786117553711, + "learning_rate": 3.5873729020854643e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.875548966228962, + "num_tokens": 193037075.0, + "step": 160430 + }, + { + "entropy": 1.8625114277005195, + "epoch": 0.49734996229731415, + "grad_norm": 8.999544143676758, + "learning_rate": 3.5872611016877067e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8617020696401596, + "num_tokens": 193049323.0, + "step": 160440 + }, + { + "entropy": 1.85534148812294, + "epoch": 0.4973809614223639, + "grad_norm": 7.702752113342285, + "learning_rate": 3.5871493117420684e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8578760981559753, + "num_tokens": 193061093.0, + "step": 160450 + }, + { + "entropy": 1.8131542086601258, + "epoch": 0.49741196054741355, + "grad_norm": 7.689070224761963, + "learning_rate": 3.587037532246922e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8490845113992691, + "num_tokens": 193074196.0, + "step": 160460 + }, + { + "entropy": 1.8581334315240383, + "epoch": 0.49744295967246327, + "grad_norm": 8.081965446472168, + "learning_rate": 3.586925763200637e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8551263764500618, + "num_tokens": 193086783.0, + "step": 160470 + }, + { + "entropy": 1.8547850877046586, + "epoch": 0.49747395879751294, + "grad_norm": 7.127450466156006, + "learning_rate": 3.5868140046015883e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8548941016197205, + "num_tokens": 193099533.0, + "step": 160480 + }, + { + "entropy": 1.8142898052930831, + "epoch": 0.49750495792256266, + "grad_norm": 4.191645622253418, + "learning_rate": 3.586702256448146e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8591264098882675, + "num_tokens": 193112575.0, + "step": 160490 + }, + { + "entropy": 1.927902916073799, + "epoch": 0.49753595704761233, + "grad_norm": 7.649588584899902, + "learning_rate": 3.5865905187386845e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.848539587855339, + "num_tokens": 193124101.0, + "step": 160500 + }, + { + "entropy": 1.8359772473573686, + "epoch": 0.497566956172662, + "grad_norm": 3.336451530456543, + "learning_rate": 3.5864787914715773e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8483365759253502, + "num_tokens": 193136746.0, + "step": 160510 + }, + { + "entropy": 1.8549357965588569, + "epoch": 0.4975979552977117, + "grad_norm": 3.272468328475952, + "learning_rate": 3.5863670746451963e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8660996928811073, + "num_tokens": 193149274.0, + "step": 160520 + }, + { + "entropy": 1.907238420844078, + "epoch": 0.4976289544227614, + "grad_norm": 3.739243745803833, + "learning_rate": 3.5862553682579175e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8532083362340928, + "num_tokens": 193160807.0, + "step": 160530 + }, + { + "entropy": 1.8771161273121835, + "epoch": 0.4976599535478111, + "grad_norm": 7.761352062225342, + "learning_rate": 3.586143672308113e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.851627166569233, + "num_tokens": 193173146.0, + "step": 160540 + }, + { + "entropy": 1.8144533932209015, + "epoch": 0.4976909526728608, + "grad_norm": 9.153278350830078, + "learning_rate": 3.58603198679416e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8531859219074249, + "num_tokens": 193185308.0, + "step": 160550 + }, + { + "entropy": 1.903257629275322, + "epoch": 0.4977219517979105, + "grad_norm": 9.370990753173828, + "learning_rate": 3.5859203117144324e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8494491398334503, + "num_tokens": 193196560.0, + "step": 160560 + }, + { + "entropy": 1.7986585214734077, + "epoch": 0.4977529509229602, + "grad_norm": 3.870612144470215, + "learning_rate": 3.5858086470673054e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8564859181642532, + "num_tokens": 193209503.0, + "step": 160570 + }, + { + "entropy": 1.9573538348078727, + "epoch": 0.4977839500480099, + "grad_norm": 9.66162109375, + "learning_rate": 3.585696992851155e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8488394528627395, + "num_tokens": 193220876.0, + "step": 160580 + }, + { + "entropy": 1.8788496136665345, + "epoch": 0.4978149491730596, + "grad_norm": 8.454802513122559, + "learning_rate": 3.5855853490643573e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8537091195583344, + "num_tokens": 193232585.0, + "step": 160590 + }, + { + "entropy": 1.7184559665620327, + "epoch": 0.4978459482981093, + "grad_norm": 8.67977523803711, + "learning_rate": 3.5854737157052886e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8678611069917679, + "num_tokens": 193247245.0, + "step": 160600 + }, + { + "entropy": 1.8064767561852932, + "epoch": 0.49787694742315897, + "grad_norm": 2.588245153427124, + "learning_rate": 3.5853620927723265e-06, + "loss": 0.436, + "mean_token_accuracy": 0.860741664469242, + "num_tokens": 193260256.0, + "step": 160610 + }, + { + "entropy": 1.9140912219882011, + "epoch": 0.4979079465482087, + "grad_norm": 8.764533996582031, + "learning_rate": 3.5852504802638467e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8398638129234314, + "num_tokens": 193271290.0, + "step": 160620 + }, + { + "entropy": 1.796024279296398, + "epoch": 0.49793894567325836, + "grad_norm": 4.2609543800354, + "learning_rate": 3.5851388781782276e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8642449587583542, + "num_tokens": 193284344.0, + "step": 160630 + }, + { + "entropy": 1.82810877263546, + "epoch": 0.4979699447983081, + "grad_norm": 9.36631965637207, + "learning_rate": 3.5850272865138475e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8589912727475166, + "num_tokens": 193296415.0, + "step": 160640 + }, + { + "entropy": 1.8314507842063903, + "epoch": 0.49800094392335775, + "grad_norm": 8.399770736694336, + "learning_rate": 3.5849157052690836e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.86473308801651, + "num_tokens": 193308563.0, + "step": 160650 + }, + { + "entropy": 1.8416701436042786, + "epoch": 0.4980319430484075, + "grad_norm": 8.284887313842773, + "learning_rate": 3.584804134442316e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8562467381358146, + "num_tokens": 193321328.0, + "step": 160660 + }, + { + "entropy": 1.8433082446455955, + "epoch": 0.49806294217345715, + "grad_norm": 3.750549554824829, + "learning_rate": 3.5846925740319214e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8653277009725571, + "num_tokens": 193333110.0, + "step": 160670 + }, + { + "entropy": 1.8722620084881783, + "epoch": 0.49809394129850687, + "grad_norm": 9.886913299560547, + "learning_rate": 3.5845810240362815e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.857438001036644, + "num_tokens": 193345144.0, + "step": 160680 + }, + { + "entropy": 1.8812648728489876, + "epoch": 0.49812494042355654, + "grad_norm": 3.8210055828094482, + "learning_rate": 3.584469484453774e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.850870543718338, + "num_tokens": 193357243.0, + "step": 160690 + }, + { + "entropy": 1.751509742438793, + "epoch": 0.49815593954860626, + "grad_norm": 8.921031951904297, + "learning_rate": 3.5843579552827802e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8553698793053627, + "num_tokens": 193370914.0, + "step": 160700 + }, + { + "entropy": 1.8664864271879196, + "epoch": 0.49818693867365593, + "grad_norm": 3.878030300140381, + "learning_rate": 3.5842464365216806e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8606648445129395, + "num_tokens": 193383110.0, + "step": 160710 + }, + { + "entropy": 1.844856907427311, + "epoch": 0.49821793779870566, + "grad_norm": 8.737525939941406, + "learning_rate": 3.584134928168854e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8666700348258018, + "num_tokens": 193395446.0, + "step": 160720 + }, + { + "entropy": 1.8013098880648613, + "epoch": 0.4982489369237553, + "grad_norm": 8.164528846740723, + "learning_rate": 3.5840234302226833e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8570413291454315, + "num_tokens": 193408380.0, + "step": 160730 + }, + { + "entropy": 1.9012565463781357, + "epoch": 0.49827993604880505, + "grad_norm": 8.635117530822754, + "learning_rate": 3.58391194268155e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8446336343884469, + "num_tokens": 193420383.0, + "step": 160740 + }, + { + "entropy": 1.9297475934028625, + "epoch": 0.4983109351738547, + "grad_norm": 8.393662452697754, + "learning_rate": 3.5838004655438347e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8500937014818192, + "num_tokens": 193431556.0, + "step": 160750 + }, + { + "entropy": 1.875514169037342, + "epoch": 0.4983419342989044, + "grad_norm": 10.568635940551758, + "learning_rate": 3.5836889988079206e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8593442022800446, + "num_tokens": 193443133.0, + "step": 160760 + }, + { + "entropy": 1.8471161857247353, + "epoch": 0.4983729334239541, + "grad_norm": 8.708873748779297, + "learning_rate": 3.5835775424721886e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8582314670085907, + "num_tokens": 193455329.0, + "step": 160770 + }, + { + "entropy": 1.9030718505382538, + "epoch": 0.4984039325490038, + "grad_norm": 8.513256072998047, + "learning_rate": 3.583466096535023e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8403557062149047, + "num_tokens": 193466580.0, + "step": 160780 + }, + { + "entropy": 1.7898402735590935, + "epoch": 0.4984349316740535, + "grad_norm": 3.610626697540283, + "learning_rate": 3.583354660994807e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8605961218476296, + "num_tokens": 193479729.0, + "step": 160790 + }, + { + "entropy": 1.8853804931044578, + "epoch": 0.4984659307991032, + "grad_norm": 7.379966735839844, + "learning_rate": 3.5832432358499227e-06, + "loss": 0.5226, + "mean_token_accuracy": 0.8424478873610497, + "num_tokens": 193491462.0, + "step": 160800 + }, + { + "entropy": 1.8701233610510826, + "epoch": 0.4984969299241529, + "grad_norm": 8.146286010742188, + "learning_rate": 3.5831318210987557e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8631499618291855, + "num_tokens": 193503353.0, + "step": 160810 + }, + { + "entropy": 1.8317988231778144, + "epoch": 0.49852792904920257, + "grad_norm": 7.9895758628845215, + "learning_rate": 3.583020416739689e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8609337940812111, + "num_tokens": 193516186.0, + "step": 160820 + }, + { + "entropy": 1.8742655038833618, + "epoch": 0.4985589281742523, + "grad_norm": 8.589852333068848, + "learning_rate": 3.582909022771108e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8622861579060555, + "num_tokens": 193527160.0, + "step": 160830 + }, + { + "entropy": 1.8188278570771217, + "epoch": 0.49858992729930196, + "grad_norm": 3.9392247200012207, + "learning_rate": 3.582797639191397e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8563838452100754, + "num_tokens": 193539448.0, + "step": 160840 + }, + { + "entropy": 1.7949530065059662, + "epoch": 0.4986209264243517, + "grad_norm": 7.496416091918945, + "learning_rate": 3.5826862659989416e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.858404703438282, + "num_tokens": 193552034.0, + "step": 160850 + }, + { + "entropy": 1.900011982023716, + "epoch": 0.49865192554940135, + "grad_norm": 3.616969108581543, + "learning_rate": 3.582574903192127e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8452894240617752, + "num_tokens": 193563777.0, + "step": 160860 + }, + { + "entropy": 1.8028769597411156, + "epoch": 0.4986829246744511, + "grad_norm": 9.061861991882324, + "learning_rate": 3.5824635507693403e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8631314247846603, + "num_tokens": 193577121.0, + "step": 160870 + }, + { + "entropy": 1.8786051549017428, + "epoch": 0.49871392379950075, + "grad_norm": 7.7923054695129395, + "learning_rate": 3.5823522087289664e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8566824227571488, + "num_tokens": 193588577.0, + "step": 160880 + }, + { + "entropy": 1.8396636798977852, + "epoch": 0.49874492292455047, + "grad_norm": 3.286170721054077, + "learning_rate": 3.582240877069393e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8455423668026925, + "num_tokens": 193601807.0, + "step": 160890 + }, + { + "entropy": 1.9669003546237946, + "epoch": 0.49877592204960014, + "grad_norm": 8.088907241821289, + "learning_rate": 3.5821295557890062e-06, + "loss": 0.5321, + "mean_token_accuracy": 0.8448588579893113, + "num_tokens": 193612280.0, + "step": 160900 + }, + { + "entropy": 1.8892876401543617, + "epoch": 0.49880692117464986, + "grad_norm": 7.942569255828857, + "learning_rate": 3.582018244886195e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8533686950802803, + "num_tokens": 193623951.0, + "step": 160910 + }, + { + "entropy": 1.9404545783996583, + "epoch": 0.49883792029969953, + "grad_norm": 6.952447891235352, + "learning_rate": 3.581906944359346e-06, + "loss": 0.5396, + "mean_token_accuracy": 0.8299561634659767, + "num_tokens": 193634649.0, + "step": 160920 + }, + { + "entropy": 1.872673834860325, + "epoch": 0.49886891942474926, + "grad_norm": 8.344843864440918, + "learning_rate": 3.5817956542068466e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.862852719426155, + "num_tokens": 193646222.0, + "step": 160930 + }, + { + "entropy": 1.8161238446831702, + "epoch": 0.4988999185497989, + "grad_norm": 4.470328330993652, + "learning_rate": 3.581684374427087e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8478014081716537, + "num_tokens": 193659437.0, + "step": 160940 + }, + { + "entropy": 1.9250866621732712, + "epoch": 0.49893091767484865, + "grad_norm": 8.126724243164062, + "learning_rate": 3.581573105018454e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8495612889528275, + "num_tokens": 193670140.0, + "step": 160950 + }, + { + "entropy": 1.8661917194724083, + "epoch": 0.4989619167998983, + "grad_norm": 9.641986846923828, + "learning_rate": 3.581461845979339e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8382344901561737, + "num_tokens": 193682472.0, + "step": 160960 + }, + { + "entropy": 1.8382489174604415, + "epoch": 0.49899291592494804, + "grad_norm": 8.112931251525879, + "learning_rate": 3.5813505973081294e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8525958672165871, + "num_tokens": 193695088.0, + "step": 160970 + }, + { + "entropy": 1.8945767790079118, + "epoch": 0.4990239150499977, + "grad_norm": 6.302922248840332, + "learning_rate": 3.5812393590032156e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8630844354629517, + "num_tokens": 193706455.0, + "step": 160980 + }, + { + "entropy": 1.8268967524170876, + "epoch": 0.4990549141750474, + "grad_norm": 7.509896755218506, + "learning_rate": 3.581128131062989e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8638947933912278, + "num_tokens": 193719141.0, + "step": 160990 + }, + { + "entropy": 1.869875578582287, + "epoch": 0.4990859133000971, + "grad_norm": 7.7595415115356445, + "learning_rate": 3.58101691348584e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8568534716963768, + "num_tokens": 193731463.0, + "step": 161000 + }, + { + "entropy": 1.8743370115756988, + "epoch": 0.4991169124251468, + "grad_norm": 8.525010108947754, + "learning_rate": 3.580905706270157e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8406820744276047, + "num_tokens": 193743131.0, + "step": 161010 + }, + { + "entropy": 1.8488017559051513, + "epoch": 0.4991479115501965, + "grad_norm": 3.2859580516815186, + "learning_rate": 3.5807945094143338e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8638896048069, + "num_tokens": 193755108.0, + "step": 161020 + }, + { + "entropy": 1.9557087212800979, + "epoch": 0.49917891067524617, + "grad_norm": 12.029139518737793, + "learning_rate": 3.580683322916761e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8448839083313942, + "num_tokens": 193766392.0, + "step": 161030 + }, + { + "entropy": 1.8913592636585235, + "epoch": 0.4992099098002959, + "grad_norm": 7.942804336547852, + "learning_rate": 3.580572146775831e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8532082542777062, + "num_tokens": 193778253.0, + "step": 161040 + }, + { + "entropy": 1.9256371706724167, + "epoch": 0.49924090892534556, + "grad_norm": 8.954080581665039, + "learning_rate": 3.5804609809899356e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8524809762835502, + "num_tokens": 193789236.0, + "step": 161050 + }, + { + "entropy": 1.8400282293558121, + "epoch": 0.4992719080503953, + "grad_norm": 7.780353546142578, + "learning_rate": 3.5803498255574676e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8595440477132797, + "num_tokens": 193801955.0, + "step": 161060 + }, + { + "entropy": 1.8195059970021248, + "epoch": 0.49930290717544495, + "grad_norm": 8.77364730834961, + "learning_rate": 3.5802386804768204e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8579374149441719, + "num_tokens": 193814514.0, + "step": 161070 + }, + { + "entropy": 1.9665822595357896, + "epoch": 0.4993339063004947, + "grad_norm": 10.273021697998047, + "learning_rate": 3.580127545746387e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.8401234805583954, + "num_tokens": 193825591.0, + "step": 161080 + }, + { + "entropy": 1.903402642905712, + "epoch": 0.49936490542554435, + "grad_norm": 7.531785011291504, + "learning_rate": 3.5800164213645606e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8556138724088669, + "num_tokens": 193837687.0, + "step": 161090 + }, + { + "entropy": 1.8461224928498268, + "epoch": 0.49939590455059407, + "grad_norm": 7.117091655731201, + "learning_rate": 3.5799053073297356e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8609979644417762, + "num_tokens": 193849622.0, + "step": 161100 + }, + { + "entropy": 1.9408129811286927, + "epoch": 0.49942690367564374, + "grad_norm": 8.56417465209961, + "learning_rate": 3.579794203640307e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8458396375179291, + "num_tokens": 193861216.0, + "step": 161110 + }, + { + "entropy": 1.8642560109496116, + "epoch": 0.49945790280069347, + "grad_norm": 8.209739685058594, + "learning_rate": 3.579683110294669e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8453632459044457, + "num_tokens": 193873699.0, + "step": 161120 + }, + { + "entropy": 1.935758689045906, + "epoch": 0.49948890192574313, + "grad_norm": 8.520611763000488, + "learning_rate": 3.5795720272912166e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8472222432494163, + "num_tokens": 193884501.0, + "step": 161130 + }, + { + "entropy": 1.7907845377922058, + "epoch": 0.49951990105079286, + "grad_norm": 3.21339750289917, + "learning_rate": 3.5794609546283445e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8530368015170098, + "num_tokens": 193898149.0, + "step": 161140 + }, + { + "entropy": 1.9477273747324944, + "epoch": 0.4995509001758425, + "grad_norm": 8.93225383758545, + "learning_rate": 3.5793498923044502e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.839634670317173, + "num_tokens": 193909283.0, + "step": 161150 + }, + { + "entropy": 1.8608055517077446, + "epoch": 0.49958189930089225, + "grad_norm": 8.591536521911621, + "learning_rate": 3.579238840317929e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8619609922170639, + "num_tokens": 193921871.0, + "step": 161160 + }, + { + "entropy": 1.903657865524292, + "epoch": 0.4996128984259419, + "grad_norm": 6.806171417236328, + "learning_rate": 3.579127798667177e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8534547924995423, + "num_tokens": 193933281.0, + "step": 161170 + }, + { + "entropy": 1.8228919118642808, + "epoch": 0.49964389755099164, + "grad_norm": 5.248661041259766, + "learning_rate": 3.579016767350592e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8604255363345146, + "num_tokens": 193946044.0, + "step": 161180 + }, + { + "entropy": 1.8331756204366685, + "epoch": 0.4996748966760413, + "grad_norm": 10.0490140914917, + "learning_rate": 3.5789057463665692e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8582054451107979, + "num_tokens": 193958910.0, + "step": 161190 + }, + { + "entropy": 1.839639674127102, + "epoch": 0.49970589580109104, + "grad_norm": 10.93938159942627, + "learning_rate": 3.578794735713509e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.854024501144886, + "num_tokens": 193970762.0, + "step": 161200 + }, + { + "entropy": 1.910327608883381, + "epoch": 0.4997368949261407, + "grad_norm": 9.916841506958008, + "learning_rate": 3.578683735389807e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.84305549710989, + "num_tokens": 193981939.0, + "step": 161210 + }, + { + "entropy": 1.8969020605087281, + "epoch": 0.49976789405119043, + "grad_norm": 8.263409614562988, + "learning_rate": 3.5785727453938623e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.853573352098465, + "num_tokens": 193993370.0, + "step": 161220 + }, + { + "entropy": 1.8600556001067161, + "epoch": 0.4997988931762401, + "grad_norm": 8.200851440429688, + "learning_rate": 3.578461765724073e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8544128373265266, + "num_tokens": 194004957.0, + "step": 161230 + }, + { + "entropy": 1.868491704761982, + "epoch": 0.49982989230128977, + "grad_norm": 8.064600944519043, + "learning_rate": 3.5783507963788377e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8572847083210945, + "num_tokens": 194016347.0, + "step": 161240 + }, + { + "entropy": 1.904083575308323, + "epoch": 0.4998608914263395, + "grad_norm": 8.280893325805664, + "learning_rate": 3.5782398373565575e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.8400688841938972, + "num_tokens": 194028152.0, + "step": 161250 + }, + { + "entropy": 1.8171085551381112, + "epoch": 0.49989189055138916, + "grad_norm": 10.810096740722656, + "learning_rate": 3.5781288886556296e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8669310986995697, + "num_tokens": 194040145.0, + "step": 161260 + }, + { + "entropy": 1.8588432848453522, + "epoch": 0.4999228896764389, + "grad_norm": 8.02641773223877, + "learning_rate": 3.578017950274456e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8487448245286942, + "num_tokens": 194051969.0, + "step": 161270 + }, + { + "entropy": 1.8924987077713014, + "epoch": 0.49995388880148856, + "grad_norm": 7.1130523681640625, + "learning_rate": 3.577907022211436e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8623496115207672, + "num_tokens": 194063394.0, + "step": 161280 + }, + { + "entropy": 1.876361007988453, + "epoch": 0.4999848879265383, + "grad_norm": 9.621905326843262, + "learning_rate": 3.57779610446497e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8544125527143478, + "num_tokens": 194074979.0, + "step": 161290 + }, + { + "entropy": 1.8698507621884346, + "epoch": 0.500015887051588, + "grad_norm": 6.634136199951172, + "learning_rate": 3.5776851970334595e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8547692626714707, + "num_tokens": 194086868.0, + "step": 161300 + }, + { + "entropy": 1.8418845430016517, + "epoch": 0.5000468861766376, + "grad_norm": 8.004556655883789, + "learning_rate": 3.5775742999153062e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8597453564405442, + "num_tokens": 194099111.0, + "step": 161310 + }, + { + "entropy": 1.8818573281168938, + "epoch": 0.5000778853016874, + "grad_norm": 7.51983642578125, + "learning_rate": 3.577463413108911e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8519199967384339, + "num_tokens": 194110983.0, + "step": 161320 + }, + { + "entropy": 1.7738328106701373, + "epoch": 0.5001088844267371, + "grad_norm": 2.6278769969940186, + "learning_rate": 3.5773525366126755e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8709217235445976, + "num_tokens": 194125109.0, + "step": 161330 + }, + { + "entropy": 1.8441026404500007, + "epoch": 0.5001398835517867, + "grad_norm": 4.450294494628906, + "learning_rate": 3.5772416704250034e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8480847701430321, + "num_tokens": 194137288.0, + "step": 161340 + }, + { + "entropy": 1.8499442219734192, + "epoch": 0.5001708826768364, + "grad_norm": 9.552627563476562, + "learning_rate": 3.5771308145442972e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8577769994735718, + "num_tokens": 194149433.0, + "step": 161350 + }, + { + "entropy": 1.9184771940112113, + "epoch": 0.5002018818018862, + "grad_norm": 8.634852409362793, + "learning_rate": 3.5770199689689593e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8490300461649894, + "num_tokens": 194160817.0, + "step": 161360 + }, + { + "entropy": 1.8820372819900513, + "epoch": 0.5002328809269359, + "grad_norm": 3.9401602745056152, + "learning_rate": 3.576909133697393e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8586199179291725, + "num_tokens": 194172116.0, + "step": 161370 + }, + { + "entropy": 1.8695161834359169, + "epoch": 0.5002638800519855, + "grad_norm": 10.332625389099121, + "learning_rate": 3.5767983087280032e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8503239244222641, + "num_tokens": 194183969.0, + "step": 161380 + }, + { + "entropy": 1.8113522306084633, + "epoch": 0.5002948791770352, + "grad_norm": 8.187897682189941, + "learning_rate": 3.576687494059193e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8534181043505669, + "num_tokens": 194197490.0, + "step": 161390 + }, + { + "entropy": 1.8908743023872376, + "epoch": 0.500325878302085, + "grad_norm": 8.39079475402832, + "learning_rate": 3.5765766896893673e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8607093915343285, + "num_tokens": 194208821.0, + "step": 161400 + }, + { + "entropy": 1.9498828038573266, + "epoch": 0.5003568774271346, + "grad_norm": 10.169794082641602, + "learning_rate": 3.5764658956169306e-06, + "loss": 0.5227, + "mean_token_accuracy": 0.8365338191390037, + "num_tokens": 194220290.0, + "step": 161410 + }, + { + "entropy": 1.884354117512703, + "epoch": 0.5003878765521843, + "grad_norm": 9.11744213104248, + "learning_rate": 3.5763551118402885e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8483538582921029, + "num_tokens": 194231909.0, + "step": 161420 + }, + { + "entropy": 1.916728711128235, + "epoch": 0.500418875677234, + "grad_norm": 9.879512786865234, + "learning_rate": 3.576244338357846e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8416429966688156, + "num_tokens": 194242891.0, + "step": 161430 + }, + { + "entropy": 1.902044305205345, + "epoch": 0.5004498748022838, + "grad_norm": 7.635390281677246, + "learning_rate": 3.5761335751680097e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8647704660892487, + "num_tokens": 194254117.0, + "step": 161440 + }, + { + "entropy": 1.85581027418375, + "epoch": 0.5004808739273334, + "grad_norm": 4.087507724761963, + "learning_rate": 3.5760228222691847e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8528918892145156, + "num_tokens": 194267450.0, + "step": 161450 + }, + { + "entropy": 1.8870635867118835, + "epoch": 0.5005118730523831, + "grad_norm": 4.798098087310791, + "learning_rate": 3.5759120796597783e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8504547387361526, + "num_tokens": 194279941.0, + "step": 161460 + }, + { + "entropy": 1.8962156638503074, + "epoch": 0.5005428721774328, + "grad_norm": 8.878067016601562, + "learning_rate": 3.575801347338197e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8517125174403191, + "num_tokens": 194292113.0, + "step": 161470 + }, + { + "entropy": 1.9940287500619889, + "epoch": 0.5005738713024825, + "grad_norm": 9.91559886932373, + "learning_rate": 3.5756906253028483e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8439011916518211, + "num_tokens": 194302629.0, + "step": 161480 + }, + { + "entropy": 1.9340040653944015, + "epoch": 0.5006048704275322, + "grad_norm": 9.120388984680176, + "learning_rate": 3.5755799135521397e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.864893200993538, + "num_tokens": 194313712.0, + "step": 161490 + }, + { + "entropy": 1.8403319254517556, + "epoch": 0.5006358695525819, + "grad_norm": 8.813713073730469, + "learning_rate": 3.5754692120844784e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8584855198860168, + "num_tokens": 194326868.0, + "step": 161500 + }, + { + "entropy": 1.9156290888786316, + "epoch": 0.5006668686776315, + "grad_norm": 8.051271438598633, + "learning_rate": 3.575358520898275e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8504732191562653, + "num_tokens": 194339370.0, + "step": 161510 + }, + { + "entropy": 1.8070098072290421, + "epoch": 0.5006978678026812, + "grad_norm": 8.109489440917969, + "learning_rate": 3.5752478399919354e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8623450741171836, + "num_tokens": 194352815.0, + "step": 161520 + }, + { + "entropy": 1.8298007018864155, + "epoch": 0.500728866927731, + "grad_norm": 2.6040987968444824, + "learning_rate": 3.5751371693638696e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8597873091697693, + "num_tokens": 194365615.0, + "step": 161530 + }, + { + "entropy": 1.8470653221011162, + "epoch": 0.5007598660527807, + "grad_norm": 7.487679958343506, + "learning_rate": 3.575026509012487e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8588323727250099, + "num_tokens": 194377759.0, + "step": 161540 + }, + { + "entropy": 1.7974566683173179, + "epoch": 0.5007908651778303, + "grad_norm": 7.743740081787109, + "learning_rate": 3.5749158589361976e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8658158406615257, + "num_tokens": 194390750.0, + "step": 161550 + }, + { + "entropy": 1.8499557554721833, + "epoch": 0.50082186430288, + "grad_norm": 8.951606750488281, + "learning_rate": 3.5748052191334103e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8367986068129539, + "num_tokens": 194403015.0, + "step": 161560 + }, + { + "entropy": 1.8279045611619948, + "epoch": 0.5008528634279298, + "grad_norm": 3.8482038974761963, + "learning_rate": 3.5746945896025365e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8634175881743431, + "num_tokens": 194415227.0, + "step": 161570 + }, + { + "entropy": 1.8749937638640404, + "epoch": 0.5008838625529795, + "grad_norm": 9.023499488830566, + "learning_rate": 3.574583970341986e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8496809035539628, + "num_tokens": 194427501.0, + "step": 161580 + }, + { + "entropy": 1.8510484382510186, + "epoch": 0.5009148616780291, + "grad_norm": 6.773876190185547, + "learning_rate": 3.5744733613501707e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.855115358531475, + "num_tokens": 194439878.0, + "step": 161590 + }, + { + "entropy": 1.80962725430727, + "epoch": 0.5009458608030788, + "grad_norm": 8.465936660766602, + "learning_rate": 3.574362762625502e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8550980076193809, + "num_tokens": 194453476.0, + "step": 161600 + }, + { + "entropy": 1.8801057904958725, + "epoch": 0.5009768599281286, + "grad_norm": 8.899097442626953, + "learning_rate": 3.57425217416639e-06, + "loss": 0.521, + "mean_token_accuracy": 0.8410529911518096, + "num_tokens": 194465486.0, + "step": 161610 + }, + { + "entropy": 1.8403988644480704, + "epoch": 0.5010078590531782, + "grad_norm": 10.057798385620117, + "learning_rate": 3.5741415959712482e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.85611502379179, + "num_tokens": 194477958.0, + "step": 161620 + }, + { + "entropy": 1.8564189299941063, + "epoch": 0.5010388581782279, + "grad_norm": 7.379710674285889, + "learning_rate": 3.5740310280384887e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8523194447159768, + "num_tokens": 194490764.0, + "step": 161630 + }, + { + "entropy": 1.864148397743702, + "epoch": 0.5010698573032776, + "grad_norm": 7.809131622314453, + "learning_rate": 3.5739204703665244e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8495198383927345, + "num_tokens": 194503509.0, + "step": 161640 + }, + { + "entropy": 1.916366446018219, + "epoch": 0.5011008564283274, + "grad_norm": 7.292889595031738, + "learning_rate": 3.573809922953768e-06, + "loss": 0.5339, + "mean_token_accuracy": 0.8405555829405784, + "num_tokens": 194515043.0, + "step": 161650 + }, + { + "entropy": 1.8589420065283775, + "epoch": 0.501131855553377, + "grad_norm": 3.132341146469116, + "learning_rate": 3.5736993857986335e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8655919685959816, + "num_tokens": 194527318.0, + "step": 161660 + }, + { + "entropy": 1.925423364341259, + "epoch": 0.5011628546784267, + "grad_norm": 4.025610446929932, + "learning_rate": 3.573588858899534e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8521231457591056, + "num_tokens": 194538326.0, + "step": 161670 + }, + { + "entropy": 1.8823940232396126, + "epoch": 0.5011938538034764, + "grad_norm": 7.982590675354004, + "learning_rate": 3.5734783422548842e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.843793198466301, + "num_tokens": 194550394.0, + "step": 161680 + }, + { + "entropy": 1.8528065443038941, + "epoch": 0.5012248529285261, + "grad_norm": 9.233685493469238, + "learning_rate": 3.5733678358630976e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8438840091228486, + "num_tokens": 194562639.0, + "step": 161690 + }, + { + "entropy": 1.8516671895980834, + "epoch": 0.5012558520535758, + "grad_norm": 3.61114239692688, + "learning_rate": 3.57325733972259e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.851916354894638, + "num_tokens": 194574471.0, + "step": 161700 + }, + { + "entropy": 1.7707113809883595, + "epoch": 0.5012868511786255, + "grad_norm": 3.688938856124878, + "learning_rate": 3.5731468538317764e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8591636136174202, + "num_tokens": 194589266.0, + "step": 161710 + }, + { + "entropy": 1.89532478004694, + "epoch": 0.5013178503036751, + "grad_norm": 8.20800495147705, + "learning_rate": 3.573036378189072e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8467110142111778, + "num_tokens": 194601549.0, + "step": 161720 + }, + { + "entropy": 1.857888177037239, + "epoch": 0.5013488494287249, + "grad_norm": 7.576171398162842, + "learning_rate": 3.572925912792893e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8551659643650055, + "num_tokens": 194614405.0, + "step": 161730 + }, + { + "entropy": 1.8105993568897247, + "epoch": 0.5013798485537746, + "grad_norm": 9.140817642211914, + "learning_rate": 3.572815457641654e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8596289157867432, + "num_tokens": 194626813.0, + "step": 161740 + }, + { + "entropy": 1.8599924936890602, + "epoch": 0.5014108476788243, + "grad_norm": 4.219054698944092, + "learning_rate": 3.572705012733774e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.860489945113659, + "num_tokens": 194638501.0, + "step": 161750 + }, + { + "entropy": 1.8883384391665459, + "epoch": 0.5014418468038739, + "grad_norm": 9.223127365112305, + "learning_rate": 3.572594578067668e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8457170099020004, + "num_tokens": 194650304.0, + "step": 161760 + }, + { + "entropy": 1.843181850016117, + "epoch": 0.5014728459289236, + "grad_norm": 8.869895935058594, + "learning_rate": 3.5724841536417538e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8534633457660675, + "num_tokens": 194662685.0, + "step": 161770 + }, + { + "entropy": 1.88509142100811, + "epoch": 0.5015038450539734, + "grad_norm": 9.733488082885742, + "learning_rate": 3.5723737394544493e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8535164296627045, + "num_tokens": 194674862.0, + "step": 161780 + }, + { + "entropy": 1.8763200983405113, + "epoch": 0.501534844179023, + "grad_norm": 3.906675100326538, + "learning_rate": 3.5722633355041724e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8577969685196877, + "num_tokens": 194686775.0, + "step": 161790 + }, + { + "entropy": 1.896049427986145, + "epoch": 0.5015658433040727, + "grad_norm": 7.51211404800415, + "learning_rate": 3.5721529417893404e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8528813973069191, + "num_tokens": 194698375.0, + "step": 161800 + }, + { + "entropy": 1.8656664967536927, + "epoch": 0.5015968424291224, + "grad_norm": 8.359691619873047, + "learning_rate": 3.572042558308373e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8549571260809898, + "num_tokens": 194709410.0, + "step": 161810 + }, + { + "entropy": 1.827798655629158, + "epoch": 0.5016278415541722, + "grad_norm": 8.835021018981934, + "learning_rate": 3.5719321850596877e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8649173125624656, + "num_tokens": 194721917.0, + "step": 161820 + }, + { + "entropy": 1.8799199149012567, + "epoch": 0.5016588406792218, + "grad_norm": 3.991224765777588, + "learning_rate": 3.571821822041705e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8502721592783928, + "num_tokens": 194734562.0, + "step": 161830 + }, + { + "entropy": 1.8569432660937308, + "epoch": 0.5016898398042715, + "grad_norm": 9.473611831665039, + "learning_rate": 3.571711469252845e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8522385835647583, + "num_tokens": 194746701.0, + "step": 161840 + }, + { + "entropy": 1.8433676049113275, + "epoch": 0.5017208389293212, + "grad_norm": 8.809913635253906, + "learning_rate": 3.571601126691525e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8491621285676956, + "num_tokens": 194759126.0, + "step": 161850 + }, + { + "entropy": 1.8659282326698303, + "epoch": 0.501751838054371, + "grad_norm": 8.631592750549316, + "learning_rate": 3.5714907943561683e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8507665291428566, + "num_tokens": 194770782.0, + "step": 161860 + }, + { + "entropy": 1.8826793491840363, + "epoch": 0.5017828371794206, + "grad_norm": 7.433642387390137, + "learning_rate": 3.5713804722451937e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8468863472342492, + "num_tokens": 194782817.0, + "step": 161870 + }, + { + "entropy": 1.7949487701058389, + "epoch": 0.5018138363044703, + "grad_norm": 7.305869102478027, + "learning_rate": 3.571270160357023e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8627200186252594, + "num_tokens": 194795809.0, + "step": 161880 + }, + { + "entropy": 1.778245857357979, + "epoch": 0.50184483542952, + "grad_norm": 7.835882663726807, + "learning_rate": 3.571159858690077e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8689703851938247, + "num_tokens": 194808823.0, + "step": 161890 + }, + { + "entropy": 1.9078995436429977, + "epoch": 0.5018758345545697, + "grad_norm": 8.27534008026123, + "learning_rate": 3.571049567242778e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8613488137722015, + "num_tokens": 194820450.0, + "step": 161900 + }, + { + "entropy": 1.8325102671980857, + "epoch": 0.5019068336796194, + "grad_norm": 8.722766876220703, + "learning_rate": 3.5709392860135466e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8510407224297524, + "num_tokens": 194833049.0, + "step": 161910 + }, + { + "entropy": 1.8785583242774009, + "epoch": 0.5019378328046691, + "grad_norm": 7.252606391906738, + "learning_rate": 3.570829015000807e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8446881324052811, + "num_tokens": 194845217.0, + "step": 161920 + }, + { + "entropy": 1.9201249092817307, + "epoch": 0.5019688319297188, + "grad_norm": 7.984278678894043, + "learning_rate": 3.5707187542029805e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8447786808013916, + "num_tokens": 194856631.0, + "step": 161930 + }, + { + "entropy": 1.9102614864706993, + "epoch": 0.5019998310547685, + "grad_norm": 6.645012855529785, + "learning_rate": 3.5706085036184905e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8588759183883667, + "num_tokens": 194868487.0, + "step": 161940 + }, + { + "entropy": 1.8596350729465485, + "epoch": 0.5020308301798182, + "grad_norm": 7.373410701751709, + "learning_rate": 3.5704982632457604e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8577965304255486, + "num_tokens": 194880047.0, + "step": 161950 + }, + { + "entropy": 1.8387646555900574, + "epoch": 0.5020618293048679, + "grad_norm": 7.618228435516357, + "learning_rate": 3.5703880330832136e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.86079321205616, + "num_tokens": 194891550.0, + "step": 161960 + }, + { + "entropy": 1.9075782671570778, + "epoch": 0.5020928284299175, + "grad_norm": 9.226759910583496, + "learning_rate": 3.5702778131292744e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8493689194321632, + "num_tokens": 194902889.0, + "step": 161970 + }, + { + "entropy": 1.8558399647474288, + "epoch": 0.5021238275549672, + "grad_norm": 9.140117645263672, + "learning_rate": 3.5701676033823672e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8625021308660508, + "num_tokens": 194915434.0, + "step": 161980 + }, + { + "entropy": 1.8274715527892114, + "epoch": 0.502154826680017, + "grad_norm": 6.623842239379883, + "learning_rate": 3.570057403840917e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8567396998405457, + "num_tokens": 194928384.0, + "step": 161990 + }, + { + "entropy": 1.8467453569173813, + "epoch": 0.5021858258050667, + "grad_norm": 10.840411186218262, + "learning_rate": 3.5699472145033483e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8468669727444649, + "num_tokens": 194940502.0, + "step": 162000 + }, + { + "entropy": 1.826990969479084, + "epoch": 0.5022168249301163, + "grad_norm": 3.876290798187256, + "learning_rate": 3.5698370353680865e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8612556800246238, + "num_tokens": 194953655.0, + "step": 162010 + }, + { + "entropy": 1.865774655342102, + "epoch": 0.502247824055166, + "grad_norm": 8.75535774230957, + "learning_rate": 3.5697268664335578e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8436019718647003, + "num_tokens": 194966026.0, + "step": 162020 + }, + { + "entropy": 1.8176633030176164, + "epoch": 0.5022788231802158, + "grad_norm": 7.738468170166016, + "learning_rate": 3.569616707698188e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8597527951002121, + "num_tokens": 194978261.0, + "step": 162030 + }, + { + "entropy": 1.8128898188471794, + "epoch": 0.5023098223052654, + "grad_norm": 7.874271392822266, + "learning_rate": 3.5695065591604027e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8667504027485847, + "num_tokens": 194990803.0, + "step": 162040 + }, + { + "entropy": 1.809106007218361, + "epoch": 0.5023408214303151, + "grad_norm": 7.867502689361572, + "learning_rate": 3.5693964208186305e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8542467027902603, + "num_tokens": 195003204.0, + "step": 162050 + }, + { + "entropy": 1.8450546979904174, + "epoch": 0.5023718205553648, + "grad_norm": 8.1978178024292, + "learning_rate": 3.5692862926712974e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8562353745102882, + "num_tokens": 195014841.0, + "step": 162060 + }, + { + "entropy": 1.8364934653043747, + "epoch": 0.5024028196804146, + "grad_norm": 8.754971504211426, + "learning_rate": 3.56917617471683e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8514863669872283, + "num_tokens": 195027579.0, + "step": 162070 + }, + { + "entropy": 1.8638313546776772, + "epoch": 0.5024338188054642, + "grad_norm": 8.994431495666504, + "learning_rate": 3.569066066953658e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8624847516417503, + "num_tokens": 195038596.0, + "step": 162080 + }, + { + "entropy": 1.8255416080355644, + "epoch": 0.5024648179305139, + "grad_norm": 9.329654693603516, + "learning_rate": 3.568955969380207e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8518840372562408, + "num_tokens": 195051056.0, + "step": 162090 + }, + { + "entropy": 1.8981266662478447, + "epoch": 0.5024958170555636, + "grad_norm": 6.833081245422363, + "learning_rate": 3.5688458819949083e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8566917911171913, + "num_tokens": 195062601.0, + "step": 162100 + }, + { + "entropy": 1.9108289882540703, + "epoch": 0.5025268161806133, + "grad_norm": 9.281651496887207, + "learning_rate": 3.568735804796189e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8436849609017372, + "num_tokens": 195073772.0, + "step": 162110 + }, + { + "entropy": 1.877676671743393, + "epoch": 0.502557815305663, + "grad_norm": 7.044517517089844, + "learning_rate": 3.5686257377824777e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8399266093969345, + "num_tokens": 195086232.0, + "step": 162120 + }, + { + "entropy": 1.9210964649915696, + "epoch": 0.5025888144307127, + "grad_norm": 7.519648551940918, + "learning_rate": 3.568515680952206e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.8400311037898064, + "num_tokens": 195097344.0, + "step": 162130 + }, + { + "entropy": 1.7322592556476593, + "epoch": 0.5026198135557624, + "grad_norm": 7.744187355041504, + "learning_rate": 3.5684056343038014e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8637993618845939, + "num_tokens": 195110801.0, + "step": 162140 + }, + { + "entropy": 1.8559142157435418, + "epoch": 0.5026508126808121, + "grad_norm": 8.278355598449707, + "learning_rate": 3.568295597835696e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8566200494766235, + "num_tokens": 195122316.0, + "step": 162150 + }, + { + "entropy": 1.8842547863721848, + "epoch": 0.5026818118058618, + "grad_norm": 7.3361029624938965, + "learning_rate": 3.5681855715463183e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.85838573127985, + "num_tokens": 195133389.0, + "step": 162160 + }, + { + "entropy": 1.8281088501214982, + "epoch": 0.5027128109309115, + "grad_norm": 7.411527633666992, + "learning_rate": 3.5680755554341003e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8537177413702011, + "num_tokens": 195145711.0, + "step": 162170 + }, + { + "entropy": 1.8865630626678467, + "epoch": 0.5027438100559611, + "grad_norm": 9.198566436767578, + "learning_rate": 3.567965549497473e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8512567788362503, + "num_tokens": 195156986.0, + "step": 162180 + }, + { + "entropy": 1.77679483294487, + "epoch": 0.5027748091810109, + "grad_norm": 4.166167736053467, + "learning_rate": 3.5678555537348686e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8688113674521446, + "num_tokens": 195170019.0, + "step": 162190 + }, + { + "entropy": 1.8510409817099571, + "epoch": 0.5028058083060606, + "grad_norm": 6.120206356048584, + "learning_rate": 3.5677455681447176e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8500844925642014, + "num_tokens": 195182067.0, + "step": 162200 + }, + { + "entropy": 1.8342761471867561, + "epoch": 0.5028368074311103, + "grad_norm": 7.963062286376953, + "learning_rate": 3.567635592725453e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8414328068494796, + "num_tokens": 195194944.0, + "step": 162210 + }, + { + "entropy": 1.8795111045241355, + "epoch": 0.5028678065561599, + "grad_norm": 8.403858184814453, + "learning_rate": 3.5675256274755066e-06, + "loss": 0.417, + "mean_token_accuracy": 0.862656545639038, + "num_tokens": 195206665.0, + "step": 162220 + }, + { + "entropy": 1.876885211467743, + "epoch": 0.5028988056812096, + "grad_norm": 7.945247173309326, + "learning_rate": 3.5674156723933117e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8566308572888375, + "num_tokens": 195218499.0, + "step": 162230 + }, + { + "entropy": 1.8293002039194106, + "epoch": 0.5029298048062594, + "grad_norm": 7.862081050872803, + "learning_rate": 3.5673057274773025e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8659921497106552, + "num_tokens": 195231040.0, + "step": 162240 + }, + { + "entropy": 1.814247028529644, + "epoch": 0.502960803931309, + "grad_norm": 4.131370544433594, + "learning_rate": 3.567195792725911e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8450552076101303, + "num_tokens": 195243169.0, + "step": 162250 + }, + { + "entropy": 1.8270811960101128, + "epoch": 0.5029918030563587, + "grad_norm": 8.027288436889648, + "learning_rate": 3.5670858681375727e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8579193204641342, + "num_tokens": 195256142.0, + "step": 162260 + }, + { + "entropy": 1.8580777063965797, + "epoch": 0.5030228021814084, + "grad_norm": 9.05886173248291, + "learning_rate": 3.566975953710719e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8610779225826264, + "num_tokens": 195267553.0, + "step": 162270 + }, + { + "entropy": 1.8327148109674454, + "epoch": 0.5030538013064582, + "grad_norm": 8.335761070251465, + "learning_rate": 3.566866049443787e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8535551458597184, + "num_tokens": 195280087.0, + "step": 162280 + }, + { + "entropy": 1.8819597899913787, + "epoch": 0.5030848004315078, + "grad_norm": 7.634295463562012, + "learning_rate": 3.566756155335211e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8621120229363441, + "num_tokens": 195291226.0, + "step": 162290 + }, + { + "entropy": 1.8669955790042878, + "epoch": 0.5031157995565575, + "grad_norm": 8.279129028320312, + "learning_rate": 3.5666462713834252e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8547209471464157, + "num_tokens": 195302557.0, + "step": 162300 + }, + { + "entropy": 1.8815603330731392, + "epoch": 0.5031467986816072, + "grad_norm": 3.4400410652160645, + "learning_rate": 3.566536397586867e-06, + "loss": 0.5623, + "mean_token_accuracy": 0.8529194951057434, + "num_tokens": 195314615.0, + "step": 162310 + }, + { + "entropy": 1.8385434448719025, + "epoch": 0.503177797806657, + "grad_norm": 8.215117454528809, + "learning_rate": 3.5664265339439706e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8514703080058098, + "num_tokens": 195326532.0, + "step": 162320 + }, + { + "entropy": 1.9076471701264381, + "epoch": 0.5032087969317066, + "grad_norm": 4.307278156280518, + "learning_rate": 3.566316680453173e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8577963724732399, + "num_tokens": 195338119.0, + "step": 162330 + }, + { + "entropy": 1.7793606474995614, + "epoch": 0.5032397960567563, + "grad_norm": 7.525938034057617, + "learning_rate": 3.566206837112911e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8615632086992264, + "num_tokens": 195351610.0, + "step": 162340 + }, + { + "entropy": 1.8213515982031823, + "epoch": 0.503270795181806, + "grad_norm": 7.753080368041992, + "learning_rate": 3.566097003921621e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8667016878724099, + "num_tokens": 195364183.0, + "step": 162350 + }, + { + "entropy": 1.8507965892553329, + "epoch": 0.5033017943068557, + "grad_norm": 6.9658098220825195, + "learning_rate": 3.5659871808777403e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8534910395741463, + "num_tokens": 195376426.0, + "step": 162360 + }, + { + "entropy": 1.8666039258241653, + "epoch": 0.5033327934319054, + "grad_norm": 8.829039573669434, + "learning_rate": 3.5658773679797065e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8550096690654755, + "num_tokens": 195388504.0, + "step": 162370 + }, + { + "entropy": 1.7253398269414901, + "epoch": 0.5033637925569551, + "grad_norm": 3.7189226150512695, + "learning_rate": 3.5657675652259573e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8698497876524925, + "num_tokens": 195401715.0, + "step": 162380 + }, + { + "entropy": 1.8578629180788995, + "epoch": 0.5033947916820047, + "grad_norm": 7.805914878845215, + "learning_rate": 3.5656577726149312e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8586318030953407, + "num_tokens": 195413783.0, + "step": 162390 + }, + { + "entropy": 1.860382466763258, + "epoch": 0.5034257908070545, + "grad_norm": 7.376383304595947, + "learning_rate": 3.5655479901450673e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.852292089164257, + "num_tokens": 195426109.0, + "step": 162400 + }, + { + "entropy": 1.9275429144501686, + "epoch": 0.5034567899321042, + "grad_norm": 7.535244464874268, + "learning_rate": 3.5654382178148036e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8557485058903694, + "num_tokens": 195437469.0, + "step": 162410 + }, + { + "entropy": 1.858670848608017, + "epoch": 0.5034877890571539, + "grad_norm": 6.955326080322266, + "learning_rate": 3.56532845562258e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8485241889953613, + "num_tokens": 195449150.0, + "step": 162420 + }, + { + "entropy": 1.9349894881248475, + "epoch": 0.5035187881822035, + "grad_norm": 3.8754801750183105, + "learning_rate": 3.5652187035668352e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8489937767386436, + "num_tokens": 195461001.0, + "step": 162430 + }, + { + "entropy": 1.8421294122934342, + "epoch": 0.5035497873072533, + "grad_norm": 3.5598652362823486, + "learning_rate": 3.565108961646011e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8684445038437844, + "num_tokens": 195473100.0, + "step": 162440 + }, + { + "entropy": 1.9252609878778457, + "epoch": 0.503580786432303, + "grad_norm": 7.782764434814453, + "learning_rate": 3.5649992298585456e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8494197487831116, + "num_tokens": 195483490.0, + "step": 162450 + }, + { + "entropy": 1.9152775600552558, + "epoch": 0.5036117855573526, + "grad_norm": 8.533858299255371, + "learning_rate": 3.5648895082028813e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8448960855603218, + "num_tokens": 195494785.0, + "step": 162460 + }, + { + "entropy": 1.8215551912784576, + "epoch": 0.5036427846824023, + "grad_norm": 3.6843440532684326, + "learning_rate": 3.564779796677457e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.86255484521389, + "num_tokens": 195507467.0, + "step": 162470 + }, + { + "entropy": 1.8157613933086396, + "epoch": 0.503673783807452, + "grad_norm": 3.5262677669525146, + "learning_rate": 3.5646700952807156e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8654473096132278, + "num_tokens": 195520127.0, + "step": 162480 + }, + { + "entropy": 1.7795906513929367, + "epoch": 0.5037047829325018, + "grad_norm": 2.948425531387329, + "learning_rate": 3.564560404011099e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8592526748776436, + "num_tokens": 195533959.0, + "step": 162490 + }, + { + "entropy": 1.9089320957660676, + "epoch": 0.5037357820575514, + "grad_norm": 8.778511047363281, + "learning_rate": 3.5644507228670484e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8524185940623283, + "num_tokens": 195544950.0, + "step": 162500 + }, + { + "entropy": 1.9139399603009224, + "epoch": 0.5037667811826011, + "grad_norm": 9.315644264221191, + "learning_rate": 3.5643410518470057e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8555566519498825, + "num_tokens": 195555547.0, + "step": 162510 + }, + { + "entropy": 1.8605040475726127, + "epoch": 0.5037977803076508, + "grad_norm": 8.976792335510254, + "learning_rate": 3.5642313909494137e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8453806653618813, + "num_tokens": 195567770.0, + "step": 162520 + }, + { + "entropy": 1.907510343194008, + "epoch": 0.5038287794327005, + "grad_norm": 7.562481880187988, + "learning_rate": 3.564121740172716e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8639927938580513, + "num_tokens": 195578608.0, + "step": 162530 + }, + { + "entropy": 1.7901795297861098, + "epoch": 0.5038597785577502, + "grad_norm": 8.070297241210938, + "learning_rate": 3.564012099515356e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8621209725737572, + "num_tokens": 195592039.0, + "step": 162540 + }, + { + "entropy": 1.8731615900993348, + "epoch": 0.5038907776827999, + "grad_norm": 8.658186912536621, + "learning_rate": 3.5639024689757764e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8569270372390747, + "num_tokens": 195604573.0, + "step": 162550 + }, + { + "entropy": 1.9131707713007926, + "epoch": 0.5039217768078496, + "grad_norm": 9.220719337463379, + "learning_rate": 3.563792848552421e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8557065084576607, + "num_tokens": 195615623.0, + "step": 162560 + }, + { + "entropy": 1.868717408180237, + "epoch": 0.5039527759328993, + "grad_norm": 7.270646095275879, + "learning_rate": 3.5636832382437353e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8525914192199707, + "num_tokens": 195627525.0, + "step": 162570 + }, + { + "entropy": 1.8212410807609558, + "epoch": 0.503983775057949, + "grad_norm": 4.4293060302734375, + "learning_rate": 3.5635736380481627e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8602419763803482, + "num_tokens": 195639900.0, + "step": 162580 + }, + { + "entropy": 1.859234368801117, + "epoch": 0.5040147741829987, + "grad_norm": 7.507938861846924, + "learning_rate": 3.563464047964149e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8594542518258095, + "num_tokens": 195651608.0, + "step": 162590 + }, + { + "entropy": 1.838496372103691, + "epoch": 0.5040457733080483, + "grad_norm": 3.874418258666992, + "learning_rate": 3.5633544679901406e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8664188906550407, + "num_tokens": 195664106.0, + "step": 162600 + }, + { + "entropy": 1.8103223249316216, + "epoch": 0.5040767724330981, + "grad_norm": 7.857743740081787, + "learning_rate": 3.56324489812458e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8632272258400917, + "num_tokens": 195676849.0, + "step": 162610 + }, + { + "entropy": 1.841738609969616, + "epoch": 0.5041077715581478, + "grad_norm": 4.171340465545654, + "learning_rate": 3.563135338365916e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8611914679408074, + "num_tokens": 195688771.0, + "step": 162620 + }, + { + "entropy": 1.8774579733610153, + "epoch": 0.5041387706831975, + "grad_norm": 3.3485891819000244, + "learning_rate": 3.5630257887125935e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8576310917735099, + "num_tokens": 195700208.0, + "step": 162630 + }, + { + "entropy": 1.768410849571228, + "epoch": 0.5041697698082471, + "grad_norm": 3.4588170051574707, + "learning_rate": 3.562916249163059e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8597879484295845, + "num_tokens": 195713203.0, + "step": 162640 + }, + { + "entropy": 1.8163224518299104, + "epoch": 0.5042007689332969, + "grad_norm": 7.023558616638184, + "learning_rate": 3.5628067197157614e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8744960188865661, + "num_tokens": 195725159.0, + "step": 162650 + }, + { + "entropy": 1.9602682262659072, + "epoch": 0.5042317680583466, + "grad_norm": 8.747138977050781, + "learning_rate": 3.5626972003691456e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.8455164894461632, + "num_tokens": 195735711.0, + "step": 162660 + }, + { + "entropy": 1.9070382133126258, + "epoch": 0.5042627671833962, + "grad_norm": 7.247774124145508, + "learning_rate": 3.5625876911216605e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8507187455892563, + "num_tokens": 195746697.0, + "step": 162670 + }, + { + "entropy": 1.853401993960142, + "epoch": 0.5042937663084459, + "grad_norm": 7.354197978973389, + "learning_rate": 3.562478191971754e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8596841201186181, + "num_tokens": 195758088.0, + "step": 162680 + }, + { + "entropy": 1.8842257231473922, + "epoch": 0.5043247654334957, + "grad_norm": 7.232276916503906, + "learning_rate": 3.5623687029178734e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8579404756426812, + "num_tokens": 195769723.0, + "step": 162690 + }, + { + "entropy": 1.8756929002702236, + "epoch": 0.5043557645585454, + "grad_norm": 6.513383388519287, + "learning_rate": 3.5622592239584688e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8571664929389954, + "num_tokens": 195782232.0, + "step": 162700 + }, + { + "entropy": 1.9318101540207864, + "epoch": 0.504386763683595, + "grad_norm": 8.294295310974121, + "learning_rate": 3.562149755091988e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8378466337919235, + "num_tokens": 195793428.0, + "step": 162710 + }, + { + "entropy": 1.8354035213589668, + "epoch": 0.5044177628086447, + "grad_norm": 7.902247905731201, + "learning_rate": 3.5620402963168814e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8561977759003639, + "num_tokens": 195806102.0, + "step": 162720 + }, + { + "entropy": 1.8348199382424355, + "epoch": 0.5044487619336944, + "grad_norm": 7.229824066162109, + "learning_rate": 3.5619308476315977e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8591752856969833, + "num_tokens": 195818891.0, + "step": 162730 + }, + { + "entropy": 1.9697114109992981, + "epoch": 0.5044797610587441, + "grad_norm": 7.836243152618408, + "learning_rate": 3.5618214090345877e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8503987655043602, + "num_tokens": 195829687.0, + "step": 162740 + }, + { + "entropy": 1.786692251265049, + "epoch": 0.5045107601837938, + "grad_norm": 6.62006950378418, + "learning_rate": 3.561711980524301e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8584597021341324, + "num_tokens": 195843142.0, + "step": 162750 + }, + { + "entropy": 1.8900447756052017, + "epoch": 0.5045417593088435, + "grad_norm": 9.13289737701416, + "learning_rate": 3.5616025620991885e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8533695697784424, + "num_tokens": 195855208.0, + "step": 162760 + }, + { + "entropy": 1.9006880089640616, + "epoch": 0.5045727584338932, + "grad_norm": 7.307352066040039, + "learning_rate": 3.5614931537577008e-06, + "loss": 0.5253, + "mean_token_accuracy": 0.8466876611113549, + "num_tokens": 195867829.0, + "step": 162770 + }, + { + "entropy": 1.9048534497618674, + "epoch": 0.5046037575589429, + "grad_norm": 9.334182739257812, + "learning_rate": 3.5613837554982904e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8492770150303841, + "num_tokens": 195879511.0, + "step": 162780 + }, + { + "entropy": 1.8171656548976898, + "epoch": 0.5046347566839926, + "grad_norm": 8.229894638061523, + "learning_rate": 3.5612743673194076e-06, + "loss": 0.379, + "mean_token_accuracy": 0.865054938197136, + "num_tokens": 195892308.0, + "step": 162790 + }, + { + "entropy": 1.8470677226781844, + "epoch": 0.5046657558090423, + "grad_norm": 7.824212551116943, + "learning_rate": 3.561164989219505e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8674655973911285, + "num_tokens": 195904395.0, + "step": 162800 + }, + { + "entropy": 1.9542918413877488, + "epoch": 0.5046967549340919, + "grad_norm": 8.983610153198242, + "learning_rate": 3.561055621197035e-06, + "loss": 0.5495, + "mean_token_accuracy": 0.8392492473125458, + "num_tokens": 195915064.0, + "step": 162810 + }, + { + "entropy": 1.8837048798799514, + "epoch": 0.5047277540591417, + "grad_norm": 9.500035285949707, + "learning_rate": 3.5609462632504497e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.857371661067009, + "num_tokens": 195926940.0, + "step": 162820 + }, + { + "entropy": 1.904589705169201, + "epoch": 0.5047587531841914, + "grad_norm": 7.82620906829834, + "learning_rate": 3.5608369153782024e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8510312214493752, + "num_tokens": 195938652.0, + "step": 162830 + }, + { + "entropy": 1.8928764477372169, + "epoch": 0.5047897523092411, + "grad_norm": 9.256412506103516, + "learning_rate": 3.5607275775787463e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8574974581599235, + "num_tokens": 195949705.0, + "step": 162840 + }, + { + "entropy": 1.8851047992706298, + "epoch": 0.5048207514342907, + "grad_norm": 7.625764846801758, + "learning_rate": 3.5606182498505353e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8522122874855995, + "num_tokens": 195962380.0, + "step": 162850 + }, + { + "entropy": 1.8839653983712197, + "epoch": 0.5048517505593405, + "grad_norm": 8.40437126159668, + "learning_rate": 3.560508932192024e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8503521829843521, + "num_tokens": 195974171.0, + "step": 162860 + }, + { + "entropy": 1.924563753604889, + "epoch": 0.5048827496843902, + "grad_norm": 4.18229866027832, + "learning_rate": 3.5603996246016654e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8518718630075455, + "num_tokens": 195985037.0, + "step": 162870 + }, + { + "entropy": 1.9052809610962869, + "epoch": 0.5049137488094398, + "grad_norm": 7.999540328979492, + "learning_rate": 3.5602903270779145e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8530282035470009, + "num_tokens": 195996485.0, + "step": 162880 + }, + { + "entropy": 1.844748669862747, + "epoch": 0.5049447479344895, + "grad_norm": 8.114140510559082, + "learning_rate": 3.560181039619226e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.866498276591301, + "num_tokens": 196008896.0, + "step": 162890 + }, + { + "entropy": 1.9648150354623795, + "epoch": 0.5049757470595393, + "grad_norm": 8.700517654418945, + "learning_rate": 3.560071762224056e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8492494702339173, + "num_tokens": 196019760.0, + "step": 162900 + }, + { + "entropy": 1.860958421230316, + "epoch": 0.505006746184589, + "grad_norm": 8.077210426330566, + "learning_rate": 3.55996249489086e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8595580518245697, + "num_tokens": 196031677.0, + "step": 162910 + }, + { + "entropy": 1.914548434317112, + "epoch": 0.5050377453096386, + "grad_norm": 8.275936126708984, + "learning_rate": 3.559853237618094e-06, + "loss": 0.443, + "mean_token_accuracy": 0.85650105625391, + "num_tokens": 196043701.0, + "step": 162920 + }, + { + "entropy": 1.9637446463108064, + "epoch": 0.5050687444346883, + "grad_norm": 10.936336517333984, + "learning_rate": 3.5597439904042135e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8512757152318955, + "num_tokens": 196054231.0, + "step": 162930 + }, + { + "entropy": 1.805374266207218, + "epoch": 0.5050997435597381, + "grad_norm": 8.0625, + "learning_rate": 3.559634753247676e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8544696539640426, + "num_tokens": 196067812.0, + "step": 162940 + }, + { + "entropy": 1.8754793629050255, + "epoch": 0.5051307426847877, + "grad_norm": 8.15665054321289, + "learning_rate": 3.5595255261469374e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8432677164673805, + "num_tokens": 196080283.0, + "step": 162950 + }, + { + "entropy": 1.925532579421997, + "epoch": 0.5051617418098374, + "grad_norm": 9.213263511657715, + "learning_rate": 3.5594163091004564e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8600358635187149, + "num_tokens": 196091436.0, + "step": 162960 + }, + { + "entropy": 1.901042965054512, + "epoch": 0.5051927409348871, + "grad_norm": 8.694961547851562, + "learning_rate": 3.5593071021066894e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8498509466648102, + "num_tokens": 196103145.0, + "step": 162970 + }, + { + "entropy": 1.8106228694319726, + "epoch": 0.5052237400599368, + "grad_norm": 9.617980003356934, + "learning_rate": 3.559197905164095e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8574653580784798, + "num_tokens": 196116424.0, + "step": 162980 + }, + { + "entropy": 1.908635352551937, + "epoch": 0.5052547391849865, + "grad_norm": 9.529869079589844, + "learning_rate": 3.559088718271132e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8451606497168541, + "num_tokens": 196127678.0, + "step": 162990 + }, + { + "entropy": 1.8978085353970529, + "epoch": 0.5052857383100362, + "grad_norm": 8.438754081726074, + "learning_rate": 3.5589795414262574e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8580261752009392, + "num_tokens": 196139073.0, + "step": 163000 + }, + { + "entropy": 1.8685655757784843, + "epoch": 0.5053167374350859, + "grad_norm": 10.1161527633667, + "learning_rate": 3.558870374627932e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8439155369997025, + "num_tokens": 196151307.0, + "step": 163010 + }, + { + "entropy": 1.8491392314434052, + "epoch": 0.5053477365601355, + "grad_norm": 10.6281156539917, + "learning_rate": 3.5587612178746127e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8669519662857056, + "num_tokens": 196163886.0, + "step": 163020 + }, + { + "entropy": 1.9063034534454346, + "epoch": 0.5053787356851853, + "grad_norm": 3.8416171073913574, + "learning_rate": 3.5586520711647617e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.8448263436555863, + "num_tokens": 196175457.0, + "step": 163030 + }, + { + "entropy": 1.806091445684433, + "epoch": 0.505409734810235, + "grad_norm": 4.29022216796875, + "learning_rate": 3.558542934496838e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8600598022341728, + "num_tokens": 196188363.0, + "step": 163040 + }, + { + "entropy": 1.8129280880093575, + "epoch": 0.5054407339352847, + "grad_norm": 3.926964521408081, + "learning_rate": 3.558433807869301e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8455210164189338, + "num_tokens": 196201293.0, + "step": 163050 + }, + { + "entropy": 1.9800580739974976, + "epoch": 0.5054717330603343, + "grad_norm": 8.467527389526367, + "learning_rate": 3.5583246912806125e-06, + "loss": 0.6094, + "mean_token_accuracy": 0.8347618013620377, + "num_tokens": 196213158.0, + "step": 163060 + }, + { + "entropy": 1.858849048614502, + "epoch": 0.5055027321853841, + "grad_norm": 6.902448654174805, + "learning_rate": 3.5582155847292326e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8614846393465996, + "num_tokens": 196224932.0, + "step": 163070 + }, + { + "entropy": 1.9071022912859916, + "epoch": 0.5055337313104338, + "grad_norm": 7.139303684234619, + "learning_rate": 3.558106488213623e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8478293195366859, + "num_tokens": 196236392.0, + "step": 163080 + }, + { + "entropy": 1.797558219730854, + "epoch": 0.5055647304354834, + "grad_norm": 3.989180326461792, + "learning_rate": 3.557997401732245e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8643699809908867, + "num_tokens": 196249374.0, + "step": 163090 + }, + { + "entropy": 1.9015874803066253, + "epoch": 0.5055957295605331, + "grad_norm": 10.056818962097168, + "learning_rate": 3.5578883252835605e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.854583865404129, + "num_tokens": 196261723.0, + "step": 163100 + }, + { + "entropy": 1.8313779145479203, + "epoch": 0.5056267286855829, + "grad_norm": 7.709977149963379, + "learning_rate": 3.557779258866032e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8604197606444359, + "num_tokens": 196273674.0, + "step": 163110 + }, + { + "entropy": 1.8923453286290168, + "epoch": 0.5056577278106326, + "grad_norm": 7.376804828643799, + "learning_rate": 3.5576702024781225e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8579896494746209, + "num_tokens": 196285871.0, + "step": 163120 + }, + { + "entropy": 1.8411938205361367, + "epoch": 0.5056887269356822, + "grad_norm": 9.377099990844727, + "learning_rate": 3.557561156118294e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8572324633598327, + "num_tokens": 196299051.0, + "step": 163130 + }, + { + "entropy": 1.9145785033702851, + "epoch": 0.5057197260607319, + "grad_norm": 7.624598026275635, + "learning_rate": 3.5574521197850097e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8504158198833466, + "num_tokens": 196310614.0, + "step": 163140 + }, + { + "entropy": 1.8663910001516342, + "epoch": 0.5057507251857817, + "grad_norm": 9.322566986083984, + "learning_rate": 3.5573430934767344e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.851227393746376, + "num_tokens": 196322489.0, + "step": 163150 + }, + { + "entropy": 1.8053078174591064, + "epoch": 0.5057817243108313, + "grad_norm": 8.671806335449219, + "learning_rate": 3.5572340771919307e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8665953055024147, + "num_tokens": 196335281.0, + "step": 163160 + }, + { + "entropy": 1.9042056173086166, + "epoch": 0.505812723435881, + "grad_norm": 9.928486824035645, + "learning_rate": 3.5571250709290633e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8564940080046654, + "num_tokens": 196346642.0, + "step": 163170 + }, + { + "entropy": 1.9873277485370635, + "epoch": 0.5058437225609307, + "grad_norm": 8.902290344238281, + "learning_rate": 3.5570160746865965e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8341750279068947, + "num_tokens": 196358056.0, + "step": 163180 + }, + { + "entropy": 1.8295244112610818, + "epoch": 0.5058747216859805, + "grad_norm": 8.927177429199219, + "learning_rate": 3.5569070884629963e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8622911289334297, + "num_tokens": 196370104.0, + "step": 163190 + }, + { + "entropy": 1.8176844894886017, + "epoch": 0.5059057208110301, + "grad_norm": 7.046045303344727, + "learning_rate": 3.5567981122567263e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8524423450231552, + "num_tokens": 196383106.0, + "step": 163200 + }, + { + "entropy": 1.90313328653574, + "epoch": 0.5059367199360798, + "grad_norm": 10.572953224182129, + "learning_rate": 3.556689146066253e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8559685334563255, + "num_tokens": 196394839.0, + "step": 163210 + }, + { + "entropy": 1.9140533238649369, + "epoch": 0.5059677190611295, + "grad_norm": 2.8058857917785645, + "learning_rate": 3.5565801898900427e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8639476001262665, + "num_tokens": 196406632.0, + "step": 163220 + }, + { + "entropy": 1.8180472642183303, + "epoch": 0.5059987181861791, + "grad_norm": 2.5980710983276367, + "learning_rate": 3.5564712437265604e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8603103265166283, + "num_tokens": 196419364.0, + "step": 163230 + }, + { + "entropy": 1.9302504003047942, + "epoch": 0.5060297173112289, + "grad_norm": 7.521374225616455, + "learning_rate": 3.5563623075742736e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8582198992371559, + "num_tokens": 196430357.0, + "step": 163240 + }, + { + "entropy": 1.8773491248488425, + "epoch": 0.5060607164362786, + "grad_norm": 7.329319477081299, + "learning_rate": 3.556253381431648e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8633986055850983, + "num_tokens": 196442626.0, + "step": 163250 + }, + { + "entropy": 1.9318356201052667, + "epoch": 0.5060917155613283, + "grad_norm": 3.0441231727600098, + "learning_rate": 3.5561444652971527e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8500076308846474, + "num_tokens": 196453654.0, + "step": 163260 + }, + { + "entropy": 1.8296071410179138, + "epoch": 0.5061227146863779, + "grad_norm": 7.275659084320068, + "learning_rate": 3.556035559169253e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8644940406084061, + "num_tokens": 196465862.0, + "step": 163270 + }, + { + "entropy": 1.8528779864311218, + "epoch": 0.5061537138114277, + "grad_norm": 8.553239822387695, + "learning_rate": 3.5559266630464182e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8582915931940078, + "num_tokens": 196478225.0, + "step": 163280 + }, + { + "entropy": 1.8958040222525596, + "epoch": 0.5061847129364774, + "grad_norm": 8.999159812927246, + "learning_rate": 3.555817776927117e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8500429227948189, + "num_tokens": 196489275.0, + "step": 163290 + }, + { + "entropy": 1.953699079155922, + "epoch": 0.506215712061527, + "grad_norm": 8.064541816711426, + "learning_rate": 3.5557089008098162e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8561220705509186, + "num_tokens": 196500253.0, + "step": 163300 + }, + { + "entropy": 1.870068684220314, + "epoch": 0.5062467111865767, + "grad_norm": 8.940662384033203, + "learning_rate": 3.5556000346929846e-06, + "loss": 0.506, + "mean_token_accuracy": 0.8416852578520775, + "num_tokens": 196512116.0, + "step": 163310 + }, + { + "entropy": 1.8221497237682343, + "epoch": 0.5062777103116265, + "grad_norm": 8.552144050598145, + "learning_rate": 3.555491178575094e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8682190105319023, + "num_tokens": 196525033.0, + "step": 163320 + }, + { + "entropy": 1.831996063888073, + "epoch": 0.5063087094366762, + "grad_norm": 9.747136116027832, + "learning_rate": 3.5553823324546104e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8476357832551003, + "num_tokens": 196537789.0, + "step": 163330 + }, + { + "entropy": 1.7898123905062675, + "epoch": 0.5063397085617258, + "grad_norm": 5.09330940246582, + "learning_rate": 3.5552734963300062e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8650940164923668, + "num_tokens": 196551522.0, + "step": 163340 + }, + { + "entropy": 1.7997850097715855, + "epoch": 0.5063707076867755, + "grad_norm": 4.622293472290039, + "learning_rate": 3.5551646701997505e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8593069702386856, + "num_tokens": 196564746.0, + "step": 163350 + }, + { + "entropy": 1.8479792803525925, + "epoch": 0.5064017068118253, + "grad_norm": 10.61267375946045, + "learning_rate": 3.5550558540623135e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8591436266899108, + "num_tokens": 196577353.0, + "step": 163360 + }, + { + "entropy": 1.830297763645649, + "epoch": 0.506432705936875, + "grad_norm": 7.7146711349487305, + "learning_rate": 3.5549470479161675e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8574198722839356, + "num_tokens": 196588958.0, + "step": 163370 + }, + { + "entropy": 1.8857371792197228, + "epoch": 0.5064637050619246, + "grad_norm": 4.479658126831055, + "learning_rate": 3.554838251759782e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.840976457297802, + "num_tokens": 196600307.0, + "step": 163380 + }, + { + "entropy": 1.861470101773739, + "epoch": 0.5064947041869743, + "grad_norm": 6.463710784912109, + "learning_rate": 3.554729465591629e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8638939753174781, + "num_tokens": 196611868.0, + "step": 163390 + }, + { + "entropy": 1.7832600355148316, + "epoch": 0.5065257033120241, + "grad_norm": 7.852832317352295, + "learning_rate": 3.5546206894101797e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8515184715390205, + "num_tokens": 196625328.0, + "step": 163400 + }, + { + "entropy": 1.8880818665027619, + "epoch": 0.5065567024370737, + "grad_norm": 3.90632963180542, + "learning_rate": 3.554511923213907e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8527933284640312, + "num_tokens": 196637075.0, + "step": 163410 + }, + { + "entropy": 1.8785538420081138, + "epoch": 0.5065877015621234, + "grad_norm": 7.10991096496582, + "learning_rate": 3.5544031670012836e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8546990528702736, + "num_tokens": 196648934.0, + "step": 163420 + }, + { + "entropy": 1.8279323622584343, + "epoch": 0.5066187006871731, + "grad_norm": 8.198480606079102, + "learning_rate": 3.5542944207707806e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8548176631331443, + "num_tokens": 196661904.0, + "step": 163430 + }, + { + "entropy": 1.8618044629693031, + "epoch": 0.5066496998122229, + "grad_norm": 9.521814346313477, + "learning_rate": 3.554185684520874e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8495841041207314, + "num_tokens": 196673894.0, + "step": 163440 + }, + { + "entropy": 1.8538531705737114, + "epoch": 0.5066806989372725, + "grad_norm": 8.083247184753418, + "learning_rate": 3.5540769582500344e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8586274668574333, + "num_tokens": 196686759.0, + "step": 163450 + }, + { + "entropy": 1.8029966354370117, + "epoch": 0.5067116980623222, + "grad_norm": 8.461233139038086, + "learning_rate": 3.553968241956737e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.861527732014656, + "num_tokens": 196699380.0, + "step": 163460 + }, + { + "entropy": 1.9485577285289764, + "epoch": 0.5067426971873719, + "grad_norm": 8.530065536499023, + "learning_rate": 3.5538595356394546e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8589451789855957, + "num_tokens": 196710283.0, + "step": 163470 + }, + { + "entropy": 1.8798445031046866, + "epoch": 0.5067736963124215, + "grad_norm": 8.257960319519043, + "learning_rate": 3.553750839296663e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8557620927691459, + "num_tokens": 196722297.0, + "step": 163480 + }, + { + "entropy": 1.8360868021845818, + "epoch": 0.5068046954374713, + "grad_norm": 4.080684661865234, + "learning_rate": 3.5536421529268368e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8628736943006515, + "num_tokens": 196735291.0, + "step": 163490 + }, + { + "entropy": 1.8096343368291854, + "epoch": 0.506835694562521, + "grad_norm": 8.325018882751465, + "learning_rate": 3.55353347652845e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8683801367878914, + "num_tokens": 196748018.0, + "step": 163500 + }, + { + "entropy": 1.870979880541563, + "epoch": 0.5068666936875706, + "grad_norm": 8.216853141784668, + "learning_rate": 3.5534248100999797e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8563655108213425, + "num_tokens": 196759790.0, + "step": 163510 + }, + { + "entropy": 1.8905472248792647, + "epoch": 0.5068976928126203, + "grad_norm": 7.7638726234436035, + "learning_rate": 3.553316153639899e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8415771916508674, + "num_tokens": 196771119.0, + "step": 163520 + }, + { + "entropy": 1.8568203702569008, + "epoch": 0.5069286919376701, + "grad_norm": 7.551204204559326, + "learning_rate": 3.5532075071466866e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8592291429638863, + "num_tokens": 196783134.0, + "step": 163530 + }, + { + "entropy": 1.845226949453354, + "epoch": 0.5069596910627198, + "grad_norm": 11.321545600891113, + "learning_rate": 3.5530988706188167e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8542516723275184, + "num_tokens": 196794843.0, + "step": 163540 + }, + { + "entropy": 1.9101459234952927, + "epoch": 0.5069906901877694, + "grad_norm": 7.633603572845459, + "learning_rate": 3.5529902440547686e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8477040901780128, + "num_tokens": 196806883.0, + "step": 163550 + }, + { + "entropy": 1.9154523998498916, + "epoch": 0.5070216893128191, + "grad_norm": 7.648013591766357, + "learning_rate": 3.5528816274530164e-06, + "loss": 0.477, + "mean_token_accuracy": 0.852397172152996, + "num_tokens": 196818158.0, + "step": 163560 + }, + { + "entropy": 1.8581027761101723, + "epoch": 0.5070526884378689, + "grad_norm": 8.126503944396973, + "learning_rate": 3.5527730208120387e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8557763010263443, + "num_tokens": 196830042.0, + "step": 163570 + }, + { + "entropy": 1.9071855247020721, + "epoch": 0.5070836875629186, + "grad_norm": 8.152803421020508, + "learning_rate": 3.5526644241303143e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8564406126737595, + "num_tokens": 196841618.0, + "step": 163580 + }, + { + "entropy": 1.7716242380440235, + "epoch": 0.5071146866879682, + "grad_norm": 9.797401428222656, + "learning_rate": 3.5525558374063195e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8628744825720787, + "num_tokens": 196855435.0, + "step": 163590 + }, + { + "entropy": 1.7673632681369782, + "epoch": 0.5071456858130179, + "grad_norm": 9.011829376220703, + "learning_rate": 3.5524472606385324e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8731319457292557, + "num_tokens": 196867956.0, + "step": 163600 + }, + { + "entropy": 1.9083643838763238, + "epoch": 0.5071766849380677, + "grad_norm": 4.590526103973389, + "learning_rate": 3.5523386938254335e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8606811299920082, + "num_tokens": 196879300.0, + "step": 163610 + }, + { + "entropy": 1.7268058001995086, + "epoch": 0.5072076840631173, + "grad_norm": 8.471205711364746, + "learning_rate": 3.5522301369655e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8630135610699654, + "num_tokens": 196893124.0, + "step": 163620 + }, + { + "entropy": 1.867137537896633, + "epoch": 0.507238683188167, + "grad_norm": 8.471604347229004, + "learning_rate": 3.552121590057212e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8629182651638985, + "num_tokens": 196905343.0, + "step": 163630 + }, + { + "entropy": 1.745313723385334, + "epoch": 0.5072696823132167, + "grad_norm": 3.680870532989502, + "learning_rate": 3.5520130530990493e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8692538917064667, + "num_tokens": 196919685.0, + "step": 163640 + }, + { + "entropy": 1.80671256929636, + "epoch": 0.5073006814382665, + "grad_norm": 10.35844898223877, + "learning_rate": 3.5519045260894907e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.866978970170021, + "num_tokens": 196932109.0, + "step": 163650 + }, + { + "entropy": 1.8454255670309068, + "epoch": 0.5073316805633161, + "grad_norm": 9.719582557678223, + "learning_rate": 3.551796009027018e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8601211249828339, + "num_tokens": 196944736.0, + "step": 163660 + }, + { + "entropy": 1.8403030604124069, + "epoch": 0.5073626796883658, + "grad_norm": 8.171625137329102, + "learning_rate": 3.551687501910111e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8558960810303688, + "num_tokens": 196957203.0, + "step": 163670 + }, + { + "entropy": 1.9288623362779618, + "epoch": 0.5073936788134155, + "grad_norm": 7.492020606994629, + "learning_rate": 3.5515790047372508e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8433584362268448, + "num_tokens": 196967955.0, + "step": 163680 + }, + { + "entropy": 1.86645817309618, + "epoch": 0.5074246779384652, + "grad_norm": 7.969177722930908, + "learning_rate": 3.5514705175069186e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8469413831830025, + "num_tokens": 196979245.0, + "step": 163690 + }, + { + "entropy": 1.870174802839756, + "epoch": 0.5074556770635149, + "grad_norm": 9.207684516906738, + "learning_rate": 3.551362040217595e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8529495477676392, + "num_tokens": 196991101.0, + "step": 163700 + }, + { + "entropy": 1.906150184571743, + "epoch": 0.5074866761885646, + "grad_norm": 10.05109691619873, + "learning_rate": 3.551253572867763e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8455789417028428, + "num_tokens": 197002059.0, + "step": 163710 + }, + { + "entropy": 1.859985102713108, + "epoch": 0.5075176753136142, + "grad_norm": 7.572958469390869, + "learning_rate": 3.551145115455905e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8629415169358253, + "num_tokens": 197013664.0, + "step": 163720 + }, + { + "entropy": 1.8383402064442635, + "epoch": 0.5075486744386639, + "grad_norm": 4.040586471557617, + "learning_rate": 3.551036667980503e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8514451235532761, + "num_tokens": 197026174.0, + "step": 163730 + }, + { + "entropy": 1.8111710965633392, + "epoch": 0.5075796735637137, + "grad_norm": 9.287176132202148, + "learning_rate": 3.55092823044004e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8552304312586785, + "num_tokens": 197039878.0, + "step": 163740 + }, + { + "entropy": 1.7784530088305472, + "epoch": 0.5076106726887634, + "grad_norm": 7.509320259094238, + "learning_rate": 3.5508198028329993e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.855512909591198, + "num_tokens": 197053606.0, + "step": 163750 + }, + { + "entropy": 1.9045552536845207, + "epoch": 0.507641671813813, + "grad_norm": 8.221936225891113, + "learning_rate": 3.5507113851578635e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8489176541566849, + "num_tokens": 197065651.0, + "step": 163760 + }, + { + "entropy": 1.8847736433148383, + "epoch": 0.5076726709388627, + "grad_norm": 7.487794876098633, + "learning_rate": 3.5506029774131175e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8544848307967186, + "num_tokens": 197077138.0, + "step": 163770 + }, + { + "entropy": 1.8698254212737084, + "epoch": 0.5077036700639125, + "grad_norm": 8.723936080932617, + "learning_rate": 3.550494579597245e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8674852281808854, + "num_tokens": 197088846.0, + "step": 163780 + }, + { + "entropy": 1.9057440683245659, + "epoch": 0.5077346691889622, + "grad_norm": 7.241644382476807, + "learning_rate": 3.550386191708731e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8506020948290824, + "num_tokens": 197100182.0, + "step": 163790 + }, + { + "entropy": 1.816692951321602, + "epoch": 0.5077656683140118, + "grad_norm": 7.693803787231445, + "learning_rate": 3.55027781374606e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8697048261761665, + "num_tokens": 197112422.0, + "step": 163800 + }, + { + "entropy": 1.896825762093067, + "epoch": 0.5077966674390615, + "grad_norm": 10.119492530822754, + "learning_rate": 3.5501694457077167e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8635825246572495, + "num_tokens": 197123947.0, + "step": 163810 + }, + { + "entropy": 1.8742366090416909, + "epoch": 0.5078276665641113, + "grad_norm": 8.425687789916992, + "learning_rate": 3.550061087592187e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8580169692635536, + "num_tokens": 197135606.0, + "step": 163820 + }, + { + "entropy": 1.800526437163353, + "epoch": 0.5078586656891609, + "grad_norm": 8.00316333770752, + "learning_rate": 3.5499527393979565e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8611070349812507, + "num_tokens": 197148361.0, + "step": 163830 + }, + { + "entropy": 1.8301258489489556, + "epoch": 0.5078896648142106, + "grad_norm": 3.099703311920166, + "learning_rate": 3.5498444011235117e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8658434525132179, + "num_tokens": 197160383.0, + "step": 163840 + }, + { + "entropy": 1.801127390563488, + "epoch": 0.5079206639392603, + "grad_norm": 9.042040824890137, + "learning_rate": 3.549736072767338e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8617468386888504, + "num_tokens": 197173219.0, + "step": 163850 + }, + { + "entropy": 1.9284664213657379, + "epoch": 0.50795166306431, + "grad_norm": 9.226612091064453, + "learning_rate": 3.549627754327924e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8404616430401802, + "num_tokens": 197184744.0, + "step": 163860 + }, + { + "entropy": 1.8895764395594596, + "epoch": 0.5079826621893597, + "grad_norm": 8.440512657165527, + "learning_rate": 3.5495194458037544e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8522619500756263, + "num_tokens": 197196174.0, + "step": 163870 + }, + { + "entropy": 1.7769380405545234, + "epoch": 0.5080136613144094, + "grad_norm": 4.372562408447266, + "learning_rate": 3.549411147193318e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8606853768229484, + "num_tokens": 197208968.0, + "step": 163880 + }, + { + "entropy": 1.8417119562625885, + "epoch": 0.5080446604394591, + "grad_norm": 4.590822696685791, + "learning_rate": 3.5493028584951027e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8595148786902428, + "num_tokens": 197221585.0, + "step": 163890 + }, + { + "entropy": 1.9214375928044318, + "epoch": 0.5080756595645088, + "grad_norm": 4.399219512939453, + "learning_rate": 3.5491945797075963e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8582563444972038, + "num_tokens": 197233428.0, + "step": 163900 + }, + { + "entropy": 1.7925603240728378, + "epoch": 0.5081066586895585, + "grad_norm": 3.6897733211517334, + "learning_rate": 3.5490863108292856e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8733308136463165, + "num_tokens": 197246486.0, + "step": 163910 + }, + { + "entropy": 1.8893893167376519, + "epoch": 0.5081376578146082, + "grad_norm": 8.758194923400879, + "learning_rate": 3.5489780518586626e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8464448168873787, + "num_tokens": 197258696.0, + "step": 163920 + }, + { + "entropy": 1.8779218710958958, + "epoch": 0.5081686569396578, + "grad_norm": 8.010139465332031, + "learning_rate": 3.548869802794213e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8593690872192383, + "num_tokens": 197270587.0, + "step": 163930 + }, + { + "entropy": 1.8980011656880378, + "epoch": 0.5081996560647076, + "grad_norm": 4.0742106437683105, + "learning_rate": 3.548761563634428e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8588485881686211, + "num_tokens": 197282756.0, + "step": 163940 + }, + { + "entropy": 1.925993339717388, + "epoch": 0.5082306551897573, + "grad_norm": 6.346606254577637, + "learning_rate": 3.548653334377797e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.853511492908001, + "num_tokens": 197294423.0, + "step": 163950 + }, + { + "entropy": 1.8731113463640212, + "epoch": 0.508261654314807, + "grad_norm": 3.2879886627197266, + "learning_rate": 3.5485451150228088e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8611008077859879, + "num_tokens": 197306496.0, + "step": 163960 + }, + { + "entropy": 1.9740417271852493, + "epoch": 0.5082926534398566, + "grad_norm": 7.165710926055908, + "learning_rate": 3.5484369055679565e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8428760409355164, + "num_tokens": 197317823.0, + "step": 163970 + }, + { + "entropy": 1.9370626002550124, + "epoch": 0.5083236525649063, + "grad_norm": 3.800901174545288, + "learning_rate": 3.5483287060117265e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8547460094094277, + "num_tokens": 197329227.0, + "step": 163980 + }, + { + "entropy": 1.9272918194532394, + "epoch": 0.5083546516899561, + "grad_norm": 8.870448112487793, + "learning_rate": 3.548220516352614e-06, + "loss": 0.5126, + "mean_token_accuracy": 0.8483779489994049, + "num_tokens": 197340517.0, + "step": 163990 + }, + { + "entropy": 1.9233269169926643, + "epoch": 0.5083856508150058, + "grad_norm": 9.553996086120605, + "learning_rate": 3.548112336589107e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8529503479599952, + "num_tokens": 197351963.0, + "step": 164000 + }, + { + "entropy": 1.8978917241096496, + "epoch": 0.5084166499400554, + "grad_norm": 8.318848609924316, + "learning_rate": 3.548004166719699e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8580149009823799, + "num_tokens": 197363948.0, + "step": 164010 + }, + { + "entropy": 1.877458170056343, + "epoch": 0.5084476490651051, + "grad_norm": 11.015156745910645, + "learning_rate": 3.5478960067428814e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8697662636637687, + "num_tokens": 197375852.0, + "step": 164020 + }, + { + "entropy": 1.905393399298191, + "epoch": 0.5084786481901549, + "grad_norm": 6.92934513092041, + "learning_rate": 3.547787856657146e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8674715921282768, + "num_tokens": 197386981.0, + "step": 164030 + }, + { + "entropy": 1.8934918195009232, + "epoch": 0.5085096473152045, + "grad_norm": 9.103461265563965, + "learning_rate": 3.5476797164609863e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8485694825649261, + "num_tokens": 197398553.0, + "step": 164040 + }, + { + "entropy": 1.959120711684227, + "epoch": 0.5085406464402542, + "grad_norm": 7.4005351066589355, + "learning_rate": 3.5475715861528943e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.855261555314064, + "num_tokens": 197409315.0, + "step": 164050 + }, + { + "entropy": 1.8080861911177635, + "epoch": 0.5085716455653039, + "grad_norm": 8.302371978759766, + "learning_rate": 3.547463465731363e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8583597347140313, + "num_tokens": 197423022.0, + "step": 164060 + }, + { + "entropy": 1.8989132165908813, + "epoch": 0.5086026446903537, + "grad_norm": 7.591587066650391, + "learning_rate": 3.547355355194887e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8550845861434937, + "num_tokens": 197434763.0, + "step": 164070 + }, + { + "entropy": 1.7511195302009583, + "epoch": 0.5086336438154033, + "grad_norm": 8.043708801269531, + "learning_rate": 3.547247254541959e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8655338972806931, + "num_tokens": 197448589.0, + "step": 164080 + }, + { + "entropy": 1.9792837232351304, + "epoch": 0.508664642940453, + "grad_norm": 7.99447774887085, + "learning_rate": 3.5471391637710738e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8506904348731041, + "num_tokens": 197459864.0, + "step": 164090 + }, + { + "entropy": 1.9158392682671548, + "epoch": 0.5086956420655027, + "grad_norm": 9.754770278930664, + "learning_rate": 3.547031082880726e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.853808979690075, + "num_tokens": 197471839.0, + "step": 164100 + }, + { + "entropy": 1.8726506367325784, + "epoch": 0.5087266411905524, + "grad_norm": 10.339028358459473, + "learning_rate": 3.5469230118694107e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8654339507222175, + "num_tokens": 197483011.0, + "step": 164110 + }, + { + "entropy": 1.9108057722449303, + "epoch": 0.5087576403156021, + "grad_norm": 7.670596122741699, + "learning_rate": 3.5468149507356213e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8540572628378869, + "num_tokens": 197494412.0, + "step": 164120 + }, + { + "entropy": 1.8840801566839218, + "epoch": 0.5087886394406518, + "grad_norm": 3.8235244750976562, + "learning_rate": 3.5467068994778553e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8526855111122131, + "num_tokens": 197506145.0, + "step": 164130 + }, + { + "entropy": 1.9018552049994468, + "epoch": 0.5088196385657014, + "grad_norm": 10.32859992980957, + "learning_rate": 3.546598858094607e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8581657335162163, + "num_tokens": 197517526.0, + "step": 164140 + }, + { + "entropy": 1.7613848388195037, + "epoch": 0.5088506376907512, + "grad_norm": 6.967850208282471, + "learning_rate": 3.5464908265843733e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8673210009932518, + "num_tokens": 197530473.0, + "step": 164150 + }, + { + "entropy": 1.7470743969082831, + "epoch": 0.5088816368158009, + "grad_norm": 4.126484394073486, + "learning_rate": 3.5463828049456504e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8649399444460869, + "num_tokens": 197544141.0, + "step": 164160 + }, + { + "entropy": 1.7916277647018433, + "epoch": 0.5089126359408506, + "grad_norm": 7.930361270904541, + "learning_rate": 3.5462747931769348e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8660852313041687, + "num_tokens": 197557293.0, + "step": 164170 + }, + { + "entropy": 1.9348184138536453, + "epoch": 0.5089436350659002, + "grad_norm": 7.338921070098877, + "learning_rate": 3.546166791276724e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8696695506572724, + "num_tokens": 197568090.0, + "step": 164180 + }, + { + "entropy": 1.8887701407074928, + "epoch": 0.50897463419095, + "grad_norm": 8.492532730102539, + "learning_rate": 3.5460587992435147e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8550636664032936, + "num_tokens": 197579983.0, + "step": 164190 + }, + { + "entropy": 1.8933542668819427, + "epoch": 0.5090056333159997, + "grad_norm": 8.57034969329834, + "learning_rate": 3.5459508170758054e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.868916317820549, + "num_tokens": 197590837.0, + "step": 164200 + }, + { + "entropy": 1.8781364992260934, + "epoch": 0.5090366324410494, + "grad_norm": 7.852393627166748, + "learning_rate": 3.5458428447720934e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8487447679042817, + "num_tokens": 197603268.0, + "step": 164210 + }, + { + "entropy": 1.7770580515265464, + "epoch": 0.509067631566099, + "grad_norm": 2.3359949588775635, + "learning_rate": 3.545734882330877e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8754623264074326, + "num_tokens": 197615363.0, + "step": 164220 + }, + { + "entropy": 1.930907705426216, + "epoch": 0.5090986306911487, + "grad_norm": 8.559337615966797, + "learning_rate": 3.545626929750655e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8568104296922684, + "num_tokens": 197626364.0, + "step": 164230 + }, + { + "entropy": 1.8539829179644585, + "epoch": 0.5091296298161985, + "grad_norm": 9.082908630371094, + "learning_rate": 3.545518987029927e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8485892757773399, + "num_tokens": 197638250.0, + "step": 164240 + }, + { + "entropy": 1.8581020534038544, + "epoch": 0.5091606289412481, + "grad_norm": 7.516939640045166, + "learning_rate": 3.5454110541671916e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8599382519721985, + "num_tokens": 197650104.0, + "step": 164250 + }, + { + "entropy": 1.9120456263422967, + "epoch": 0.5091916280662978, + "grad_norm": 8.88882827758789, + "learning_rate": 3.5453031311609487e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8505443364381791, + "num_tokens": 197661272.0, + "step": 164260 + }, + { + "entropy": 1.9172147080302238, + "epoch": 0.5092226271913475, + "grad_norm": 9.059195518493652, + "learning_rate": 3.5451952180096983e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8485334411263465, + "num_tokens": 197672129.0, + "step": 164270 + }, + { + "entropy": 1.7511213548481463, + "epoch": 0.5092536263163973, + "grad_norm": 3.4584226608276367, + "learning_rate": 3.54508731471194e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8762229889631271, + "num_tokens": 197685480.0, + "step": 164280 + }, + { + "entropy": 1.8804846301674842, + "epoch": 0.5092846254414469, + "grad_norm": 6.6632866859436035, + "learning_rate": 3.5449794212661747e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8547676905989647, + "num_tokens": 197697031.0, + "step": 164290 + }, + { + "entropy": 1.8086992830038071, + "epoch": 0.5093156245664966, + "grad_norm": 9.774088859558105, + "learning_rate": 3.544871537670904e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8651171505451203, + "num_tokens": 197710087.0, + "step": 164300 + }, + { + "entropy": 1.9255617767572404, + "epoch": 0.5093466236915463, + "grad_norm": 8.894806861877441, + "learning_rate": 3.5447636639246287e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8540884301066398, + "num_tokens": 197722079.0, + "step": 164310 + }, + { + "entropy": 1.8370198220014573, + "epoch": 0.509377622816596, + "grad_norm": 7.935099124908447, + "learning_rate": 3.544655800025849e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8566048562526702, + "num_tokens": 197734150.0, + "step": 164320 + }, + { + "entropy": 1.8919256404042244, + "epoch": 0.5094086219416457, + "grad_norm": 9.719154357910156, + "learning_rate": 3.544547945973069e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8487145140767097, + "num_tokens": 197745210.0, + "step": 164330 + }, + { + "entropy": 1.870015025138855, + "epoch": 0.5094396210666954, + "grad_norm": 3.9895741939544678, + "learning_rate": 3.5444401017647894e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8562591806054115, + "num_tokens": 197757465.0, + "step": 164340 + }, + { + "entropy": 1.9672464281320572, + "epoch": 0.509470620191745, + "grad_norm": 8.094328880310059, + "learning_rate": 3.5443322673995123e-06, + "loss": 0.5217, + "mean_token_accuracy": 0.843452051281929, + "num_tokens": 197768213.0, + "step": 164350 + }, + { + "entropy": 1.9159003645181656, + "epoch": 0.5095016193167948, + "grad_norm": 6.670746326446533, + "learning_rate": 3.544224442875742e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8514717981219292, + "num_tokens": 197779125.0, + "step": 164360 + }, + { + "entropy": 1.9196984738111496, + "epoch": 0.5095326184418445, + "grad_norm": 7.664515495300293, + "learning_rate": 3.5441166281919807e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8598332405090332, + "num_tokens": 197789903.0, + "step": 164370 + }, + { + "entropy": 1.9413407146930695, + "epoch": 0.5095636175668942, + "grad_norm": 8.811555862426758, + "learning_rate": 3.544008823346732e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8528330892324447, + "num_tokens": 197800737.0, + "step": 164380 + }, + { + "entropy": 1.9213477104902268, + "epoch": 0.5095946166919438, + "grad_norm": 9.078819274902344, + "learning_rate": 3.543901028338499e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8492666840553283, + "num_tokens": 197811307.0, + "step": 164390 + }, + { + "entropy": 1.8556159757077695, + "epoch": 0.5096256158169936, + "grad_norm": 2.676553249359131, + "learning_rate": 3.543793243165787e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8604639634490013, + "num_tokens": 197823836.0, + "step": 164400 + }, + { + "entropy": 1.873467753827572, + "epoch": 0.5096566149420433, + "grad_norm": 6.844171047210693, + "learning_rate": 3.5436854678270993e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8624146491289139, + "num_tokens": 197835920.0, + "step": 164410 + }, + { + "entropy": 1.8581449389457703, + "epoch": 0.509687614067093, + "grad_norm": 8.513300895690918, + "learning_rate": 3.5435777023209413e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8637478351593018, + "num_tokens": 197847504.0, + "step": 164420 + }, + { + "entropy": 1.9565213829278947, + "epoch": 0.5097186131921426, + "grad_norm": 6.896825790405273, + "learning_rate": 3.543469946645818e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8548880100250245, + "num_tokens": 197857870.0, + "step": 164430 + }, + { + "entropy": 1.9588289812207222, + "epoch": 0.5097496123171924, + "grad_norm": 3.6317381858825684, + "learning_rate": 3.543362200800234e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8448257014155388, + "num_tokens": 197869294.0, + "step": 164440 + }, + { + "entropy": 1.8617324098944663, + "epoch": 0.5097806114422421, + "grad_norm": 9.595418930053711, + "learning_rate": 3.5432544647826957e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8515863418579102, + "num_tokens": 197881334.0, + "step": 164450 + }, + { + "entropy": 1.922871397435665, + "epoch": 0.5098116105672917, + "grad_norm": 8.938125610351562, + "learning_rate": 3.543146738591709e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.847772279381752, + "num_tokens": 197892219.0, + "step": 164460 + }, + { + "entropy": 1.8417800962924957, + "epoch": 0.5098426096923414, + "grad_norm": 4.177606582641602, + "learning_rate": 3.5430390222257797e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8512136593461037, + "num_tokens": 197905193.0, + "step": 164470 + }, + { + "entropy": 1.979746587574482, + "epoch": 0.5098736088173911, + "grad_norm": 8.593181610107422, + "learning_rate": 3.5429313156834156e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8451717495918274, + "num_tokens": 197917083.0, + "step": 164480 + }, + { + "entropy": 1.9483318507671357, + "epoch": 0.5099046079424409, + "grad_norm": 7.634199619293213, + "learning_rate": 3.5428236189631227e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8457421749830246, + "num_tokens": 197928191.0, + "step": 164490 + }, + { + "entropy": 1.8038464456796646, + "epoch": 0.5099356070674905, + "grad_norm": 8.22901725769043, + "learning_rate": 3.5427159320634082e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8666422128677368, + "num_tokens": 197941195.0, + "step": 164500 + }, + { + "entropy": 1.8235293269157409, + "epoch": 0.5099666061925402, + "grad_norm": 3.2526485919952393, + "learning_rate": 3.542608254982779e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.862106654047966, + "num_tokens": 197953781.0, + "step": 164510 + }, + { + "entropy": 1.838102599978447, + "epoch": 0.5099976053175899, + "grad_norm": 3.73408579826355, + "learning_rate": 3.5425005877197447e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8607692360877991, + "num_tokens": 197965978.0, + "step": 164520 + }, + { + "entropy": 1.8384420067071914, + "epoch": 0.5100286044426396, + "grad_norm": 5.095405578613281, + "learning_rate": 3.5423929302728125e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8745714992284774, + "num_tokens": 197979104.0, + "step": 164530 + }, + { + "entropy": 1.9089295566082, + "epoch": 0.5100596035676893, + "grad_norm": 8.513500213623047, + "learning_rate": 3.5422852826404913e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.852853499352932, + "num_tokens": 197990177.0, + "step": 164540 + }, + { + "entropy": 1.8421483382582664, + "epoch": 0.510090602692739, + "grad_norm": 4.655264854431152, + "learning_rate": 3.542177644821289e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8579674288630486, + "num_tokens": 198002778.0, + "step": 164550 + }, + { + "entropy": 1.7636081084609032, + "epoch": 0.5101216018177887, + "grad_norm": 6.842883110046387, + "learning_rate": 3.5420700168137166e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8728412866592408, + "num_tokens": 198015645.0, + "step": 164560 + }, + { + "entropy": 1.907676276564598, + "epoch": 0.5101526009428384, + "grad_norm": 3.8116097450256348, + "learning_rate": 3.5419623986162816e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.855129437148571, + "num_tokens": 198027888.0, + "step": 164570 + }, + { + "entropy": 1.8424773082137107, + "epoch": 0.5101836000678881, + "grad_norm": 8.173355102539062, + "learning_rate": 3.541854790227495e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8628433018922805, + "num_tokens": 198040245.0, + "step": 164580 + }, + { + "entropy": 1.8889279991388321, + "epoch": 0.5102145991929378, + "grad_norm": 4.406068325042725, + "learning_rate": 3.541747191645866e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8483342587947845, + "num_tokens": 198052398.0, + "step": 164590 + }, + { + "entropy": 1.7704968944191932, + "epoch": 0.5102455983179874, + "grad_norm": 7.529422760009766, + "learning_rate": 3.5416396028699058e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8711699575185776, + "num_tokens": 198065697.0, + "step": 164600 + }, + { + "entropy": 1.8590932220220566, + "epoch": 0.5102765974430372, + "grad_norm": 3.707942247390747, + "learning_rate": 3.5415320238981252e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8645050838589668, + "num_tokens": 198078312.0, + "step": 164610 + }, + { + "entropy": 1.91665877699852, + "epoch": 0.5103075965680869, + "grad_norm": 8.822848320007324, + "learning_rate": 3.541424454729035e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8502047717571258, + "num_tokens": 198089581.0, + "step": 164620 + }, + { + "entropy": 1.9822220832109452, + "epoch": 0.5103385956931366, + "grad_norm": 8.159029006958008, + "learning_rate": 3.5413168953611463e-06, + "loss": 0.5218, + "mean_token_accuracy": 0.8457861587405204, + "num_tokens": 198100607.0, + "step": 164630 + }, + { + "entropy": 1.8249846056103707, + "epoch": 0.5103695948181862, + "grad_norm": 7.530824184417725, + "learning_rate": 3.5412093457929706e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8623237699270249, + "num_tokens": 198113464.0, + "step": 164640 + }, + { + "entropy": 1.8657769158482551, + "epoch": 0.510400593943236, + "grad_norm": 5.076999664306641, + "learning_rate": 3.5411018060230205e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8500089332461357, + "num_tokens": 198125673.0, + "step": 164650 + }, + { + "entropy": 1.8927562654018402, + "epoch": 0.5104315930682857, + "grad_norm": 7.442946910858154, + "learning_rate": 3.540994276049809e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8680774718523026, + "num_tokens": 198138294.0, + "step": 164660 + }, + { + "entropy": 1.9183226093649863, + "epoch": 0.5104625921933353, + "grad_norm": 9.387221336364746, + "learning_rate": 3.540886755871847e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8459124505519867, + "num_tokens": 198150002.0, + "step": 164670 + }, + { + "entropy": 1.7814438581466674, + "epoch": 0.510493591318385, + "grad_norm": 7.187930583953857, + "learning_rate": 3.5407792454876488e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8671529695391655, + "num_tokens": 198162639.0, + "step": 164680 + }, + { + "entropy": 1.9143247455358505, + "epoch": 0.5105245904434348, + "grad_norm": 7.576807022094727, + "learning_rate": 3.5406717448957274e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8488384842872619, + "num_tokens": 198174173.0, + "step": 164690 + }, + { + "entropy": 1.8134226769208908, + "epoch": 0.5105555895684845, + "grad_norm": 2.628392457962036, + "learning_rate": 3.5405642540945955e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.862903805077076, + "num_tokens": 198188038.0, + "step": 164700 + }, + { + "entropy": 1.9163833409547806, + "epoch": 0.5105865886935341, + "grad_norm": 8.192092895507812, + "learning_rate": 3.5404567730827683e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.85711118131876, + "num_tokens": 198199655.0, + "step": 164710 + }, + { + "entropy": 1.916769452393055, + "epoch": 0.5106175878185838, + "grad_norm": 3.710341453552246, + "learning_rate": 3.540349301858759e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8497982263565064, + "num_tokens": 198211331.0, + "step": 164720 + }, + { + "entropy": 1.9329410195350647, + "epoch": 0.5106485869436335, + "grad_norm": 7.230764389038086, + "learning_rate": 3.5402418404210827e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8429116651415824, + "num_tokens": 198223089.0, + "step": 164730 + }, + { + "entropy": 1.9391484722495078, + "epoch": 0.5106795860686832, + "grad_norm": 8.100869178771973, + "learning_rate": 3.540134388768255e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8531536921858788, + "num_tokens": 198234375.0, + "step": 164740 + }, + { + "entropy": 1.9161068618297576, + "epoch": 0.5107105851937329, + "grad_norm": 3.7813363075256348, + "learning_rate": 3.5400269468987893e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8692640915513039, + "num_tokens": 198245859.0, + "step": 164750 + }, + { + "entropy": 1.861397238075733, + "epoch": 0.5107415843187826, + "grad_norm": 4.208311080932617, + "learning_rate": 3.5399195148112014e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8546230092644691, + "num_tokens": 198257420.0, + "step": 164760 + }, + { + "entropy": 1.9197211220860482, + "epoch": 0.5107725834438323, + "grad_norm": 9.448493003845215, + "learning_rate": 3.5398120925040085e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8522847130894661, + "num_tokens": 198268666.0, + "step": 164770 + }, + { + "entropy": 1.8664867907762528, + "epoch": 0.510803582568882, + "grad_norm": 9.399632453918457, + "learning_rate": 3.5397046799757255e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8565617486834526, + "num_tokens": 198280919.0, + "step": 164780 + }, + { + "entropy": 1.8348496824502945, + "epoch": 0.5108345816939317, + "grad_norm": 3.520848512649536, + "learning_rate": 3.539597277224869e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8544135078787803, + "num_tokens": 198293749.0, + "step": 164790 + }, + { + "entropy": 1.8871194452047348, + "epoch": 0.5108655808189814, + "grad_norm": 3.2471134662628174, + "learning_rate": 3.539489884249957e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8605234354734421, + "num_tokens": 198305524.0, + "step": 164800 + }, + { + "entropy": 1.8586633920669555, + "epoch": 0.510896579944031, + "grad_norm": 8.798843383789062, + "learning_rate": 3.5393825010495047e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8607447475194931, + "num_tokens": 198317700.0, + "step": 164810 + }, + { + "entropy": 1.7715517044067384, + "epoch": 0.5109275790690808, + "grad_norm": 8.043583869934082, + "learning_rate": 3.5392751276220303e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8794338956475258, + "num_tokens": 198331081.0, + "step": 164820 + }, + { + "entropy": 1.8529795736074448, + "epoch": 0.5109585781941305, + "grad_norm": 7.2134013175964355, + "learning_rate": 3.5391677639660516e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8627293214201928, + "num_tokens": 198343316.0, + "step": 164830 + }, + { + "entropy": 1.7684592947363853, + "epoch": 0.5109895773191802, + "grad_norm": 3.8594398498535156, + "learning_rate": 3.5390604100800864e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8634788036346436, + "num_tokens": 198356311.0, + "step": 164840 + }, + { + "entropy": 1.8074128575623036, + "epoch": 0.5110205764442298, + "grad_norm": 7.467392444610596, + "learning_rate": 3.538953065962653e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8613016933202744, + "num_tokens": 198370124.0, + "step": 164850 + }, + { + "entropy": 1.9014081314206124, + "epoch": 0.5110515755692796, + "grad_norm": 8.128207206726074, + "learning_rate": 3.53884573161227e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8408779174089431, + "num_tokens": 198382346.0, + "step": 164860 + }, + { + "entropy": 1.8639238759875298, + "epoch": 0.5110825746943293, + "grad_norm": 8.85745906829834, + "learning_rate": 3.538738407027457e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8516306474804878, + "num_tokens": 198394541.0, + "step": 164870 + }, + { + "entropy": 1.8283547207713127, + "epoch": 0.5111135738193789, + "grad_norm": 8.482251167297363, + "learning_rate": 3.5386310922067324e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8613016873598098, + "num_tokens": 198407068.0, + "step": 164880 + }, + { + "entropy": 1.8640737235546112, + "epoch": 0.5111445729444286, + "grad_norm": 9.156514167785645, + "learning_rate": 3.5385237871486154e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8556307137012482, + "num_tokens": 198419070.0, + "step": 164890 + }, + { + "entropy": 1.784039095044136, + "epoch": 0.5111755720694784, + "grad_norm": 3.7771859169006348, + "learning_rate": 3.5384164918516278e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.873118770122528, + "num_tokens": 198431781.0, + "step": 164900 + }, + { + "entropy": 1.917399762570858, + "epoch": 0.5112065711945281, + "grad_norm": 9.159950256347656, + "learning_rate": 3.5383092063142884e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8393927633762359, + "num_tokens": 198443784.0, + "step": 164910 + }, + { + "entropy": 1.8153990522027015, + "epoch": 0.5112375703195777, + "grad_norm": 3.03525447845459, + "learning_rate": 3.538201930535117e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8660739451646805, + "num_tokens": 198456000.0, + "step": 164920 + }, + { + "entropy": 1.878081302344799, + "epoch": 0.5112685694446274, + "grad_norm": 7.645351409912109, + "learning_rate": 3.5380946645126355e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8555167511105537, + "num_tokens": 198467146.0, + "step": 164930 + }, + { + "entropy": 1.8525796994566917, + "epoch": 0.5112995685696771, + "grad_norm": 4.232410907745361, + "learning_rate": 3.537987408245366e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8579295039176941, + "num_tokens": 198479522.0, + "step": 164940 + }, + { + "entropy": 1.9068205669522285, + "epoch": 0.5113305676947268, + "grad_norm": 7.772948265075684, + "learning_rate": 3.537880161731828e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8462733164429664, + "num_tokens": 198491028.0, + "step": 164950 + }, + { + "entropy": 1.8264345929026604, + "epoch": 0.5113615668197765, + "grad_norm": 7.550524711608887, + "learning_rate": 3.5377729249705438e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.867035549879074, + "num_tokens": 198503289.0, + "step": 164960 + }, + { + "entropy": 1.8178565993905067, + "epoch": 0.5113925659448262, + "grad_norm": 7.000597953796387, + "learning_rate": 3.5376656979600365e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8724155217409134, + "num_tokens": 198515438.0, + "step": 164970 + }, + { + "entropy": 1.9243502393364906, + "epoch": 0.5114235650698759, + "grad_norm": 7.471307277679443, + "learning_rate": 3.537558480698827e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.848363322019577, + "num_tokens": 198526507.0, + "step": 164980 + }, + { + "entropy": 1.847606810927391, + "epoch": 0.5114545641949256, + "grad_norm": 8.920063972473145, + "learning_rate": 3.5374512731854394e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8580059990286827, + "num_tokens": 198538844.0, + "step": 164990 + }, + { + "entropy": 1.8540842577815055, + "epoch": 0.5114855633199753, + "grad_norm": 7.94528341293335, + "learning_rate": 3.5373440754183965e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8460657671093941, + "num_tokens": 198550727.0, + "step": 165000 + }, + { + "entropy": 1.8856645733118058, + "epoch": 0.511516562445025, + "grad_norm": 4.576896667480469, + "learning_rate": 3.53723688739622e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8499826610088348, + "num_tokens": 198562724.0, + "step": 165010 + }, + { + "entropy": 1.927583736181259, + "epoch": 0.5115475615700746, + "grad_norm": 9.744261741638184, + "learning_rate": 3.5371297091174357e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8518247798085212, + "num_tokens": 198574075.0, + "step": 165020 + }, + { + "entropy": 1.8654179081320763, + "epoch": 0.5115785606951244, + "grad_norm": 4.000446796417236, + "learning_rate": 3.5370225405805663e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8373050197958947, + "num_tokens": 198586300.0, + "step": 165030 + }, + { + "entropy": 1.8400757029652595, + "epoch": 0.5116095598201741, + "grad_norm": 4.41628360748291, + "learning_rate": 3.5369153817841367e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8663878023624421, + "num_tokens": 198599012.0, + "step": 165040 + }, + { + "entropy": 1.93585607111454, + "epoch": 0.5116405589452238, + "grad_norm": 7.442049980163574, + "learning_rate": 3.5368082327266712e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8513322427868844, + "num_tokens": 198610777.0, + "step": 165050 + }, + { + "entropy": 1.8586627542972565, + "epoch": 0.5116715580702734, + "grad_norm": 4.374122619628906, + "learning_rate": 3.536701093406695e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8645623952150345, + "num_tokens": 198623111.0, + "step": 165060 + }, + { + "entropy": 1.8560616582632066, + "epoch": 0.5117025571953232, + "grad_norm": 4.863686561584473, + "learning_rate": 3.5365939638227324e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.861749242246151, + "num_tokens": 198635806.0, + "step": 165070 + }, + { + "entropy": 1.8346017554402352, + "epoch": 0.5117335563203729, + "grad_norm": 10.877781867980957, + "learning_rate": 3.53648684397331e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.859768570959568, + "num_tokens": 198647986.0, + "step": 165080 + }, + { + "entropy": 1.8296894863247872, + "epoch": 0.5117645554454225, + "grad_norm": 6.875624656677246, + "learning_rate": 3.536379733856953e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8574081555008888, + "num_tokens": 198660724.0, + "step": 165090 + }, + { + "entropy": 1.8089259415864944, + "epoch": 0.5117955545704722, + "grad_norm": 4.244149684906006, + "learning_rate": 3.5362726334721887e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8540760070085526, + "num_tokens": 198674278.0, + "step": 165100 + }, + { + "entropy": 1.8661685228347777, + "epoch": 0.511826553695522, + "grad_norm": 7.697582244873047, + "learning_rate": 3.5361655428175417e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8633060559630394, + "num_tokens": 198686853.0, + "step": 165110 + }, + { + "entropy": 1.8793710887432098, + "epoch": 0.5118575528205717, + "grad_norm": 3.8460631370544434, + "learning_rate": 3.5360584618915406e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8673370242118835, + "num_tokens": 198699128.0, + "step": 165120 + }, + { + "entropy": 1.848849655687809, + "epoch": 0.5118885519456213, + "grad_norm": 9.106633186340332, + "learning_rate": 3.5359513906927113e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8695919916033745, + "num_tokens": 198711717.0, + "step": 165130 + }, + { + "entropy": 1.8800451412796975, + "epoch": 0.511919551070671, + "grad_norm": 7.916536808013916, + "learning_rate": 3.5358443292195817e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8574667185544967, + "num_tokens": 198723408.0, + "step": 165140 + }, + { + "entropy": 1.8587745755910874, + "epoch": 0.5119505501957208, + "grad_norm": 10.332902908325195, + "learning_rate": 3.5357372774706783e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8618461728096009, + "num_tokens": 198735665.0, + "step": 165150 + }, + { + "entropy": 1.8222912192344665, + "epoch": 0.5119815493207704, + "grad_norm": 3.885300397872925, + "learning_rate": 3.535630235444531e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8571152433753013, + "num_tokens": 198748298.0, + "step": 165160 + }, + { + "entropy": 1.9119007468223572, + "epoch": 0.5120125484458201, + "grad_norm": 9.721302032470703, + "learning_rate": 3.5355232031396675e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8569756045937538, + "num_tokens": 198758950.0, + "step": 165170 + }, + { + "entropy": 1.8979253500699997, + "epoch": 0.5120435475708698, + "grad_norm": 7.7287917137146, + "learning_rate": 3.5354161805546155e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8547446504235268, + "num_tokens": 198770387.0, + "step": 165180 + }, + { + "entropy": 1.8123277112841607, + "epoch": 0.5120745466959195, + "grad_norm": 4.713539123535156, + "learning_rate": 3.5353091676879057e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8553282573819161, + "num_tokens": 198783402.0, + "step": 165190 + }, + { + "entropy": 1.8285948097705842, + "epoch": 0.5121055458209692, + "grad_norm": 8.289105415344238, + "learning_rate": 3.535202164538066e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8569412216544151, + "num_tokens": 198796000.0, + "step": 165200 + }, + { + "entropy": 1.905951689183712, + "epoch": 0.5121365449460189, + "grad_norm": 9.26884937286377, + "learning_rate": 3.5350951711036262e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8543861001729965, + "num_tokens": 198807335.0, + "step": 165210 + }, + { + "entropy": 1.8662221506237984, + "epoch": 0.5121675440710686, + "grad_norm": 9.472772598266602, + "learning_rate": 3.5349881873831165e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8481184557080269, + "num_tokens": 198819688.0, + "step": 165220 + }, + { + "entropy": 1.8440553843975067, + "epoch": 0.5121985431961182, + "grad_norm": 8.241409301757812, + "learning_rate": 3.5348812133750676e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8684048295021057, + "num_tokens": 198831562.0, + "step": 165230 + }, + { + "entropy": 1.8263012327253818, + "epoch": 0.512229542321168, + "grad_norm": 4.3429670333862305, + "learning_rate": 3.534774249078009e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8637837857007981, + "num_tokens": 198845431.0, + "step": 165240 + }, + { + "entropy": 1.831053911149502, + "epoch": 0.5122605414462177, + "grad_norm": 3.6324291229248047, + "learning_rate": 3.5346672944904727e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8589798659086227, + "num_tokens": 198858159.0, + "step": 165250 + }, + { + "entropy": 1.8699246987700462, + "epoch": 0.5122915405712674, + "grad_norm": 7.301723480224609, + "learning_rate": 3.534560349610988e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8703466773033142, + "num_tokens": 198870055.0, + "step": 165260 + }, + { + "entropy": 1.85399060100317, + "epoch": 0.512322539696317, + "grad_norm": 7.461488723754883, + "learning_rate": 3.5344534144380875e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.862822137773037, + "num_tokens": 198881372.0, + "step": 165270 + }, + { + "entropy": 1.9233081370592118, + "epoch": 0.5123535388213668, + "grad_norm": 9.688029289245605, + "learning_rate": 3.534346488970303e-06, + "loss": 0.498, + "mean_token_accuracy": 0.838792422413826, + "num_tokens": 198892477.0, + "step": 165280 + }, + { + "entropy": 1.8923705786466598, + "epoch": 0.5123845379464165, + "grad_norm": 8.833309173583984, + "learning_rate": 3.534239573206167e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8576890960335731, + "num_tokens": 198904123.0, + "step": 165290 + }, + { + "entropy": 1.9048192888498305, + "epoch": 0.5124155370714661, + "grad_norm": 7.9789934158325195, + "learning_rate": 3.534132667144212e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8539910644292832, + "num_tokens": 198914964.0, + "step": 165300 + }, + { + "entropy": 1.8376499265432358, + "epoch": 0.5124465361965158, + "grad_norm": 4.213772296905518, + "learning_rate": 3.534025770782969e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8648659616708756, + "num_tokens": 198927131.0, + "step": 165310 + }, + { + "entropy": 1.8888862892985343, + "epoch": 0.5124775353215656, + "grad_norm": 7.217865467071533, + "learning_rate": 3.533918884120972e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8601428523659707, + "num_tokens": 198938821.0, + "step": 165320 + }, + { + "entropy": 1.8831818588078022, + "epoch": 0.5125085344466153, + "grad_norm": 3.8709065914154053, + "learning_rate": 3.533812007156755e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8554113984107972, + "num_tokens": 198951829.0, + "step": 165330 + }, + { + "entropy": 1.7507137194275857, + "epoch": 0.5125395335716649, + "grad_norm": 4.336053848266602, + "learning_rate": 3.5337051398888504e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8600828319787979, + "num_tokens": 198965766.0, + "step": 165340 + }, + { + "entropy": 1.929706057906151, + "epoch": 0.5125705326967146, + "grad_norm": 3.9890756607055664, + "learning_rate": 3.5335982823157936e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8485506847500801, + "num_tokens": 198977381.0, + "step": 165350 + }, + { + "entropy": 1.9037581861019135, + "epoch": 0.5126015318217644, + "grad_norm": 7.280956745147705, + "learning_rate": 3.5334914344361184e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8539852678775788, + "num_tokens": 198988697.0, + "step": 165360 + }, + { + "entropy": 1.826684795320034, + "epoch": 0.512632530946814, + "grad_norm": 2.034179210662842, + "learning_rate": 3.533384596248358e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8617399200797081, + "num_tokens": 199001031.0, + "step": 165370 + }, + { + "entropy": 1.943645280599594, + "epoch": 0.5126635300718637, + "grad_norm": 9.540709495544434, + "learning_rate": 3.533277767751049e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.846565519273281, + "num_tokens": 199012527.0, + "step": 165380 + }, + { + "entropy": 1.8760394856333733, + "epoch": 0.5126945291969134, + "grad_norm": 9.603574752807617, + "learning_rate": 3.533170948942725e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8549725398421287, + "num_tokens": 199024645.0, + "step": 165390 + }, + { + "entropy": 1.9340546056628227, + "epoch": 0.5127255283219632, + "grad_norm": 8.026835441589355, + "learning_rate": 3.5330641398219232e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8500248864293098, + "num_tokens": 199036414.0, + "step": 165400 + }, + { + "entropy": 1.8972145736217498, + "epoch": 0.5127565274470128, + "grad_norm": 8.733002662658691, + "learning_rate": 3.532957340387178e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.856229642033577, + "num_tokens": 199048689.0, + "step": 165410 + }, + { + "entropy": 1.8412564128637314, + "epoch": 0.5127875265720625, + "grad_norm": 8.106005668640137, + "learning_rate": 3.5328505506370264e-06, + "loss": 0.405, + "mean_token_accuracy": 0.868453860282898, + "num_tokens": 199061006.0, + "step": 165420 + }, + { + "entropy": 1.881354607641697, + "epoch": 0.5128185256971122, + "grad_norm": 9.300365447998047, + "learning_rate": 3.5327437705700047e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8567851424217224, + "num_tokens": 199072950.0, + "step": 165430 + }, + { + "entropy": 1.9285381063818932, + "epoch": 0.5128495248221618, + "grad_norm": 9.102884292602539, + "learning_rate": 3.5326370001846483e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8458780542016029, + "num_tokens": 199083759.0, + "step": 165440 + }, + { + "entropy": 1.8580153673887252, + "epoch": 0.5128805239472116, + "grad_norm": 4.1997551918029785, + "learning_rate": 3.532530239479497e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8655030012130738, + "num_tokens": 199095203.0, + "step": 165450 + }, + { + "entropy": 1.7688224032521247, + "epoch": 0.5129115230722613, + "grad_norm": 9.050981521606445, + "learning_rate": 3.5324234884530855e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8712503552436829, + "num_tokens": 199109013.0, + "step": 165460 + }, + { + "entropy": 1.913888046145439, + "epoch": 0.512942522197311, + "grad_norm": 6.518311977386475, + "learning_rate": 3.532316747103952e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8588741257786751, + "num_tokens": 199120388.0, + "step": 165470 + }, + { + "entropy": 1.9045846730470657, + "epoch": 0.5129735213223606, + "grad_norm": 9.379058837890625, + "learning_rate": 3.5322100154306356e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8588658139109612, + "num_tokens": 199132796.0, + "step": 165480 + }, + { + "entropy": 1.8787872701883317, + "epoch": 0.5130045204474104, + "grad_norm": 9.631776809692383, + "learning_rate": 3.532103293431674e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8609980300068856, + "num_tokens": 199145020.0, + "step": 165490 + }, + { + "entropy": 1.8990243777632714, + "epoch": 0.5130355195724601, + "grad_norm": 8.644235610961914, + "learning_rate": 3.5319965811056055e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8497151508927345, + "num_tokens": 199157072.0, + "step": 165500 + }, + { + "entropy": 1.964458554983139, + "epoch": 0.5130665186975097, + "grad_norm": 7.992344379425049, + "learning_rate": 3.531889878450969e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8534716755151749, + "num_tokens": 199168402.0, + "step": 165510 + }, + { + "entropy": 1.9405386328697205, + "epoch": 0.5130975178225594, + "grad_norm": 7.881420612335205, + "learning_rate": 3.5317831854663044e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8552601054310799, + "num_tokens": 199179455.0, + "step": 165520 + }, + { + "entropy": 1.8967503055930137, + "epoch": 0.5131285169476092, + "grad_norm": 6.660313129425049, + "learning_rate": 3.5316765021501502e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8643925070762635, + "num_tokens": 199191263.0, + "step": 165530 + }, + { + "entropy": 1.8865905031561852, + "epoch": 0.5131595160726589, + "grad_norm": 7.316429615020752, + "learning_rate": 3.5315698285010475e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.84514549523592, + "num_tokens": 199203119.0, + "step": 165540 + }, + { + "entropy": 1.925460186600685, + "epoch": 0.5131905151977085, + "grad_norm": 8.755541801452637, + "learning_rate": 3.531463164517535e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8621005952358246, + "num_tokens": 199214608.0, + "step": 165550 + }, + { + "entropy": 1.9230826959013938, + "epoch": 0.5132215143227582, + "grad_norm": 4.1020636558532715, + "learning_rate": 3.531356510198154e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8598949074745178, + "num_tokens": 199226632.0, + "step": 165560 + }, + { + "entropy": 1.801899181306362, + "epoch": 0.513252513447808, + "grad_norm": 8.648043632507324, + "learning_rate": 3.5312498655414447e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8700105547904968, + "num_tokens": 199239560.0, + "step": 165570 + }, + { + "entropy": 1.8442170739173889, + "epoch": 0.5132835125728576, + "grad_norm": 3.063127040863037, + "learning_rate": 3.531143230545949e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8664918586611747, + "num_tokens": 199251263.0, + "step": 165580 + }, + { + "entropy": 1.784967178106308, + "epoch": 0.5133145116979073, + "grad_norm": 8.657151222229004, + "learning_rate": 3.5310366052102076e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8673649072647095, + "num_tokens": 199263888.0, + "step": 165590 + }, + { + "entropy": 1.7379182785749436, + "epoch": 0.513345510822957, + "grad_norm": 3.7212421894073486, + "learning_rate": 3.530929989532762e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8674047499895096, + "num_tokens": 199277870.0, + "step": 165600 + }, + { + "entropy": 1.749285961687565, + "epoch": 0.5133765099480068, + "grad_norm": 2.982008934020996, + "learning_rate": 3.5308233835121555e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8672578066587449, + "num_tokens": 199292780.0, + "step": 165610 + }, + { + "entropy": 1.8962912052869796, + "epoch": 0.5134075090730564, + "grad_norm": 8.388097763061523, + "learning_rate": 3.530716787146928e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8507509827613831, + "num_tokens": 199304340.0, + "step": 165620 + }, + { + "entropy": 1.835693357884884, + "epoch": 0.5134385081981061, + "grad_norm": 8.318591117858887, + "learning_rate": 3.5306102004356242e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8652938485145569, + "num_tokens": 199317295.0, + "step": 165630 + }, + { + "entropy": 1.8470296934247017, + "epoch": 0.5134695073231558, + "grad_norm": 8.046967506408691, + "learning_rate": 3.5305036233767865e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8550007939338684, + "num_tokens": 199330149.0, + "step": 165640 + }, + { + "entropy": 1.8552626207470895, + "epoch": 0.5135005064482056, + "grad_norm": 7.412757396697998, + "learning_rate": 3.5303970559689575e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8597292914986611, + "num_tokens": 199343185.0, + "step": 165650 + }, + { + "entropy": 1.7761410742998123, + "epoch": 0.5135315055732552, + "grad_norm": 8.163668632507324, + "learning_rate": 3.5302904982106816e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8719475150108338, + "num_tokens": 199356286.0, + "step": 165660 + }, + { + "entropy": 1.8710751131176948, + "epoch": 0.5135625046983049, + "grad_norm": 4.116541385650635, + "learning_rate": 3.530183950100502e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8619105949997902, + "num_tokens": 199368052.0, + "step": 165670 + }, + { + "entropy": 1.94207361638546, + "epoch": 0.5135935038233546, + "grad_norm": 7.243650913238525, + "learning_rate": 3.530077411636963e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.843024018406868, + "num_tokens": 199379570.0, + "step": 165680 + }, + { + "entropy": 1.9412450224161149, + "epoch": 0.5136245029484042, + "grad_norm": 7.0396013259887695, + "learning_rate": 3.529970882818609e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8452283650636673, + "num_tokens": 199390522.0, + "step": 165690 + }, + { + "entropy": 1.8934280395507812, + "epoch": 0.513655502073454, + "grad_norm": 9.029003143310547, + "learning_rate": 3.529864363643985e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.855101665854454, + "num_tokens": 199402784.0, + "step": 165700 + }, + { + "entropy": 1.9106672063469887, + "epoch": 0.5136865011985037, + "grad_norm": 9.377785682678223, + "learning_rate": 3.529757854111636e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8599131286144257, + "num_tokens": 199413917.0, + "step": 165710 + }, + { + "entropy": 1.8733697950839996, + "epoch": 0.5137175003235533, + "grad_norm": 8.300299644470215, + "learning_rate": 3.529651354220107e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8556776940822601, + "num_tokens": 199426429.0, + "step": 165720 + }, + { + "entropy": 1.9104059755802154, + "epoch": 0.513748499448603, + "grad_norm": 3.4120521545410156, + "learning_rate": 3.5295448639679436e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8444541275501252, + "num_tokens": 199438710.0, + "step": 165730 + }, + { + "entropy": 1.8936485648155212, + "epoch": 0.5137794985736528, + "grad_norm": 3.6589162349700928, + "learning_rate": 3.529438383353692e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.851848429441452, + "num_tokens": 199449881.0, + "step": 165740 + }, + { + "entropy": 1.9428910300135613, + "epoch": 0.5138104976987025, + "grad_norm": 8.218547821044922, + "learning_rate": 3.529331912375899e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8518887877464294, + "num_tokens": 199460975.0, + "step": 165750 + }, + { + "entropy": 1.8341187611222267, + "epoch": 0.5138414968237521, + "grad_norm": 4.086446762084961, + "learning_rate": 3.529225451033111e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8616880744695663, + "num_tokens": 199473647.0, + "step": 165760 + }, + { + "entropy": 1.8468362540006638, + "epoch": 0.5138724959488018, + "grad_norm": 5.933652400970459, + "learning_rate": 3.529118999323874e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8481451705098152, + "num_tokens": 199485712.0, + "step": 165770 + }, + { + "entropy": 1.9439073607325554, + "epoch": 0.5139034950738516, + "grad_norm": 8.484124183654785, + "learning_rate": 3.529012557246736e-06, + "loss": 0.506, + "mean_token_accuracy": 0.8435920670628547, + "num_tokens": 199497344.0, + "step": 165780 + }, + { + "entropy": 1.9494817286729813, + "epoch": 0.5139344941989012, + "grad_norm": 7.470255374908447, + "learning_rate": 3.528906124800245e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.8448470056056976, + "num_tokens": 199508098.0, + "step": 165790 + }, + { + "entropy": 1.8470247462391853, + "epoch": 0.5139654933239509, + "grad_norm": 8.678824424743652, + "learning_rate": 3.528799701982948e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.852811923623085, + "num_tokens": 199520795.0, + "step": 165800 + }, + { + "entropy": 1.8924536779522896, + "epoch": 0.5139964924490006, + "grad_norm": 7.182823657989502, + "learning_rate": 3.528693288793393e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8532521143555641, + "num_tokens": 199532460.0, + "step": 165810 + }, + { + "entropy": 1.7975892141461371, + "epoch": 0.5140274915740504, + "grad_norm": 9.009475708007812, + "learning_rate": 3.5285868852301284e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8550382882356644, + "num_tokens": 199545495.0, + "step": 165820 + }, + { + "entropy": 1.9081748649477959, + "epoch": 0.5140584906991, + "grad_norm": 7.362851142883301, + "learning_rate": 3.5284804912917044e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.86656956076622, + "num_tokens": 199556576.0, + "step": 165830 + }, + { + "entropy": 1.8292412385344505, + "epoch": 0.5140894898241497, + "grad_norm": 7.867893218994141, + "learning_rate": 3.5283741069766682e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8617888554930687, + "num_tokens": 199568805.0, + "step": 165840 + }, + { + "entropy": 1.8667304873466493, + "epoch": 0.5141204889491994, + "grad_norm": 9.72758960723877, + "learning_rate": 3.5282677322835697e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8470277801156044, + "num_tokens": 199580316.0, + "step": 165850 + }, + { + "entropy": 1.8580066189169884, + "epoch": 0.5141514880742492, + "grad_norm": 7.151798725128174, + "learning_rate": 3.528161367210959e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8555332809686661, + "num_tokens": 199592287.0, + "step": 165860 + }, + { + "entropy": 1.8838511526584625, + "epoch": 0.5141824871992988, + "grad_norm": 9.418664932250977, + "learning_rate": 3.528055011757387e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.839755679666996, + "num_tokens": 199604689.0, + "step": 165870 + }, + { + "entropy": 1.8362286701798438, + "epoch": 0.5142134863243485, + "grad_norm": 9.759264945983887, + "learning_rate": 3.527948665921401e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8611894577741623, + "num_tokens": 199617535.0, + "step": 165880 + }, + { + "entropy": 1.8877715915441513, + "epoch": 0.5142444854493982, + "grad_norm": 5.549670219421387, + "learning_rate": 3.5278423297015547e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8523763597011567, + "num_tokens": 199629442.0, + "step": 165890 + }, + { + "entropy": 1.9229825258255004, + "epoch": 0.5142754845744479, + "grad_norm": 7.350560188293457, + "learning_rate": 3.527736003096397e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8494596913456917, + "num_tokens": 199641510.0, + "step": 165900 + }, + { + "entropy": 1.9444432079792022, + "epoch": 0.5143064836994976, + "grad_norm": 8.653480529785156, + "learning_rate": 3.52762968610448e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8489970117807388, + "num_tokens": 199652436.0, + "step": 165910 + }, + { + "entropy": 1.8965161561965942, + "epoch": 0.5143374828245473, + "grad_norm": 8.42867660522461, + "learning_rate": 3.527523378724355e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8526179268956184, + "num_tokens": 199664868.0, + "step": 165920 + }, + { + "entropy": 1.9393030047416686, + "epoch": 0.514368481949597, + "grad_norm": 8.376232147216797, + "learning_rate": 3.527417080954574e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8461043626070023, + "num_tokens": 199675872.0, + "step": 165930 + }, + { + "entropy": 1.938021996617317, + "epoch": 0.5143994810746466, + "grad_norm": 6.738912582397461, + "learning_rate": 3.527310792793688e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8560506254434586, + "num_tokens": 199687256.0, + "step": 165940 + }, + { + "entropy": 1.8744206488132478, + "epoch": 0.5144304801996964, + "grad_norm": 3.8372957706451416, + "learning_rate": 3.5272045142402507e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8504655987024308, + "num_tokens": 199699428.0, + "step": 165950 + }, + { + "entropy": 1.7658538281917573, + "epoch": 0.5144614793247461, + "grad_norm": 8.591163635253906, + "learning_rate": 3.527098245292814e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8597464576363564, + "num_tokens": 199713480.0, + "step": 165960 + }, + { + "entropy": 1.8621069818735123, + "epoch": 0.5144924784497957, + "grad_norm": 8.060001373291016, + "learning_rate": 3.5269919859499325e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8680314958095551, + "num_tokens": 199725158.0, + "step": 165970 + }, + { + "entropy": 1.985747216641903, + "epoch": 0.5145234775748454, + "grad_norm": 8.78643798828125, + "learning_rate": 3.5268857362101573e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.8432222366333008, + "num_tokens": 199736883.0, + "step": 165980 + }, + { + "entropy": 1.90114316791296, + "epoch": 0.5145544766998952, + "grad_norm": 8.598512649536133, + "learning_rate": 3.5267794960720435e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8427591428160668, + "num_tokens": 199748636.0, + "step": 165990 + }, + { + "entropy": 1.8595915861427783, + "epoch": 0.5145854758249448, + "grad_norm": 8.431171417236328, + "learning_rate": 3.526673265534144e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8530223548412323, + "num_tokens": 199760844.0, + "step": 166000 + }, + { + "entropy": 1.9187236800789833, + "epoch": 0.5146164749499945, + "grad_norm": 9.313626289367676, + "learning_rate": 3.526567044595014e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8506917625665664, + "num_tokens": 199772512.0, + "step": 166010 + }, + { + "entropy": 1.894215178489685, + "epoch": 0.5146474740750442, + "grad_norm": 4.336549282073975, + "learning_rate": 3.526460833253208e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8501930445432663, + "num_tokens": 199784483.0, + "step": 166020 + }, + { + "entropy": 1.8991673409938812, + "epoch": 0.514678473200094, + "grad_norm": 8.60685920715332, + "learning_rate": 3.5263546315072805e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8552577182650566, + "num_tokens": 199795866.0, + "step": 166030 + }, + { + "entropy": 1.9237390816211701, + "epoch": 0.5147094723251436, + "grad_norm": 11.135947227478027, + "learning_rate": 3.5262484393557867e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8558558940887451, + "num_tokens": 199806932.0, + "step": 166040 + }, + { + "entropy": 1.9094901964068414, + "epoch": 0.5147404714501933, + "grad_norm": 3.4820947647094727, + "learning_rate": 3.526142256797282e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8509971618652343, + "num_tokens": 199818174.0, + "step": 166050 + }, + { + "entropy": 1.9149621576070786, + "epoch": 0.514771470575243, + "grad_norm": 7.39802360534668, + "learning_rate": 3.5260360838303213e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8556697189807891, + "num_tokens": 199829446.0, + "step": 166060 + }, + { + "entropy": 1.9035993307828902, + "epoch": 0.5148024697002928, + "grad_norm": 8.023327827453613, + "learning_rate": 3.525929920453463e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8653369784355164, + "num_tokens": 199840994.0, + "step": 166070 + }, + { + "entropy": 1.8855173870921136, + "epoch": 0.5148334688253424, + "grad_norm": 4.079839706420898, + "learning_rate": 3.525823766665261e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8557475224137306, + "num_tokens": 199853342.0, + "step": 166080 + }, + { + "entropy": 1.832002504169941, + "epoch": 0.5148644679503921, + "grad_norm": 8.041168212890625, + "learning_rate": 3.525717622464274e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8482832551002503, + "num_tokens": 199866649.0, + "step": 166090 + }, + { + "entropy": 1.806608146429062, + "epoch": 0.5148954670754418, + "grad_norm": 7.931125640869141, + "learning_rate": 3.525611487849057e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8658397302031517, + "num_tokens": 199879430.0, + "step": 166100 + }, + { + "entropy": 1.8772173956036569, + "epoch": 0.5149264662004915, + "grad_norm": 6.945058345794678, + "learning_rate": 3.525505362818169e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8555058270692826, + "num_tokens": 199891989.0, + "step": 166110 + }, + { + "entropy": 1.9100875377655029, + "epoch": 0.5149574653255412, + "grad_norm": 6.640747547149658, + "learning_rate": 3.5253992473701666e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.861659274995327, + "num_tokens": 199903727.0, + "step": 166120 + }, + { + "entropy": 1.853276364505291, + "epoch": 0.5149884644505909, + "grad_norm": 6.926969051361084, + "learning_rate": 3.525293141503608e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8639653757214546, + "num_tokens": 199915510.0, + "step": 166130 + }, + { + "entropy": 1.8948578789830208, + "epoch": 0.5150194635756405, + "grad_norm": 9.782580375671387, + "learning_rate": 3.525187045217052e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8522849783301354, + "num_tokens": 199927351.0, + "step": 166140 + }, + { + "entropy": 1.8471661388874054, + "epoch": 0.5150504627006903, + "grad_norm": 10.758081436157227, + "learning_rate": 3.5250809585090555e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8511803165078163, + "num_tokens": 199940212.0, + "step": 166150 + }, + { + "entropy": 1.9439245939254761, + "epoch": 0.51508146182574, + "grad_norm": 7.815676212310791, + "learning_rate": 3.524974881378178e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8520305082201958, + "num_tokens": 199951112.0, + "step": 166160 + }, + { + "entropy": 1.9194507017731666, + "epoch": 0.5151124609507897, + "grad_norm": 8.278312683105469, + "learning_rate": 3.52486881382298e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8619399756193161, + "num_tokens": 199962297.0, + "step": 166170 + }, + { + "entropy": 1.9504115760326386, + "epoch": 0.5151434600758393, + "grad_norm": 9.44292163848877, + "learning_rate": 3.5247627558420196e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8505216121673584, + "num_tokens": 199972541.0, + "step": 166180 + }, + { + "entropy": 1.8769857451319694, + "epoch": 0.515174459200889, + "grad_norm": 3.786944627761841, + "learning_rate": 3.5246567074338563e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8582519263029098, + "num_tokens": 199984035.0, + "step": 166190 + }, + { + "entropy": 1.877619171142578, + "epoch": 0.5152054583259388, + "grad_norm": 4.849076271057129, + "learning_rate": 3.524550668597051e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8577707931399345, + "num_tokens": 199996112.0, + "step": 166200 + }, + { + "entropy": 1.9202482283115387, + "epoch": 0.5152364574509884, + "grad_norm": 3.6988606452941895, + "learning_rate": 3.5244446393301628e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8513051331043243, + "num_tokens": 200007656.0, + "step": 166210 + }, + { + "entropy": 1.829783384501934, + "epoch": 0.5152674565760381, + "grad_norm": 7.577276229858398, + "learning_rate": 3.5243386196317535e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8670179888606071, + "num_tokens": 200019989.0, + "step": 166220 + }, + { + "entropy": 1.8723140776157379, + "epoch": 0.5152984557010878, + "grad_norm": 7.912343978881836, + "learning_rate": 3.5242326095003844e-06, + "loss": 0.456, + "mean_token_accuracy": 0.85295629799366, + "num_tokens": 200032038.0, + "step": 166230 + }, + { + "entropy": 1.8697478756308556, + "epoch": 0.5153294548261376, + "grad_norm": 3.890298366546631, + "learning_rate": 3.5241266089346147e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8689236909151077, + "num_tokens": 200044324.0, + "step": 166240 + }, + { + "entropy": 1.8814658299088478, + "epoch": 0.5153604539511872, + "grad_norm": 3.827788829803467, + "learning_rate": 3.524020617933008e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.858353029191494, + "num_tokens": 200056180.0, + "step": 166250 + }, + { + "entropy": 1.8012808829545974, + "epoch": 0.5153914530762369, + "grad_norm": 7.109594345092773, + "learning_rate": 3.5239146364941247e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8628587678074837, + "num_tokens": 200069667.0, + "step": 166260 + }, + { + "entropy": 1.9676722317934037, + "epoch": 0.5154224522012866, + "grad_norm": 10.758739471435547, + "learning_rate": 3.5238086646165283e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8481611773371697, + "num_tokens": 200080634.0, + "step": 166270 + }, + { + "entropy": 1.9452502936124803, + "epoch": 0.5154534513263364, + "grad_norm": 9.257121086120605, + "learning_rate": 3.5237027022987795e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.849012990295887, + "num_tokens": 200091838.0, + "step": 166280 + }, + { + "entropy": 1.8852812498807907, + "epoch": 0.515484450451386, + "grad_norm": 9.17672061920166, + "learning_rate": 3.523596749539443e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8530494660139084, + "num_tokens": 200103554.0, + "step": 166290 + }, + { + "entropy": 1.8659416556358337, + "epoch": 0.5155154495764357, + "grad_norm": 9.371847152709961, + "learning_rate": 3.5234908063370803e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8579873785376548, + "num_tokens": 200115307.0, + "step": 166300 + }, + { + "entropy": 1.8628961145877838, + "epoch": 0.5155464487014854, + "grad_norm": 9.57691764831543, + "learning_rate": 3.5233848726902554e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8588206008076668, + "num_tokens": 200127125.0, + "step": 166310 + }, + { + "entropy": 1.8160467877984048, + "epoch": 0.5155774478265351, + "grad_norm": 6.197166919708252, + "learning_rate": 3.5232789485975323e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8625551044940949, + "num_tokens": 200140225.0, + "step": 166320 + }, + { + "entropy": 1.905704265832901, + "epoch": 0.5156084469515848, + "grad_norm": 10.474200248718262, + "learning_rate": 3.523173034057474e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.8451021924614907, + "num_tokens": 200152276.0, + "step": 166330 + }, + { + "entropy": 1.887017984688282, + "epoch": 0.5156394460766345, + "grad_norm": 8.072344779968262, + "learning_rate": 3.523067129068646e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8550172179937363, + "num_tokens": 200164173.0, + "step": 166340 + }, + { + "entropy": 1.8533250823616982, + "epoch": 0.5156704452016841, + "grad_norm": 3.522214651107788, + "learning_rate": 3.5229612336296113e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8682062104344368, + "num_tokens": 200176357.0, + "step": 166350 + }, + { + "entropy": 1.9086049854755402, + "epoch": 0.5157014443267339, + "grad_norm": 7.360813140869141, + "learning_rate": 3.522855347738936e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8604349881410599, + "num_tokens": 200188158.0, + "step": 166360 + }, + { + "entropy": 1.875716508924961, + "epoch": 0.5157324434517836, + "grad_norm": 6.441467761993408, + "learning_rate": 3.5227494713951847e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8681019887328147, + "num_tokens": 200200162.0, + "step": 166370 + }, + { + "entropy": 1.776827821880579, + "epoch": 0.5157634425768333, + "grad_norm": 2.422685146331787, + "learning_rate": 3.522643604596923e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8635517880320549, + "num_tokens": 200213370.0, + "step": 166380 + }, + { + "entropy": 1.8618718206882476, + "epoch": 0.5157944417018829, + "grad_norm": 3.5454483032226562, + "learning_rate": 3.522537747342717e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8620031788945198, + "num_tokens": 200225275.0, + "step": 166390 + }, + { + "entropy": 1.906214900314808, + "epoch": 0.5158254408269327, + "grad_norm": 9.334129333496094, + "learning_rate": 3.5224318996311324e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8507939204573631, + "num_tokens": 200236830.0, + "step": 166400 + }, + { + "entropy": 1.9781831055879593, + "epoch": 0.5158564399519824, + "grad_norm": 7.7256340980529785, + "learning_rate": 3.5223260614607365e-06, + "loss": 0.5335, + "mean_token_accuracy": 0.8375950440764427, + "num_tokens": 200247890.0, + "step": 166410 + }, + { + "entropy": 1.9732432961463928, + "epoch": 0.515887439077032, + "grad_norm": 9.70375919342041, + "learning_rate": 3.5222202328300936e-06, + "loss": 0.5296, + "mean_token_accuracy": 0.8345681294798851, + "num_tokens": 200259455.0, + "step": 166420 + }, + { + "entropy": 1.86635515242815, + "epoch": 0.5159184382020817, + "grad_norm": 3.7832741737365723, + "learning_rate": 3.5221144137377727e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8658062994480134, + "num_tokens": 200271878.0, + "step": 166430 + }, + { + "entropy": 1.8391347080469131, + "epoch": 0.5159494373271314, + "grad_norm": 3.3010501861572266, + "learning_rate": 3.522008604182341e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8555727571249008, + "num_tokens": 200284953.0, + "step": 166440 + }, + { + "entropy": 1.8243245914578439, + "epoch": 0.5159804364521812, + "grad_norm": 7.198421955108643, + "learning_rate": 3.521902804162365e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8652743324637413, + "num_tokens": 200298321.0, + "step": 166450 + }, + { + "entropy": 1.9128145158290863, + "epoch": 0.5160114355772308, + "grad_norm": 9.630524635314941, + "learning_rate": 3.5217970136764134e-06, + "loss": 0.461, + "mean_token_accuracy": 0.842883138358593, + "num_tokens": 200309586.0, + "step": 166460 + }, + { + "entropy": 1.923278383910656, + "epoch": 0.5160424347022805, + "grad_norm": 8.430140495300293, + "learning_rate": 3.5216912327230545e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8511732935905456, + "num_tokens": 200321291.0, + "step": 166470 + }, + { + "entropy": 1.779987198114395, + "epoch": 0.5160734338273302, + "grad_norm": 7.797722816467285, + "learning_rate": 3.521585461300856e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8620175883173943, + "num_tokens": 200334275.0, + "step": 166480 + }, + { + "entropy": 1.9071402192115783, + "epoch": 0.51610443295238, + "grad_norm": 8.116595268249512, + "learning_rate": 3.521479699408388e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8501278668642044, + "num_tokens": 200346105.0, + "step": 166490 + }, + { + "entropy": 1.8801355749368667, + "epoch": 0.5161354320774296, + "grad_norm": 5.864742279052734, + "learning_rate": 3.5213739470442176e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8512676224112511, + "num_tokens": 200357894.0, + "step": 166500 + }, + { + "entropy": 1.8980636432766915, + "epoch": 0.5161664312024793, + "grad_norm": 8.04993724822998, + "learning_rate": 3.5212682042069157e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8641004547476768, + "num_tokens": 200369625.0, + "step": 166510 + }, + { + "entropy": 1.8364667430520059, + "epoch": 0.516197430327529, + "grad_norm": 12.634716033935547, + "learning_rate": 3.5211624708950515e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8635191649198533, + "num_tokens": 200381515.0, + "step": 166520 + }, + { + "entropy": 1.924870501458645, + "epoch": 0.5162284294525787, + "grad_norm": 8.887164115905762, + "learning_rate": 3.5210567471071962e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8511701777577401, + "num_tokens": 200392831.0, + "step": 166530 + }, + { + "entropy": 1.91126778870821, + "epoch": 0.5162594285776284, + "grad_norm": 8.9415922164917, + "learning_rate": 3.5209510328419174e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8611420899629593, + "num_tokens": 200404326.0, + "step": 166540 + }, + { + "entropy": 1.8403394103050232, + "epoch": 0.5162904277026781, + "grad_norm": 7.393084526062012, + "learning_rate": 3.520845328097788e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8644649833440781, + "num_tokens": 200417037.0, + "step": 166550 + }, + { + "entropy": 1.8981595396995545, + "epoch": 0.5163214268277277, + "grad_norm": 10.253661155700684, + "learning_rate": 3.520739632873378e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8498363941907883, + "num_tokens": 200428816.0, + "step": 166560 + }, + { + "entropy": 1.8573830232024193, + "epoch": 0.5163524259527775, + "grad_norm": 7.009857177734375, + "learning_rate": 3.5206339471672583e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8623615726828575, + "num_tokens": 200440760.0, + "step": 166570 + }, + { + "entropy": 1.904543998837471, + "epoch": 0.5163834250778272, + "grad_norm": 7.801250457763672, + "learning_rate": 3.5205282709780015e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8442154437303543, + "num_tokens": 200452685.0, + "step": 166580 + }, + { + "entropy": 1.8602082580327988, + "epoch": 0.5164144242028769, + "grad_norm": 4.4240946769714355, + "learning_rate": 3.5204226043041776e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8587385684251785, + "num_tokens": 200464705.0, + "step": 166590 + }, + { + "entropy": 1.875721175968647, + "epoch": 0.5164454233279265, + "grad_norm": 3.9632303714752197, + "learning_rate": 3.520316947144361e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8631686121225357, + "num_tokens": 200476790.0, + "step": 166600 + }, + { + "entropy": 1.9048528328537941, + "epoch": 0.5164764224529763, + "grad_norm": 7.584252834320068, + "learning_rate": 3.520211299497122e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8571887835860252, + "num_tokens": 200488053.0, + "step": 166610 + }, + { + "entropy": 1.8573243141174316, + "epoch": 0.516507421578026, + "grad_norm": 5.636211395263672, + "learning_rate": 3.5201056613610347e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8564253136515617, + "num_tokens": 200499812.0, + "step": 166620 + }, + { + "entropy": 1.9138450264930724, + "epoch": 0.5165384207030757, + "grad_norm": 8.784363746643066, + "learning_rate": 3.5200000327346714e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8483364924788475, + "num_tokens": 200511806.0, + "step": 166630 + }, + { + "entropy": 1.914737243950367, + "epoch": 0.5165694198281253, + "grad_norm": 3.4447364807128906, + "learning_rate": 3.5198944136166048e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.838789090514183, + "num_tokens": 200523241.0, + "step": 166640 + }, + { + "entropy": 1.7461669012904166, + "epoch": 0.5166004189531751, + "grad_norm": 3.7405452728271484, + "learning_rate": 3.5197888040054094e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8775429561734199, + "num_tokens": 200537167.0, + "step": 166650 + }, + { + "entropy": 1.8424347892403603, + "epoch": 0.5166314180782248, + "grad_norm": 9.276825904846191, + "learning_rate": 3.519683203899659e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8487263694405556, + "num_tokens": 200549567.0, + "step": 166660 + }, + { + "entropy": 1.8613199979066848, + "epoch": 0.5166624172032744, + "grad_norm": 7.739795684814453, + "learning_rate": 3.5195776132979283e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8574581235647202, + "num_tokens": 200561757.0, + "step": 166670 + }, + { + "entropy": 1.8857336401939393, + "epoch": 0.5166934163283241, + "grad_norm": 7.0130696296691895, + "learning_rate": 3.5194720321987894e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8589113280177116, + "num_tokens": 200573813.0, + "step": 166680 + }, + { + "entropy": 1.889016604423523, + "epoch": 0.5167244154533738, + "grad_norm": 7.985410690307617, + "learning_rate": 3.519366460600821e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8589774370193481, + "num_tokens": 200584955.0, + "step": 166690 + }, + { + "entropy": 1.8742096453905106, + "epoch": 0.5167554145784236, + "grad_norm": 3.954725980758667, + "learning_rate": 3.519260898502594e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8560954242944717, + "num_tokens": 200597682.0, + "step": 166700 + }, + { + "entropy": 1.865584236383438, + "epoch": 0.5167864137034732, + "grad_norm": 8.845154762268066, + "learning_rate": 3.519155345902687e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8622621670365334, + "num_tokens": 200609258.0, + "step": 166710 + }, + { + "entropy": 1.9222540989518166, + "epoch": 0.5168174128285229, + "grad_norm": 4.905745029449463, + "learning_rate": 3.5190498027996738e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8524176865816117, + "num_tokens": 200620245.0, + "step": 166720 + }, + { + "entropy": 1.83238478153944, + "epoch": 0.5168484119535726, + "grad_norm": 8.726923942565918, + "learning_rate": 3.5189442691921306e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8474073141813279, + "num_tokens": 200632277.0, + "step": 166730 + }, + { + "entropy": 1.9488019853830338, + "epoch": 0.5168794110786223, + "grad_norm": 7.9495368003845215, + "learning_rate": 3.5188387450786355e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8537901550531387, + "num_tokens": 200642847.0, + "step": 166740 + }, + { + "entropy": 1.8265003859996796, + "epoch": 0.516910410203672, + "grad_norm": 8.291993141174316, + "learning_rate": 3.5187332304577628e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8603555530309677, + "num_tokens": 200654847.0, + "step": 166750 + }, + { + "entropy": 1.7454282835125923, + "epoch": 0.5169414093287217, + "grad_norm": 3.814364433288574, + "learning_rate": 3.5186277253280903e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8665547743439674, + "num_tokens": 200669003.0, + "step": 166760 + }, + { + "entropy": 1.8317288614809513, + "epoch": 0.5169724084537713, + "grad_norm": 3.715165853500366, + "learning_rate": 3.5185222296881947e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8635482504963875, + "num_tokens": 200681134.0, + "step": 166770 + }, + { + "entropy": 1.8476212650537491, + "epoch": 0.5170034075788211, + "grad_norm": 8.774842262268066, + "learning_rate": 3.518416743536654e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8514179885387421, + "num_tokens": 200693765.0, + "step": 166780 + }, + { + "entropy": 1.9044752269983292, + "epoch": 0.5170344067038708, + "grad_norm": 8.217805862426758, + "learning_rate": 3.518311266872046e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8520160347223282, + "num_tokens": 200705265.0, + "step": 166790 + }, + { + "entropy": 1.849595281481743, + "epoch": 0.5170654058289205, + "grad_norm": 8.451947212219238, + "learning_rate": 3.5182057996929488e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8606655448675156, + "num_tokens": 200718081.0, + "step": 166800 + }, + { + "entropy": 1.812503370642662, + "epoch": 0.5170964049539701, + "grad_norm": 8.556303024291992, + "learning_rate": 3.5181003419979404e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8502597719430923, + "num_tokens": 200730617.0, + "step": 166810 + }, + { + "entropy": 1.8947110012173654, + "epoch": 0.5171274040790199, + "grad_norm": 7.719273567199707, + "learning_rate": 3.5179948937855994e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8427142888307572, + "num_tokens": 200741235.0, + "step": 166820 + }, + { + "entropy": 1.8528358027338983, + "epoch": 0.5171584032040696, + "grad_norm": 7.755263805389404, + "learning_rate": 3.5178894550545055e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8589017793536187, + "num_tokens": 200753559.0, + "step": 166830 + }, + { + "entropy": 1.9601138323545455, + "epoch": 0.5171894023291193, + "grad_norm": 7.6968183517456055, + "learning_rate": 3.517784025803237e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.8391397684812546, + "num_tokens": 200764345.0, + "step": 166840 + }, + { + "entropy": 1.831675609946251, + "epoch": 0.5172204014541689, + "grad_norm": 7.837430000305176, + "learning_rate": 3.5176786060303745e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8554352253675461, + "num_tokens": 200776357.0, + "step": 166850 + }, + { + "entropy": 1.8829845905303955, + "epoch": 0.5172514005792187, + "grad_norm": 3.6687681674957275, + "learning_rate": 3.5175731957344964e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.848581674695015, + "num_tokens": 200788319.0, + "step": 166860 + }, + { + "entropy": 1.8463978335261344, + "epoch": 0.5172823997042684, + "grad_norm": 7.817187786102295, + "learning_rate": 3.5174677949141845e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8644368588924408, + "num_tokens": 200799628.0, + "step": 166870 + }, + { + "entropy": 1.864042194187641, + "epoch": 0.517313398829318, + "grad_norm": 8.473761558532715, + "learning_rate": 3.5173624035680187e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8542978420853615, + "num_tokens": 200811772.0, + "step": 166880 + }, + { + "entropy": 1.909403358399868, + "epoch": 0.5173443979543677, + "grad_norm": 10.229207992553711, + "learning_rate": 3.5172570216945785e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8571710214018822, + "num_tokens": 200822773.0, + "step": 166890 + }, + { + "entropy": 1.9418629288673401, + "epoch": 0.5173753970794175, + "grad_norm": 9.051170349121094, + "learning_rate": 3.517151649292447e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8507715001702308, + "num_tokens": 200833601.0, + "step": 166900 + }, + { + "entropy": 1.880239014327526, + "epoch": 0.5174063962044672, + "grad_norm": 9.583940505981445, + "learning_rate": 3.517046286360204e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8595744326710701, + "num_tokens": 200844933.0, + "step": 166910 + }, + { + "entropy": 1.8107650607824326, + "epoch": 0.5174373953295168, + "grad_norm": 8.987581253051758, + "learning_rate": 3.5169409328964315e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8517088457942009, + "num_tokens": 200857702.0, + "step": 166920 + }, + { + "entropy": 1.906441855430603, + "epoch": 0.5174683944545665, + "grad_norm": 8.563685417175293, + "learning_rate": 3.5168355888997115e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8531229227781296, + "num_tokens": 200869141.0, + "step": 166930 + }, + { + "entropy": 1.8778182238340377, + "epoch": 0.5174993935796162, + "grad_norm": 7.703993797302246, + "learning_rate": 3.5167302543686266e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8593733966350555, + "num_tokens": 200880696.0, + "step": 166940 + }, + { + "entropy": 1.8586980432271958, + "epoch": 0.5175303927046659, + "grad_norm": 4.391999244689941, + "learning_rate": 3.51662492930176e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8623529061675071, + "num_tokens": 200892668.0, + "step": 166950 + }, + { + "entropy": 1.9131682097911835, + "epoch": 0.5175613918297156, + "grad_norm": 8.268182754516602, + "learning_rate": 3.516519613697692e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8527122482657432, + "num_tokens": 200903497.0, + "step": 166960 + }, + { + "entropy": 1.8035447210073472, + "epoch": 0.5175923909547653, + "grad_norm": 7.8364386558532715, + "learning_rate": 3.5164143075550084e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8491023346781731, + "num_tokens": 200917014.0, + "step": 166970 + }, + { + "entropy": 1.8485938400030135, + "epoch": 0.517623390079815, + "grad_norm": 8.503170013427734, + "learning_rate": 3.5163090108722914e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8574503690004349, + "num_tokens": 200929733.0, + "step": 166980 + }, + { + "entropy": 1.8992557242512702, + "epoch": 0.5176543892048647, + "grad_norm": 7.843517303466797, + "learning_rate": 3.5162037236481246e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8507803022861481, + "num_tokens": 200941462.0, + "step": 166990 + }, + { + "entropy": 1.9413973063230514, + "epoch": 0.5176853883299144, + "grad_norm": 7.582324981689453, + "learning_rate": 3.5160984458810924e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8503754585981369, + "num_tokens": 200952501.0, + "step": 167000 + }, + { + "entropy": 1.8474327087402345, + "epoch": 0.5177163874549641, + "grad_norm": 8.536754608154297, + "learning_rate": 3.515993177569779e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8653739243745804, + "num_tokens": 200964740.0, + "step": 167010 + }, + { + "entropy": 1.915780645608902, + "epoch": 0.5177473865800137, + "grad_norm": 8.24149227142334, + "learning_rate": 3.5158879187127694e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8639548003673554, + "num_tokens": 200976228.0, + "step": 167020 + }, + { + "entropy": 1.920879889279604, + "epoch": 0.5177783857050635, + "grad_norm": 9.354827880859375, + "learning_rate": 3.515782669308648e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8478871122002601, + "num_tokens": 200988501.0, + "step": 167030 + }, + { + "entropy": 1.8832586228847503, + "epoch": 0.5178093848301132, + "grad_norm": 8.015664100646973, + "learning_rate": 3.5156774293559997e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8528868913650512, + "num_tokens": 201000535.0, + "step": 167040 + }, + { + "entropy": 1.8097984239459037, + "epoch": 0.5178403839551629, + "grad_norm": 7.848792552947998, + "learning_rate": 3.5155721988534107e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8547261565923691, + "num_tokens": 201013877.0, + "step": 167050 + }, + { + "entropy": 1.8996559455990791, + "epoch": 0.5178713830802125, + "grad_norm": 8.505248069763184, + "learning_rate": 3.515466977799467e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8700977265834808, + "num_tokens": 201025601.0, + "step": 167060 + }, + { + "entropy": 1.8885849609971046, + "epoch": 0.5179023822052623, + "grad_norm": 8.436034202575684, + "learning_rate": 3.5153617661927536e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.858351269364357, + "num_tokens": 201037162.0, + "step": 167070 + }, + { + "entropy": 1.8170109301805497, + "epoch": 0.517933381330312, + "grad_norm": 6.91679573059082, + "learning_rate": 3.5152565640318574e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8640385091304779, + "num_tokens": 201050080.0, + "step": 167080 + }, + { + "entropy": 1.915900157392025, + "epoch": 0.5179643804553616, + "grad_norm": 8.707968711853027, + "learning_rate": 3.515151371315366e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8495099931955338, + "num_tokens": 201061377.0, + "step": 167090 + }, + { + "entropy": 1.8400561198592187, + "epoch": 0.5179953795804113, + "grad_norm": 3.3338160514831543, + "learning_rate": 3.5150461880418655e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8588408932089806, + "num_tokens": 201074751.0, + "step": 167100 + }, + { + "entropy": 2.0018926441669462, + "epoch": 0.5180263787054611, + "grad_norm": 9.095269203186035, + "learning_rate": 3.5149410142099434e-06, + "loss": 0.5228, + "mean_token_accuracy": 0.8369523748755455, + "num_tokens": 201085008.0, + "step": 167110 + }, + { + "entropy": 1.896622897684574, + "epoch": 0.5180573778305108, + "grad_norm": 8.372537612915039, + "learning_rate": 3.5148358498181865e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8490730375051498, + "num_tokens": 201097253.0, + "step": 167120 + }, + { + "entropy": 1.9279163151979446, + "epoch": 0.5180883769555604, + "grad_norm": 6.603575229644775, + "learning_rate": 3.5147306948651838e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8467545539140702, + "num_tokens": 201109037.0, + "step": 167130 + }, + { + "entropy": 1.952432581782341, + "epoch": 0.5181193760806101, + "grad_norm": 8.363898277282715, + "learning_rate": 3.514625549349523e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8555149123072624, + "num_tokens": 201119894.0, + "step": 167140 + }, + { + "entropy": 1.9005773276090623, + "epoch": 0.5181503752056599, + "grad_norm": 8.650740623474121, + "learning_rate": 3.5145204132697925e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8549324363470078, + "num_tokens": 201131834.0, + "step": 167150 + }, + { + "entropy": 1.9701307892799378, + "epoch": 0.5181813743307095, + "grad_norm": 7.619329929351807, + "learning_rate": 3.5144152866245813e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8487115666270256, + "num_tokens": 201142782.0, + "step": 167160 + }, + { + "entropy": 1.9462131574749946, + "epoch": 0.5182123734557592, + "grad_norm": 8.687287330627441, + "learning_rate": 3.5143101694124783e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8393335893750191, + "num_tokens": 201154661.0, + "step": 167170 + }, + { + "entropy": 1.9058082491159438, + "epoch": 0.5182433725808089, + "grad_norm": 8.543585777282715, + "learning_rate": 3.5142050616320723e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8486988008022308, + "num_tokens": 201166535.0, + "step": 167180 + }, + { + "entropy": 1.9440076380968094, + "epoch": 0.5182743717058585, + "grad_norm": 9.032926559448242, + "learning_rate": 3.514099963281954e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8492290109395981, + "num_tokens": 201177837.0, + "step": 167190 + }, + { + "entropy": 1.9601941794157027, + "epoch": 0.5183053708309083, + "grad_norm": 8.802116394042969, + "learning_rate": 3.5139948743607123e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8613344877958298, + "num_tokens": 201188703.0, + "step": 167200 + }, + { + "entropy": 2.0091607570648193, + "epoch": 0.518336369955958, + "grad_norm": 7.016593933105469, + "learning_rate": 3.513889794866938e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8481083378195763, + "num_tokens": 201199672.0, + "step": 167210 + }, + { + "entropy": 1.963625544309616, + "epoch": 0.5183673690810077, + "grad_norm": 9.131065368652344, + "learning_rate": 3.5137847247992224e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8464767590165139, + "num_tokens": 201211201.0, + "step": 167220 + }, + { + "entropy": 1.9450436413288117, + "epoch": 0.5183983682060573, + "grad_norm": 8.431489944458008, + "learning_rate": 3.513679664156155e-06, + "loss": 0.5262, + "mean_token_accuracy": 0.8462486758828163, + "num_tokens": 201222474.0, + "step": 167230 + }, + { + "entropy": 1.9184205442667008, + "epoch": 0.5184293673311071, + "grad_norm": 8.032157897949219, + "learning_rate": 3.5135746129363267e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8471630290150642, + "num_tokens": 201233600.0, + "step": 167240 + }, + { + "entropy": 1.8578265473246574, + "epoch": 0.5184603664561568, + "grad_norm": 3.748180866241455, + "learning_rate": 3.5134695711383304e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8563891410827636, + "num_tokens": 201246423.0, + "step": 167250 + }, + { + "entropy": 1.7977075070142745, + "epoch": 0.5184913655812065, + "grad_norm": 8.292705535888672, + "learning_rate": 3.5133645387607567e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8734015017747879, + "num_tokens": 201259551.0, + "step": 167260 + }, + { + "entropy": 1.927210134267807, + "epoch": 0.5185223647062561, + "grad_norm": 8.34482479095459, + "learning_rate": 3.5132595158021987e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8532064571976662, + "num_tokens": 201270364.0, + "step": 167270 + }, + { + "entropy": 1.8977614745497704, + "epoch": 0.5185533638313059, + "grad_norm": 8.938633918762207, + "learning_rate": 3.5131545022612474e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8541785389184952, + "num_tokens": 201281740.0, + "step": 167280 + }, + { + "entropy": 1.940072876214981, + "epoch": 0.5185843629563556, + "grad_norm": 8.34633731842041, + "learning_rate": 3.5130494981364954e-06, + "loss": 0.5433, + "mean_token_accuracy": 0.8353075116872788, + "num_tokens": 201293729.0, + "step": 167290 + }, + { + "entropy": 1.9134965017437935, + "epoch": 0.5186153620814052, + "grad_norm": 7.574338912963867, + "learning_rate": 3.512944503426537e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8549482673406601, + "num_tokens": 201305349.0, + "step": 167300 + }, + { + "entropy": 1.8456029385328292, + "epoch": 0.5186463612064549, + "grad_norm": 10.16829776763916, + "learning_rate": 3.5128395181299646e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8634034737944603, + "num_tokens": 201317808.0, + "step": 167310 + }, + { + "entropy": 1.9166592717170716, + "epoch": 0.5186773603315047, + "grad_norm": 3.4636266231536865, + "learning_rate": 3.5127345422453706e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8501359283924103, + "num_tokens": 201329922.0, + "step": 167320 + }, + { + "entropy": 1.937665620446205, + "epoch": 0.5187083594565544, + "grad_norm": 8.84914779663086, + "learning_rate": 3.512629575771351e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8507133930921554, + "num_tokens": 201341333.0, + "step": 167330 + }, + { + "entropy": 1.8795202478766442, + "epoch": 0.518739358581604, + "grad_norm": 2.294635772705078, + "learning_rate": 3.5125246187064975e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8657633870840072, + "num_tokens": 201353605.0, + "step": 167340 + }, + { + "entropy": 1.8707646176218986, + "epoch": 0.5187703577066537, + "grad_norm": 8.930290222167969, + "learning_rate": 3.5124196710494057e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8540336638689041, + "num_tokens": 201366110.0, + "step": 167350 + }, + { + "entropy": 1.9009417802095414, + "epoch": 0.5188013568317035, + "grad_norm": 6.296438694000244, + "learning_rate": 3.5123147327986706e-06, + "loss": 0.5145, + "mean_token_accuracy": 0.8389176607131958, + "num_tokens": 201377996.0, + "step": 167360 + }, + { + "entropy": 1.8275167733430862, + "epoch": 0.5188323559567531, + "grad_norm": 8.323405265808105, + "learning_rate": 3.5122098039528868e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8666198328137398, + "num_tokens": 201390846.0, + "step": 167370 + }, + { + "entropy": 1.9348952636122703, + "epoch": 0.5188633550818028, + "grad_norm": 7.705176830291748, + "learning_rate": 3.5121048845106496e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8476896122097969, + "num_tokens": 201402387.0, + "step": 167380 + }, + { + "entropy": 1.8035302460193634, + "epoch": 0.5188943542068525, + "grad_norm": 9.087127685546875, + "learning_rate": 3.511999974470554e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8623087644577027, + "num_tokens": 201415169.0, + "step": 167390 + }, + { + "entropy": 1.9165479212999343, + "epoch": 0.5189253533319023, + "grad_norm": 6.9358439445495605, + "learning_rate": 3.5118950738311956e-06, + "loss": 0.449, + "mean_token_accuracy": 0.861064849793911, + "num_tokens": 201425956.0, + "step": 167400 + }, + { + "entropy": 1.8067945793271065, + "epoch": 0.5189563524569519, + "grad_norm": 9.967735290527344, + "learning_rate": 3.5117901825911716e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.860332365334034, + "num_tokens": 201438627.0, + "step": 167410 + }, + { + "entropy": 1.8174662292003632, + "epoch": 0.5189873515820016, + "grad_norm": 3.570725679397583, + "learning_rate": 3.511685300749078e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8613906666636467, + "num_tokens": 201450928.0, + "step": 167420 + }, + { + "entropy": 1.847064770758152, + "epoch": 0.5190183507070513, + "grad_norm": 4.677860736846924, + "learning_rate": 3.5115804283035115e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8492764979600906, + "num_tokens": 201463376.0, + "step": 167430 + }, + { + "entropy": 1.9042583480477333, + "epoch": 0.5190493498321009, + "grad_norm": 8.254162788391113, + "learning_rate": 3.5114755652530693e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8512088477611541, + "num_tokens": 201474321.0, + "step": 167440 + }, + { + "entropy": 1.8291652023792266, + "epoch": 0.5190803489571507, + "grad_norm": 2.466158866882324, + "learning_rate": 3.511370711596348e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8621661022305489, + "num_tokens": 201487177.0, + "step": 167450 + }, + { + "entropy": 1.771915179491043, + "epoch": 0.5191113480822004, + "grad_norm": 3.680634021759033, + "learning_rate": 3.511265867331945e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8703644022345542, + "num_tokens": 201500224.0, + "step": 167460 + }, + { + "entropy": 1.9247154101729393, + "epoch": 0.51914234720725, + "grad_norm": 8.170753479003906, + "learning_rate": 3.51116103245846e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.847488397359848, + "num_tokens": 201511452.0, + "step": 167470 + }, + { + "entropy": 1.8737490639090537, + "epoch": 0.5191733463322997, + "grad_norm": 8.452012062072754, + "learning_rate": 3.5110562069744893e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8589896142482758, + "num_tokens": 201522618.0, + "step": 167480 + }, + { + "entropy": 1.9660809606313705, + "epoch": 0.5192043454573495, + "grad_norm": 7.087366580963135, + "learning_rate": 3.510951390878632e-06, + "loss": 0.5226, + "mean_token_accuracy": 0.8420694902539253, + "num_tokens": 201533424.0, + "step": 167490 + }, + { + "entropy": 1.7874524354934693, + "epoch": 0.5192353445823992, + "grad_norm": 8.6710205078125, + "learning_rate": 3.5108465841694865e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8666209667921067, + "num_tokens": 201545720.0, + "step": 167500 + }, + { + "entropy": 1.8550549894571304, + "epoch": 0.5192663437074488, + "grad_norm": 6.358746528625488, + "learning_rate": 3.510741786845653e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8609934568405151, + "num_tokens": 201557543.0, + "step": 167510 + }, + { + "entropy": 1.774479240179062, + "epoch": 0.5192973428324985, + "grad_norm": 3.6877810955047607, + "learning_rate": 3.51063699890573e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.868420633673668, + "num_tokens": 201570575.0, + "step": 167520 + }, + { + "entropy": 1.7928667023777962, + "epoch": 0.5193283419575483, + "grad_norm": 7.632199764251709, + "learning_rate": 3.5105322203483174e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8707344710826874, + "num_tokens": 201582948.0, + "step": 167530 + }, + { + "entropy": 1.8913492798805236, + "epoch": 0.519359341082598, + "grad_norm": 7.02748441696167, + "learning_rate": 3.5104274511720142e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8536164119839669, + "num_tokens": 201594624.0, + "step": 167540 + }, + { + "entropy": 1.779433585703373, + "epoch": 0.5193903402076476, + "grad_norm": 5.0814528465271, + "learning_rate": 3.510322691375422e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8520729154348373, + "num_tokens": 201607799.0, + "step": 167550 + }, + { + "entropy": 1.8301453605294227, + "epoch": 0.5194213393326973, + "grad_norm": 7.968308448791504, + "learning_rate": 3.51021794095714e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8606723442673683, + "num_tokens": 201620325.0, + "step": 167560 + }, + { + "entropy": 1.823227186501026, + "epoch": 0.5194523384577471, + "grad_norm": 3.0040812492370605, + "learning_rate": 3.5101131999157707e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8648414790630341, + "num_tokens": 201632449.0, + "step": 167570 + }, + { + "entropy": 1.9500362485647202, + "epoch": 0.5194833375827967, + "grad_norm": 8.145763397216797, + "learning_rate": 3.5100084682499138e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8469778224825859, + "num_tokens": 201643547.0, + "step": 167580 + }, + { + "entropy": 1.8669037535786628, + "epoch": 0.5195143367078464, + "grad_norm": 10.10450267791748, + "learning_rate": 3.509903745958171e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8486629739403725, + "num_tokens": 201655893.0, + "step": 167590 + }, + { + "entropy": 1.8534766390919686, + "epoch": 0.5195453358328961, + "grad_norm": 6.937488079071045, + "learning_rate": 3.5097990330391435e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8601803749799728, + "num_tokens": 201668052.0, + "step": 167600 + }, + { + "entropy": 1.8165239058434963, + "epoch": 0.5195763349579459, + "grad_norm": 8.837699890136719, + "learning_rate": 3.5096943294914342e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.855919572710991, + "num_tokens": 201681132.0, + "step": 167610 + }, + { + "entropy": 1.8845690086483955, + "epoch": 0.5196073340829955, + "grad_norm": 6.574382305145264, + "learning_rate": 3.509589635313646e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8573358491063118, + "num_tokens": 201692782.0, + "step": 167620 + }, + { + "entropy": 1.8185426101088524, + "epoch": 0.5196383332080452, + "grad_norm": 4.730546474456787, + "learning_rate": 3.50948495050438e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8604342013597488, + "num_tokens": 201705173.0, + "step": 167630 + }, + { + "entropy": 1.834762555360794, + "epoch": 0.5196693323330949, + "grad_norm": 8.875571250915527, + "learning_rate": 3.509380275062239e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8564514756202698, + "num_tokens": 201717199.0, + "step": 167640 + }, + { + "entropy": 1.7287873297929763, + "epoch": 0.5197003314581446, + "grad_norm": 3.043661594390869, + "learning_rate": 3.5092756089858265e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8749393910169602, + "num_tokens": 201730596.0, + "step": 167650 + }, + { + "entropy": 1.885722841322422, + "epoch": 0.5197313305831943, + "grad_norm": 12.398810386657715, + "learning_rate": 3.509170952273747e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8453606396913529, + "num_tokens": 201742944.0, + "step": 167660 + }, + { + "entropy": 1.923583248257637, + "epoch": 0.519762329708244, + "grad_norm": 8.343000411987305, + "learning_rate": 3.5090663049246027e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8542603000998497, + "num_tokens": 201754679.0, + "step": 167670 + }, + { + "entropy": 1.868654875457287, + "epoch": 0.5197933288332937, + "grad_norm": 7.407617568969727, + "learning_rate": 3.508961666936999e-06, + "loss": 0.421, + "mean_token_accuracy": 0.854319129884243, + "num_tokens": 201766625.0, + "step": 167680 + }, + { + "entropy": 1.8509719505906106, + "epoch": 0.5198243279583433, + "grad_norm": 8.51540756225586, + "learning_rate": 3.5088570383095382e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8613230153918267, + "num_tokens": 201779781.0, + "step": 167690 + }, + { + "entropy": 1.9388750493526459, + "epoch": 0.5198553270833931, + "grad_norm": 8.581016540527344, + "learning_rate": 3.5087524190408275e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8547941818833351, + "num_tokens": 201791089.0, + "step": 167700 + }, + { + "entropy": 1.8844716548919678, + "epoch": 0.5198863262084428, + "grad_norm": 4.552313327789307, + "learning_rate": 3.5086478091294697e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8569486826658249, + "num_tokens": 201803147.0, + "step": 167710 + }, + { + "entropy": 1.8605488628149032, + "epoch": 0.5199173253334924, + "grad_norm": 3.5587663650512695, + "learning_rate": 3.5085432085740706e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8524827226996422, + "num_tokens": 201815695.0, + "step": 167720 + }, + { + "entropy": 1.9242730915546418, + "epoch": 0.5199483244585421, + "grad_norm": 7.837512493133545, + "learning_rate": 3.5084386173732365e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8563606560230255, + "num_tokens": 201826892.0, + "step": 167730 + }, + { + "entropy": 1.9117944180965423, + "epoch": 0.5199793235835919, + "grad_norm": 7.830265522003174, + "learning_rate": 3.508334035525572e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8568824425339698, + "num_tokens": 201837887.0, + "step": 167740 + }, + { + "entropy": 1.857511366903782, + "epoch": 0.5200103227086416, + "grad_norm": 7.367649555206299, + "learning_rate": 3.508229463029684e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8551246121525764, + "num_tokens": 201849875.0, + "step": 167750 + }, + { + "entropy": 1.915374681353569, + "epoch": 0.5200413218336912, + "grad_norm": 7.929410934448242, + "learning_rate": 3.5081248998841776e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8556568145751953, + "num_tokens": 201861837.0, + "step": 167760 + }, + { + "entropy": 1.8976527541875838, + "epoch": 0.5200723209587409, + "grad_norm": 8.157280921936035, + "learning_rate": 3.508020346087661e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8576169595122337, + "num_tokens": 201873619.0, + "step": 167770 + }, + { + "entropy": 1.7953968927264214, + "epoch": 0.5201033200837907, + "grad_norm": 4.682733535766602, + "learning_rate": 3.5079158016387403e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8547178015112877, + "num_tokens": 201887216.0, + "step": 167780 + }, + { + "entropy": 1.9149467661976813, + "epoch": 0.5201343192088403, + "grad_norm": 8.50699520111084, + "learning_rate": 3.5078112665360233e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8565734103322029, + "num_tokens": 201898719.0, + "step": 167790 + }, + { + "entropy": 1.9032021701335906, + "epoch": 0.52016531833389, + "grad_norm": 9.742694854736328, + "learning_rate": 3.507706740778117e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8466816678643226, + "num_tokens": 201910969.0, + "step": 167800 + }, + { + "entropy": 1.8507519498467446, + "epoch": 0.5201963174589397, + "grad_norm": 7.235980987548828, + "learning_rate": 3.507602224363628e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8613151669502258, + "num_tokens": 201922829.0, + "step": 167810 + }, + { + "entropy": 1.9132382899522782, + "epoch": 0.5202273165839895, + "grad_norm": 8.334747314453125, + "learning_rate": 3.507497717291167e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8361768543720245, + "num_tokens": 201933856.0, + "step": 167820 + }, + { + "entropy": 1.8912567496299744, + "epoch": 0.5202583157090391, + "grad_norm": 7.0671281814575195, + "learning_rate": 3.5073932195593413e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8508770078420639, + "num_tokens": 201945311.0, + "step": 167830 + }, + { + "entropy": 1.8898201130330563, + "epoch": 0.5202893148340888, + "grad_norm": 8.528030395507812, + "learning_rate": 3.5072887311667585e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8624457880854607, + "num_tokens": 201956923.0, + "step": 167840 + }, + { + "entropy": 1.9448716431856155, + "epoch": 0.5203203139591385, + "grad_norm": 6.0862531661987305, + "learning_rate": 3.5071842521120283e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8547197684645653, + "num_tokens": 201967672.0, + "step": 167850 + }, + { + "entropy": 1.9171678617596626, + "epoch": 0.5203513130841882, + "grad_norm": 7.810421466827393, + "learning_rate": 3.50707978239376e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.847561101615429, + "num_tokens": 201979178.0, + "step": 167860 + }, + { + "entropy": 1.9273208022117614, + "epoch": 0.5203823122092379, + "grad_norm": 6.643603801727295, + "learning_rate": 3.506975322010564e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8571745559573174, + "num_tokens": 201990250.0, + "step": 167870 + }, + { + "entropy": 1.8480944350361823, + "epoch": 0.5204133113342876, + "grad_norm": 9.434873580932617, + "learning_rate": 3.506870870961049e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8565628483891488, + "num_tokens": 202002760.0, + "step": 167880 + }, + { + "entropy": 1.928942036628723, + "epoch": 0.5204443104593373, + "grad_norm": 8.584845542907715, + "learning_rate": 3.506766429243825e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8476613745093345, + "num_tokens": 202014389.0, + "step": 167890 + }, + { + "entropy": 1.8081008911132812, + "epoch": 0.520475309584387, + "grad_norm": 9.22602367401123, + "learning_rate": 3.5066619968575027e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.857190066576004, + "num_tokens": 202027608.0, + "step": 167900 + }, + { + "entropy": 1.8879745230078697, + "epoch": 0.5205063087094367, + "grad_norm": 8.31643009185791, + "learning_rate": 3.5065575738006936e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.849236112833023, + "num_tokens": 202039567.0, + "step": 167910 + }, + { + "entropy": 1.8697897508740424, + "epoch": 0.5205373078344864, + "grad_norm": 8.23218059539795, + "learning_rate": 3.506453160072007e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8641917198896408, + "num_tokens": 202051156.0, + "step": 167920 + }, + { + "entropy": 1.9332033962011337, + "epoch": 0.520568306959536, + "grad_norm": 8.56834888458252, + "learning_rate": 3.5063487556700566e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8528130277991295, + "num_tokens": 202062489.0, + "step": 167930 + }, + { + "entropy": 1.8717573434114456, + "epoch": 0.5205993060845857, + "grad_norm": 8.18609619140625, + "learning_rate": 3.5062443605934516e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8530358478426934, + "num_tokens": 202074193.0, + "step": 167940 + }, + { + "entropy": 1.8864813312888145, + "epoch": 0.5206303052096355, + "grad_norm": 3.9505395889282227, + "learning_rate": 3.506139974840805e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8556156173348427, + "num_tokens": 202086671.0, + "step": 167950 + }, + { + "entropy": 1.8406436771154404, + "epoch": 0.5206613043346852, + "grad_norm": 9.141216278076172, + "learning_rate": 3.5060355984107285e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8611835777759552, + "num_tokens": 202099613.0, + "step": 167960 + }, + { + "entropy": 1.9403121635317802, + "epoch": 0.5206923034597348, + "grad_norm": 3.5786476135253906, + "learning_rate": 3.505931231301835e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8485002890229225, + "num_tokens": 202111350.0, + "step": 167970 + }, + { + "entropy": 1.9006236642599106, + "epoch": 0.5207233025847845, + "grad_norm": 9.575447082519531, + "learning_rate": 3.505826873512737e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8519483998417854, + "num_tokens": 202123308.0, + "step": 167980 + }, + { + "entropy": 1.8104294762015343, + "epoch": 0.5207543017098343, + "grad_norm": 8.087808609008789, + "learning_rate": 3.5057225250420484e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8671251326799393, + "num_tokens": 202136153.0, + "step": 167990 + }, + { + "entropy": 1.9393351331353188, + "epoch": 0.520785300834884, + "grad_norm": 10.509461402893066, + "learning_rate": 3.505618185888381e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8453001469373703, + "num_tokens": 202147809.0, + "step": 168000 + }, + { + "entropy": 1.8146055683493614, + "epoch": 0.5208162999599336, + "grad_norm": 8.376832962036133, + "learning_rate": 3.5055138560503493e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8654261186718941, + "num_tokens": 202161538.0, + "step": 168010 + }, + { + "entropy": 1.799776268005371, + "epoch": 0.5208472990849833, + "grad_norm": 5.0410614013671875, + "learning_rate": 3.5054095355265664e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8637054204940796, + "num_tokens": 202175257.0, + "step": 168020 + }, + { + "entropy": 1.884345107525587, + "epoch": 0.5208782982100331, + "grad_norm": 8.229707717895508, + "learning_rate": 3.5053052243156464e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8621705248951912, + "num_tokens": 202187329.0, + "step": 168030 + }, + { + "entropy": 1.9020549565553666, + "epoch": 0.5209092973350827, + "grad_norm": 7.426633358001709, + "learning_rate": 3.505200922416206e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8587285667657852, + "num_tokens": 202198454.0, + "step": 168040 + }, + { + "entropy": 1.850784559547901, + "epoch": 0.5209402964601324, + "grad_norm": 8.169835090637207, + "learning_rate": 3.5050966298268575e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8499960124492645, + "num_tokens": 202210755.0, + "step": 168050 + }, + { + "entropy": 1.9735260367393495, + "epoch": 0.5209712955851821, + "grad_norm": 7.9670538902282715, + "learning_rate": 3.5049923465462167e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.850350596010685, + "num_tokens": 202222203.0, + "step": 168060 + }, + { + "entropy": 1.761663031578064, + "epoch": 0.5210022947102319, + "grad_norm": 3.8146419525146484, + "learning_rate": 3.5048880725728984e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8592882245779038, + "num_tokens": 202236919.0, + "step": 168070 + }, + { + "entropy": 1.8373216532170773, + "epoch": 0.5210332938352815, + "grad_norm": 8.581892013549805, + "learning_rate": 3.50478380790552e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8676151230931282, + "num_tokens": 202249310.0, + "step": 168080 + }, + { + "entropy": 1.9933584868907928, + "epoch": 0.5210642929603312, + "grad_norm": 8.73966121673584, + "learning_rate": 3.5046795525426953e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8555369764566422, + "num_tokens": 202259808.0, + "step": 168090 + }, + { + "entropy": 1.9289013132452966, + "epoch": 0.5210952920853809, + "grad_norm": 7.783086776733398, + "learning_rate": 3.504575306483041e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8488955944776535, + "num_tokens": 202271227.0, + "step": 168100 + }, + { + "entropy": 1.9002173766493797, + "epoch": 0.5211262912104306, + "grad_norm": 8.410770416259766, + "learning_rate": 3.504471069725175e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8557520598173142, + "num_tokens": 202283576.0, + "step": 168110 + }, + { + "entropy": 1.9111146241426469, + "epoch": 0.5211572903354803, + "grad_norm": 4.1151580810546875, + "learning_rate": 3.504366842267712e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8459262296557426, + "num_tokens": 202295511.0, + "step": 168120 + }, + { + "entropy": 1.9295530125498772, + "epoch": 0.52118828946053, + "grad_norm": 7.352995872497559, + "learning_rate": 3.5042626241092702e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8493158027529717, + "num_tokens": 202307587.0, + "step": 168130 + }, + { + "entropy": 1.933849672973156, + "epoch": 0.5212192885855796, + "grad_norm": 7.733171463012695, + "learning_rate": 3.504158415248468e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8486435234546661, + "num_tokens": 202319619.0, + "step": 168140 + }, + { + "entropy": 1.9473506823182105, + "epoch": 0.5212502877106293, + "grad_norm": 7.936611652374268, + "learning_rate": 3.5040542156839207e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8542565524578094, + "num_tokens": 202331068.0, + "step": 168150 + }, + { + "entropy": 1.7915266767144202, + "epoch": 0.5212812868356791, + "grad_norm": 2.430027723312378, + "learning_rate": 3.503950025414247e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8742259576916694, + "num_tokens": 202343873.0, + "step": 168160 + }, + { + "entropy": 1.9285468205809593, + "epoch": 0.5213122859607288, + "grad_norm": 4.726843357086182, + "learning_rate": 3.5038458444380665e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8377237111330033, + "num_tokens": 202356321.0, + "step": 168170 + }, + { + "entropy": 1.9144100919365883, + "epoch": 0.5213432850857784, + "grad_norm": 7.406527996063232, + "learning_rate": 3.5037416727539957e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8542553022503853, + "num_tokens": 202368467.0, + "step": 168180 + }, + { + "entropy": 1.796132105588913, + "epoch": 0.5213742842108281, + "grad_norm": 9.365875244140625, + "learning_rate": 3.5036375103606553e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8590227887034416, + "num_tokens": 202381918.0, + "step": 168190 + }, + { + "entropy": 1.9180462673306464, + "epoch": 0.5214052833358779, + "grad_norm": 4.419243812561035, + "learning_rate": 3.5035333572566626e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8502243459224701, + "num_tokens": 202394433.0, + "step": 168200 + }, + { + "entropy": 1.9016762152314186, + "epoch": 0.5214362824609275, + "grad_norm": 7.407059669494629, + "learning_rate": 3.5034292134406377e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8573667585849762, + "num_tokens": 202406568.0, + "step": 168210 + }, + { + "entropy": 1.914240649342537, + "epoch": 0.5214672815859772, + "grad_norm": 8.344120025634766, + "learning_rate": 3.5033250789112005e-06, + "loss": 0.429, + "mean_token_accuracy": 0.858941039443016, + "num_tokens": 202418554.0, + "step": 168220 + }, + { + "entropy": 1.8893193066120149, + "epoch": 0.5214982807110269, + "grad_norm": 7.024362087249756, + "learning_rate": 3.503220953666971e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8614280000329018, + "num_tokens": 202430879.0, + "step": 168230 + }, + { + "entropy": 1.9234270766377448, + "epoch": 0.5215292798360767, + "grad_norm": 9.460162162780762, + "learning_rate": 3.503116837706569e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8518292471766472, + "num_tokens": 202442812.0, + "step": 168240 + }, + { + "entropy": 1.9079196915030479, + "epoch": 0.5215602789611263, + "grad_norm": 6.5405731201171875, + "learning_rate": 3.5030127310286148e-06, + "loss": 0.5293, + "mean_token_accuracy": 0.85546166151762, + "num_tokens": 202455059.0, + "step": 168250 + }, + { + "entropy": 1.9224894732236861, + "epoch": 0.521591278086176, + "grad_norm": 7.023353099822998, + "learning_rate": 3.5029086336317297e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.857777102291584, + "num_tokens": 202466597.0, + "step": 168260 + }, + { + "entropy": 1.8937045753002166, + "epoch": 0.5216222772112257, + "grad_norm": 7.842694282531738, + "learning_rate": 3.5028045455145344e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8521323055028915, + "num_tokens": 202478310.0, + "step": 168270 + }, + { + "entropy": 1.8532606914639473, + "epoch": 0.5216532763362755, + "grad_norm": 9.699804306030273, + "learning_rate": 3.5027004666756504e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.861707229912281, + "num_tokens": 202490868.0, + "step": 168280 + }, + { + "entropy": 1.836945144087076, + "epoch": 0.5216842754613251, + "grad_norm": 4.717660903930664, + "learning_rate": 3.5025963971136994e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.865605103969574, + "num_tokens": 202504329.0, + "step": 168290 + }, + { + "entropy": 1.827773043513298, + "epoch": 0.5217152745863748, + "grad_norm": 3.8117053508758545, + "learning_rate": 3.5024923368273035e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.866480652987957, + "num_tokens": 202516707.0, + "step": 168300 + }, + { + "entropy": 1.9818437069654464, + "epoch": 0.5217462737114245, + "grad_norm": 7.698195457458496, + "learning_rate": 3.502388285815085e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8499929904937744, + "num_tokens": 202527629.0, + "step": 168310 + }, + { + "entropy": 1.9486901313066483, + "epoch": 0.5217772728364742, + "grad_norm": 6.33158540725708, + "learning_rate": 3.5022842440756654e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8544744238257408, + "num_tokens": 202538355.0, + "step": 168320 + }, + { + "entropy": 1.8631914988160134, + "epoch": 0.5218082719615239, + "grad_norm": 4.584253787994385, + "learning_rate": 3.5021802116076686e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.851434463262558, + "num_tokens": 202551027.0, + "step": 168330 + }, + { + "entropy": 1.8589393228292466, + "epoch": 0.5218392710865736, + "grad_norm": 3.837559461593628, + "learning_rate": 3.5020761884097182e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8624763250350952, + "num_tokens": 202563762.0, + "step": 168340 + }, + { + "entropy": 1.8021437704563141, + "epoch": 0.5218702702116232, + "grad_norm": 4.51010799407959, + "learning_rate": 3.501972174480436e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8552407011389732, + "num_tokens": 202577418.0, + "step": 168350 + }, + { + "entropy": 1.771887867152691, + "epoch": 0.521901269336673, + "grad_norm": 7.540759086608887, + "learning_rate": 3.501868169818446e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8738851860165596, + "num_tokens": 202591115.0, + "step": 168360 + }, + { + "entropy": 1.945265081524849, + "epoch": 0.5219322684617227, + "grad_norm": 10.528114318847656, + "learning_rate": 3.501764174422373e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8545716404914856, + "num_tokens": 202602290.0, + "step": 168370 + }, + { + "entropy": 1.9452046200633049, + "epoch": 0.5219632675867724, + "grad_norm": 9.390413284301758, + "learning_rate": 3.50166018829084e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8406281679868698, + "num_tokens": 202614626.0, + "step": 168380 + }, + { + "entropy": 1.8217643454670907, + "epoch": 0.521994266711822, + "grad_norm": 4.555027484893799, + "learning_rate": 3.5015562114224727e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8638969480991363, + "num_tokens": 202627795.0, + "step": 168390 + }, + { + "entropy": 1.945632593333721, + "epoch": 0.5220252658368717, + "grad_norm": 9.390481948852539, + "learning_rate": 3.5014522438158964e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8453856199979782, + "num_tokens": 202638589.0, + "step": 168400 + }, + { + "entropy": 1.956186081469059, + "epoch": 0.5220562649619215, + "grad_norm": 7.714739799499512, + "learning_rate": 3.501348285469734e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.8455076336860656, + "num_tokens": 202649948.0, + "step": 168410 + }, + { + "entropy": 1.9386845543980598, + "epoch": 0.5220872640869711, + "grad_norm": 10.144988059997559, + "learning_rate": 3.501244336382612e-06, + "loss": 0.5306, + "mean_token_accuracy": 0.8470963105559349, + "num_tokens": 202661739.0, + "step": 168420 + }, + { + "entropy": 1.8727570980787278, + "epoch": 0.5221182632120208, + "grad_norm": 8.593897819519043, + "learning_rate": 3.5011403965531572e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8567142084240913, + "num_tokens": 202674217.0, + "step": 168430 + }, + { + "entropy": 1.944206291437149, + "epoch": 0.5221492623370705, + "grad_norm": 4.206070899963379, + "learning_rate": 3.501036465979994e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.849232442677021, + "num_tokens": 202685615.0, + "step": 168440 + }, + { + "entropy": 1.821098268032074, + "epoch": 0.5221802614621203, + "grad_norm": 7.023726940155029, + "learning_rate": 3.500932544661749e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8620294481515884, + "num_tokens": 202698481.0, + "step": 168450 + }, + { + "entropy": 1.9788533598184586, + "epoch": 0.5222112605871699, + "grad_norm": 7.938227653503418, + "learning_rate": 3.5008286325970486e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8501974329352379, + "num_tokens": 202709206.0, + "step": 168460 + }, + { + "entropy": 1.9976587742567062, + "epoch": 0.5222422597122196, + "grad_norm": 7.722753524780273, + "learning_rate": 3.50072472978452e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8504937782883644, + "num_tokens": 202720069.0, + "step": 168470 + }, + { + "entropy": 1.9856803625822068, + "epoch": 0.5222732588372693, + "grad_norm": 8.472858428955078, + "learning_rate": 3.5006208362227906e-06, + "loss": 0.5181, + "mean_token_accuracy": 0.8380648702383041, + "num_tokens": 202731035.0, + "step": 168480 + }, + { + "entropy": 1.9204457074403762, + "epoch": 0.522304257962319, + "grad_norm": 7.643885612487793, + "learning_rate": 3.5005169519104863e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8558697760105133, + "num_tokens": 202742481.0, + "step": 168490 + }, + { + "entropy": 1.973274603486061, + "epoch": 0.5223352570873687, + "grad_norm": 3.862253189086914, + "learning_rate": 3.5004130768462363e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8509830832481384, + "num_tokens": 202753483.0, + "step": 168500 + }, + { + "entropy": 1.8810513690114021, + "epoch": 0.5223662562124184, + "grad_norm": 7.860805034637451, + "learning_rate": 3.5003092110286685e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8568109959363938, + "num_tokens": 202765592.0, + "step": 168510 + }, + { + "entropy": 1.8250865265727043, + "epoch": 0.5223972553374681, + "grad_norm": 3.617983341217041, + "learning_rate": 3.50020535445641e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.86634581387043, + "num_tokens": 202778915.0, + "step": 168520 + }, + { + "entropy": 1.8575761586427688, + "epoch": 0.5224282544625178, + "grad_norm": 8.61739444732666, + "learning_rate": 3.50010150712809e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8585295125842094, + "num_tokens": 202790824.0, + "step": 168530 + }, + { + "entropy": 1.9157093957066536, + "epoch": 0.5224592535875675, + "grad_norm": 6.821208953857422, + "learning_rate": 3.4999976690423367e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8653114318847657, + "num_tokens": 202802693.0, + "step": 168540 + }, + { + "entropy": 1.9041334331035613, + "epoch": 0.5224902527126172, + "grad_norm": 4.114165306091309, + "learning_rate": 3.4998938401977808e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.860270942747593, + "num_tokens": 202814814.0, + "step": 168550 + }, + { + "entropy": 1.8540833964943886, + "epoch": 0.5225212518376668, + "grad_norm": 3.6030046939849854, + "learning_rate": 3.499790020593049e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8587525814771653, + "num_tokens": 202826750.0, + "step": 168560 + }, + { + "entropy": 1.801897644996643, + "epoch": 0.5225522509627166, + "grad_norm": 4.086511611938477, + "learning_rate": 3.499686210226774e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8603298366069794, + "num_tokens": 202839477.0, + "step": 168570 + }, + { + "entropy": 1.8381630688905717, + "epoch": 0.5225832500877663, + "grad_norm": 6.6824774742126465, + "learning_rate": 3.499582409097583e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8520712524652481, + "num_tokens": 202851991.0, + "step": 168580 + }, + { + "entropy": 1.9276944547891617, + "epoch": 0.522614249212816, + "grad_norm": 3.1579701900482178, + "learning_rate": 3.4994786172041074e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.85104900598526, + "num_tokens": 202862938.0, + "step": 168590 + }, + { + "entropy": 1.9088122382760049, + "epoch": 0.5226452483378656, + "grad_norm": 8.01588249206543, + "learning_rate": 3.4993748345449783e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8535411536693573, + "num_tokens": 202874862.0, + "step": 168600 + }, + { + "entropy": 1.9341975510120393, + "epoch": 0.5226762474629154, + "grad_norm": 7.971364498138428, + "learning_rate": 3.4992710611188265e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8459011435508728, + "num_tokens": 202885847.0, + "step": 168610 + }, + { + "entropy": 1.9188231587409974, + "epoch": 0.5227072465879651, + "grad_norm": 8.544054985046387, + "learning_rate": 3.4991672969242813e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8533058494329453, + "num_tokens": 202897038.0, + "step": 168620 + }, + { + "entropy": 1.865157674252987, + "epoch": 0.5227382457130147, + "grad_norm": 4.100846767425537, + "learning_rate": 3.499063541959976e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8643029734492302, + "num_tokens": 202908553.0, + "step": 168630 + }, + { + "entropy": 1.9419152081012725, + "epoch": 0.5227692448380644, + "grad_norm": 7.540561199188232, + "learning_rate": 3.4989597962245414e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8607556402683259, + "num_tokens": 202919848.0, + "step": 168640 + }, + { + "entropy": 1.7625506401062012, + "epoch": 0.5228002439631141, + "grad_norm": 7.734172821044922, + "learning_rate": 3.498856059716609e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8655755758285523, + "num_tokens": 202933201.0, + "step": 168650 + }, + { + "entropy": 1.7975231781601906, + "epoch": 0.5228312430881639, + "grad_norm": 9.36955451965332, + "learning_rate": 3.4987523324348114e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8536623597145081, + "num_tokens": 202946343.0, + "step": 168660 + }, + { + "entropy": 1.7663616985082626, + "epoch": 0.5228622422132135, + "grad_norm": 7.1532182693481445, + "learning_rate": 3.498648614377782e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8766553714871407, + "num_tokens": 202960093.0, + "step": 168670 + }, + { + "entropy": 1.9286183029413224, + "epoch": 0.5228932413382632, + "grad_norm": 8.140057563781738, + "learning_rate": 3.498544905544153e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8631218791007995, + "num_tokens": 202971106.0, + "step": 168680 + }, + { + "entropy": 1.8273029983043672, + "epoch": 0.5229242404633129, + "grad_norm": 8.231640815734863, + "learning_rate": 3.498441205932556e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8470514804124832, + "num_tokens": 202984430.0, + "step": 168690 + }, + { + "entropy": 1.9099699601531028, + "epoch": 0.5229552395883627, + "grad_norm": 10.600870132446289, + "learning_rate": 3.498337515541626e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.84662024974823, + "num_tokens": 202995888.0, + "step": 168700 + }, + { + "entropy": 1.8221244931221008, + "epoch": 0.5229862387134123, + "grad_norm": 7.553411483764648, + "learning_rate": 3.498233834369996e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8616412520408631, + "num_tokens": 203008576.0, + "step": 168710 + }, + { + "entropy": 1.8235291928052901, + "epoch": 0.523017237838462, + "grad_norm": 8.67770767211914, + "learning_rate": 3.498130162416301e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8563500612974166, + "num_tokens": 203022507.0, + "step": 168720 + }, + { + "entropy": 1.9123604744672775, + "epoch": 0.5230482369635117, + "grad_norm": 9.894606590270996, + "learning_rate": 3.4980264996791734e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8581787392497062, + "num_tokens": 203033907.0, + "step": 168730 + }, + { + "entropy": 1.9000810965895654, + "epoch": 0.5230792360885614, + "grad_norm": 3.7253689765930176, + "learning_rate": 3.4979228461572484e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8557541042566299, + "num_tokens": 203046455.0, + "step": 168740 + }, + { + "entropy": 1.876817238330841, + "epoch": 0.5231102352136111, + "grad_norm": 10.623446464538574, + "learning_rate": 3.4978192018491614e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8541541695594788, + "num_tokens": 203058507.0, + "step": 168750 + }, + { + "entropy": 1.9653200805187225, + "epoch": 0.5231412343386608, + "grad_norm": 14.3389253616333, + "learning_rate": 3.497715566753547e-06, + "loss": 0.525, + "mean_token_accuracy": 0.8415609449148178, + "num_tokens": 203069867.0, + "step": 168760 + }, + { + "entropy": 1.9515894502401352, + "epoch": 0.5231722334637104, + "grad_norm": 6.81016731262207, + "learning_rate": 3.497611940869041e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8508537262678146, + "num_tokens": 203080859.0, + "step": 168770 + }, + { + "entropy": 1.8536252856254578, + "epoch": 0.5232032325887602, + "grad_norm": 4.349778652191162, + "learning_rate": 3.497508324194277e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8588446915149689, + "num_tokens": 203093835.0, + "step": 168780 + }, + { + "entropy": 1.9472486570477485, + "epoch": 0.5232342317138099, + "grad_norm": 8.198267936706543, + "learning_rate": 3.497404716727893e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8478568762540817, + "num_tokens": 203105798.0, + "step": 168790 + }, + { + "entropy": 1.9527030482888221, + "epoch": 0.5232652308388596, + "grad_norm": 5.557439804077148, + "learning_rate": 3.4973011184685244e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8537330031394958, + "num_tokens": 203116698.0, + "step": 168800 + }, + { + "entropy": 1.939917927980423, + "epoch": 0.5232962299639092, + "grad_norm": 7.945644855499268, + "learning_rate": 3.4971975294148085e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8575639694929122, + "num_tokens": 203128225.0, + "step": 168810 + }, + { + "entropy": 1.9317618682980537, + "epoch": 0.523327229088959, + "grad_norm": 9.663101196289062, + "learning_rate": 3.4970939495653804e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8493759095668793, + "num_tokens": 203139468.0, + "step": 168820 + }, + { + "entropy": 1.884542666375637, + "epoch": 0.5233582282140087, + "grad_norm": 5.212667465209961, + "learning_rate": 3.4969903789188785e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8502926960587501, + "num_tokens": 203151240.0, + "step": 168830 + }, + { + "entropy": 1.8819777011871337, + "epoch": 0.5233892273390583, + "grad_norm": 7.3790106773376465, + "learning_rate": 3.49688681747394e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8593169033527375, + "num_tokens": 203162849.0, + "step": 168840 + }, + { + "entropy": 1.887918284535408, + "epoch": 0.523420226464108, + "grad_norm": 10.357288360595703, + "learning_rate": 3.4967832652292015e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8608911827206611, + "num_tokens": 203174283.0, + "step": 168850 + }, + { + "entropy": 1.9403936624526978, + "epoch": 0.5234512255891578, + "grad_norm": 8.486865043640137, + "learning_rate": 3.4966797221833016e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.8440033331513405, + "num_tokens": 203184723.0, + "step": 168860 + }, + { + "entropy": 1.8190939247608184, + "epoch": 0.5234822247142075, + "grad_norm": 2.843285083770752, + "learning_rate": 3.496576188334879e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8482326999306679, + "num_tokens": 203197121.0, + "step": 168870 + }, + { + "entropy": 1.964727732539177, + "epoch": 0.5235132238392571, + "grad_norm": 8.751008987426758, + "learning_rate": 3.49647266368257e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8521199285984039, + "num_tokens": 203208003.0, + "step": 168880 + }, + { + "entropy": 1.8414556071162225, + "epoch": 0.5235442229643068, + "grad_norm": 7.824958801269531, + "learning_rate": 3.496369148225016e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8654927790164948, + "num_tokens": 203220730.0, + "step": 168890 + }, + { + "entropy": 1.7450319975614548, + "epoch": 0.5235752220893565, + "grad_norm": 7.182183265686035, + "learning_rate": 3.4962656419608543e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8719310641288758, + "num_tokens": 203234537.0, + "step": 168900 + }, + { + "entropy": 1.9158180862665177, + "epoch": 0.5236062212144063, + "grad_norm": 3.739776611328125, + "learning_rate": 3.496162144888725e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8643902540206909, + "num_tokens": 203246093.0, + "step": 168910 + }, + { + "entropy": 1.863074316084385, + "epoch": 0.5236372203394559, + "grad_norm": 6.915127754211426, + "learning_rate": 3.4960586570072673e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8576465114951134, + "num_tokens": 203258124.0, + "step": 168920 + }, + { + "entropy": 1.9026149466633797, + "epoch": 0.5236682194645056, + "grad_norm": 6.958155155181885, + "learning_rate": 3.4959551783151207e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8604533329606057, + "num_tokens": 203270041.0, + "step": 168930 + }, + { + "entropy": 1.8807974457740784, + "epoch": 0.5236992185895553, + "grad_norm": 7.943653106689453, + "learning_rate": 3.495851708810926e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8648590832948685, + "num_tokens": 203281957.0, + "step": 168940 + }, + { + "entropy": 1.8590452671051025, + "epoch": 0.523730217714605, + "grad_norm": 9.260481834411621, + "learning_rate": 3.495748248493323e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8521923378109932, + "num_tokens": 203293743.0, + "step": 168950 + }, + { + "entropy": 1.8895818576216699, + "epoch": 0.5237612168396547, + "grad_norm": 8.63992691040039, + "learning_rate": 3.495644797360953e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8420132413506508, + "num_tokens": 203305487.0, + "step": 168960 + }, + { + "entropy": 1.8291345238685608, + "epoch": 0.5237922159647044, + "grad_norm": 4.117769241333008, + "learning_rate": 3.4955413554124568e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8555587381124496, + "num_tokens": 203318682.0, + "step": 168970 + }, + { + "entropy": 1.8397614166140557, + "epoch": 0.523823215089754, + "grad_norm": 3.8362808227539062, + "learning_rate": 3.495437922646475e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8543219909071922, + "num_tokens": 203331200.0, + "step": 168980 + }, + { + "entropy": 1.8733160473406314, + "epoch": 0.5238542142148038, + "grad_norm": 7.672023296356201, + "learning_rate": 3.49533449906165e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8624173834919929, + "num_tokens": 203342773.0, + "step": 168990 + }, + { + "entropy": 1.8732567384839058, + "epoch": 0.5238852133398535, + "grad_norm": 9.277653694152832, + "learning_rate": 3.495231084656623e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.861282941699028, + "num_tokens": 203354610.0, + "step": 169000 + }, + { + "entropy": 1.810496024042368, + "epoch": 0.5239162124649032, + "grad_norm": 8.090917587280273, + "learning_rate": 3.4951276794300364e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8605089306831359, + "num_tokens": 203366465.0, + "step": 169010 + }, + { + "entropy": 1.8707633331418037, + "epoch": 0.5239472115899528, + "grad_norm": 8.256461143493652, + "learning_rate": 3.495024283380533e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.857209199666977, + "num_tokens": 203378502.0, + "step": 169020 + }, + { + "entropy": 1.8902933821082115, + "epoch": 0.5239782107150026, + "grad_norm": 10.598012924194336, + "learning_rate": 3.494920896506754e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8510710790753364, + "num_tokens": 203390100.0, + "step": 169030 + }, + { + "entropy": 1.8810963049530982, + "epoch": 0.5240092098400523, + "grad_norm": 3.4652762413024902, + "learning_rate": 3.494817518807344e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8639438837766648, + "num_tokens": 203401188.0, + "step": 169040 + }, + { + "entropy": 1.9366990119218825, + "epoch": 0.524040208965102, + "grad_norm": 7.726593971252441, + "learning_rate": 3.4947141502809452e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8416985169053077, + "num_tokens": 203411783.0, + "step": 169050 + }, + { + "entropy": 1.9235543400049209, + "epoch": 0.5240712080901516, + "grad_norm": 9.40511417388916, + "learning_rate": 3.4946107909262012e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8531079888343811, + "num_tokens": 203422994.0, + "step": 169060 + }, + { + "entropy": 1.9281636044383048, + "epoch": 0.5241022072152014, + "grad_norm": 8.211299896240234, + "learning_rate": 3.4945074407417565e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8453192666172982, + "num_tokens": 203435055.0, + "step": 169070 + }, + { + "entropy": 1.8821644067764283, + "epoch": 0.5241332063402511, + "grad_norm": 9.113029479980469, + "learning_rate": 3.494404099726255e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8508746236562729, + "num_tokens": 203447495.0, + "step": 169080 + }, + { + "entropy": 1.9432899564504624, + "epoch": 0.5241642054653007, + "grad_norm": 8.647567749023438, + "learning_rate": 3.4943007678783398e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8540888220071793, + "num_tokens": 203458347.0, + "step": 169090 + }, + { + "entropy": 1.9115382328629493, + "epoch": 0.5241952045903504, + "grad_norm": 8.112555503845215, + "learning_rate": 3.4941974451966564e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.851653291285038, + "num_tokens": 203469957.0, + "step": 169100 + }, + { + "entropy": 1.9376183688640594, + "epoch": 0.5242262037154002, + "grad_norm": 9.164286613464355, + "learning_rate": 3.49409413167985e-06, + "loss": 0.5251, + "mean_token_accuracy": 0.8397494927048683, + "num_tokens": 203481549.0, + "step": 169110 + }, + { + "entropy": 1.9066183164715766, + "epoch": 0.5242572028404499, + "grad_norm": 11.092423439025879, + "learning_rate": 3.4939908273265666e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8511956304311752, + "num_tokens": 203494139.0, + "step": 169120 + }, + { + "entropy": 1.8191746473312378, + "epoch": 0.5242882019654995, + "grad_norm": 7.425056457519531, + "learning_rate": 3.4938875321354493e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8660463154315948, + "num_tokens": 203507676.0, + "step": 169130 + }, + { + "entropy": 1.8262469589710235, + "epoch": 0.5243192010905492, + "grad_norm": 3.7181785106658936, + "learning_rate": 3.4937842461051453e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.859797814488411, + "num_tokens": 203521237.0, + "step": 169140 + }, + { + "entropy": 1.9659714698791504, + "epoch": 0.5243502002155989, + "grad_norm": 7.551681995391846, + "learning_rate": 3.4936809692343003e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8544010862708091, + "num_tokens": 203532795.0, + "step": 169150 + }, + { + "entropy": 1.8716353714466094, + "epoch": 0.5243811993406486, + "grad_norm": 7.684150218963623, + "learning_rate": 3.493577701521561e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8697442948818207, + "num_tokens": 203545242.0, + "step": 169160 + }, + { + "entropy": 1.956957994401455, + "epoch": 0.5244121984656983, + "grad_norm": 8.428984642028809, + "learning_rate": 3.493474442965574e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.844064648449421, + "num_tokens": 203556959.0, + "step": 169170 + }, + { + "entropy": 1.8892595663666725, + "epoch": 0.524443197590748, + "grad_norm": 9.209391593933105, + "learning_rate": 3.4933711935649857e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8504563570022583, + "num_tokens": 203568917.0, + "step": 169180 + }, + { + "entropy": 1.8977754607796669, + "epoch": 0.5244741967157976, + "grad_norm": 4.250955581665039, + "learning_rate": 3.493267953318443e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8544679388403893, + "num_tokens": 203580698.0, + "step": 169190 + }, + { + "entropy": 1.966796690225601, + "epoch": 0.5245051958408474, + "grad_norm": 8.080552101135254, + "learning_rate": 3.493164722224594e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.857042309641838, + "num_tokens": 203592023.0, + "step": 169200 + }, + { + "entropy": 1.9782009929418565, + "epoch": 0.5245361949658971, + "grad_norm": 10.818819999694824, + "learning_rate": 3.4930615002820863e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8517513841390609, + "num_tokens": 203603354.0, + "step": 169210 + }, + { + "entropy": 1.9704937011003494, + "epoch": 0.5245671940909468, + "grad_norm": 9.936196327209473, + "learning_rate": 3.492958287489568e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8553407356142998, + "num_tokens": 203614187.0, + "step": 169220 + }, + { + "entropy": 1.8049380242824555, + "epoch": 0.5245981932159964, + "grad_norm": 2.656586170196533, + "learning_rate": 3.492855083845687e-06, + "loss": 0.441, + "mean_token_accuracy": 0.855417400598526, + "num_tokens": 203626946.0, + "step": 169230 + }, + { + "entropy": 1.8623328655958176, + "epoch": 0.5246291923410462, + "grad_norm": 8.69382095336914, + "learning_rate": 3.492751889349091e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8532855972647667, + "num_tokens": 203639192.0, + "step": 169240 + }, + { + "entropy": 1.9222152277827262, + "epoch": 0.5246601914660959, + "grad_norm": 9.066729545593262, + "learning_rate": 3.4926487039984308e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8545737609267234, + "num_tokens": 203650676.0, + "step": 169250 + }, + { + "entropy": 1.929406814277172, + "epoch": 0.5246911905911456, + "grad_norm": 8.320404052734375, + "learning_rate": 3.492545527792354e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8549767971038819, + "num_tokens": 203662180.0, + "step": 169260 + }, + { + "entropy": 1.9012570947408676, + "epoch": 0.5247221897161952, + "grad_norm": 7.0718536376953125, + "learning_rate": 3.4924423607295107e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8597986832261085, + "num_tokens": 203673564.0, + "step": 169270 + }, + { + "entropy": 1.7952979236841202, + "epoch": 0.524753188841245, + "grad_norm": 2.635957717895508, + "learning_rate": 3.4923392028085507e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8655047237873077, + "num_tokens": 203686834.0, + "step": 169280 + }, + { + "entropy": 1.9100728452205658, + "epoch": 0.5247841879662947, + "grad_norm": 3.5995821952819824, + "learning_rate": 3.492236054028123e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8441936239600182, + "num_tokens": 203698055.0, + "step": 169290 + }, + { + "entropy": 1.900466850399971, + "epoch": 0.5248151870913443, + "grad_norm": 3.480902910232544, + "learning_rate": 3.4921329143868787e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.850808109343052, + "num_tokens": 203710441.0, + "step": 169300 + }, + { + "entropy": 1.8833848714828492, + "epoch": 0.524846186216394, + "grad_norm": 8.502030372619629, + "learning_rate": 3.4920297838834676e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8584718599915504, + "num_tokens": 203722127.0, + "step": 169310 + }, + { + "entropy": 1.855775459110737, + "epoch": 0.5248771853414438, + "grad_norm": 7.603749752044678, + "learning_rate": 3.491926662516541e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8616424456238747, + "num_tokens": 203735547.0, + "step": 169320 + }, + { + "entropy": 1.9335170581936836, + "epoch": 0.5249081844664935, + "grad_norm": 9.550626754760742, + "learning_rate": 3.4918235502847503e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8472717061638833, + "num_tokens": 203746751.0, + "step": 169330 + }, + { + "entropy": 1.8978175684809684, + "epoch": 0.5249391835915431, + "grad_norm": 8.762434959411621, + "learning_rate": 3.4917204471867455e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8437378078699111, + "num_tokens": 203758587.0, + "step": 169340 + }, + { + "entropy": 1.8782646000385284, + "epoch": 0.5249701827165928, + "grad_norm": 7.3637871742248535, + "learning_rate": 3.4916173532211793e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8627133265137672, + "num_tokens": 203770143.0, + "step": 169350 + }, + { + "entropy": 1.9008455261588098, + "epoch": 0.5250011818416426, + "grad_norm": 9.275456428527832, + "learning_rate": 3.491514268386703e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8553502589464188, + "num_tokens": 203782295.0, + "step": 169360 + }, + { + "entropy": 1.922987850010395, + "epoch": 0.5250321809666922, + "grad_norm": 8.815250396728516, + "learning_rate": 3.49141119268197e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8504637837409973, + "num_tokens": 203793616.0, + "step": 169370 + }, + { + "entropy": 1.9362686663866042, + "epoch": 0.5250631800917419, + "grad_norm": 7.780436992645264, + "learning_rate": 3.4913081261056313e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8581616476178169, + "num_tokens": 203805629.0, + "step": 169380 + }, + { + "entropy": 1.8906577423214912, + "epoch": 0.5250941792167916, + "grad_norm": 7.460948944091797, + "learning_rate": 3.4912050686563403e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8532565832138062, + "num_tokens": 203817628.0, + "step": 169390 + }, + { + "entropy": 1.8528029769659042, + "epoch": 0.5251251783418412, + "grad_norm": 7.081979751586914, + "learning_rate": 3.49110202033275e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.857559771835804, + "num_tokens": 203829983.0, + "step": 169400 + }, + { + "entropy": 1.8572653010487556, + "epoch": 0.525156177466891, + "grad_norm": 8.674805641174316, + "learning_rate": 3.4909989811335133e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.865797932446003, + "num_tokens": 203842180.0, + "step": 169410 + }, + { + "entropy": 1.9635521739721298, + "epoch": 0.5251871765919407, + "grad_norm": 7.031563758850098, + "learning_rate": 3.490895951057284e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8588854551315308, + "num_tokens": 203853240.0, + "step": 169420 + }, + { + "entropy": 1.8900843843817712, + "epoch": 0.5252181757169904, + "grad_norm": 9.572572708129883, + "learning_rate": 3.4907929301027164e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8617082446813583, + "num_tokens": 203865022.0, + "step": 169430 + }, + { + "entropy": 1.831702572107315, + "epoch": 0.52524917484204, + "grad_norm": 9.38916015625, + "learning_rate": 3.4906899182684645e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8631092309951782, + "num_tokens": 203876987.0, + "step": 169440 + }, + { + "entropy": 1.8782099664211274, + "epoch": 0.5252801739670898, + "grad_norm": 8.228728294372559, + "learning_rate": 3.4905869155531813e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8564833179116249, + "num_tokens": 203888591.0, + "step": 169450 + }, + { + "entropy": 1.8731250807642936, + "epoch": 0.5253111730921395, + "grad_norm": 8.023486137390137, + "learning_rate": 3.490483921955523e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8530664145946503, + "num_tokens": 203900766.0, + "step": 169460 + }, + { + "entropy": 1.8310665681958198, + "epoch": 0.5253421722171892, + "grad_norm": 7.332006931304932, + "learning_rate": 3.4903809374741443e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8547357425093651, + "num_tokens": 203913127.0, + "step": 169470 + }, + { + "entropy": 1.9228072851896285, + "epoch": 0.5253731713422388, + "grad_norm": 7.0422282218933105, + "learning_rate": 3.4902779621077004e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8502156108617782, + "num_tokens": 203924369.0, + "step": 169480 + }, + { + "entropy": 1.8755038015544414, + "epoch": 0.5254041704672886, + "grad_norm": 3.896348237991333, + "learning_rate": 3.4901749958548465e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8643324583768844, + "num_tokens": 203937278.0, + "step": 169490 + }, + { + "entropy": 1.8818801745772362, + "epoch": 0.5254351695923383, + "grad_norm": 8.24651050567627, + "learning_rate": 3.4900720387142383e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8514017388224602, + "num_tokens": 203948661.0, + "step": 169500 + }, + { + "entropy": 1.9414301648736, + "epoch": 0.5254661687173879, + "grad_norm": 9.171976089477539, + "learning_rate": 3.4899690906845326e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8593140855431557, + "num_tokens": 203959539.0, + "step": 169510 + }, + { + "entropy": 1.9445295572280883, + "epoch": 0.5254971678424376, + "grad_norm": 8.482927322387695, + "learning_rate": 3.4898661517643845e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8427321448922157, + "num_tokens": 203970761.0, + "step": 169520 + }, + { + "entropy": 1.8659766510128974, + "epoch": 0.5255281669674874, + "grad_norm": 8.594217300415039, + "learning_rate": 3.489763221952451e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8495791599154472, + "num_tokens": 203982341.0, + "step": 169530 + }, + { + "entropy": 1.9512912288308144, + "epoch": 0.5255591660925371, + "grad_norm": 8.930543899536133, + "learning_rate": 3.4896603012473913e-06, + "loss": 0.5258, + "mean_token_accuracy": 0.8444173559546471, + "num_tokens": 203993560.0, + "step": 169540 + }, + { + "entropy": 1.9422188624739647, + "epoch": 0.5255901652175867, + "grad_norm": 8.387059211730957, + "learning_rate": 3.4895573896478594e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8544582590460778, + "num_tokens": 204004877.0, + "step": 169550 + }, + { + "entropy": 2.0166870802640915, + "epoch": 0.5256211643426364, + "grad_norm": 9.012490272521973, + "learning_rate": 3.4894544871525138e-06, + "loss": 0.5375, + "mean_token_accuracy": 0.8376458153128624, + "num_tokens": 204015926.0, + "step": 169560 + }, + { + "entropy": 1.9142573595046997, + "epoch": 0.5256521634676862, + "grad_norm": 3.9384617805480957, + "learning_rate": 3.489351593760012e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8592307507991791, + "num_tokens": 204027720.0, + "step": 169570 + }, + { + "entropy": 1.9269609957933427, + "epoch": 0.5256831625927358, + "grad_norm": 8.374215126037598, + "learning_rate": 3.489248709469013e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8519828408956528, + "num_tokens": 204039091.0, + "step": 169580 + }, + { + "entropy": 1.8991817444562913, + "epoch": 0.5257141617177855, + "grad_norm": 6.979922771453857, + "learning_rate": 3.489145834278175e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8537618055939674, + "num_tokens": 204051145.0, + "step": 169590 + }, + { + "entropy": 1.9464630469679833, + "epoch": 0.5257451608428352, + "grad_norm": 3.8475420475006104, + "learning_rate": 3.489042968186156e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8544916957616806, + "num_tokens": 204062107.0, + "step": 169600 + }, + { + "entropy": 1.8664928540587424, + "epoch": 0.525776159967885, + "grad_norm": 5.392734527587891, + "learning_rate": 3.4889401111916138e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8774384975433349, + "num_tokens": 204074381.0, + "step": 169610 + }, + { + "entropy": 1.9056679099798202, + "epoch": 0.5258071590929346, + "grad_norm": 8.054254531860352, + "learning_rate": 3.4888372632932095e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8502617850899696, + "num_tokens": 204086077.0, + "step": 169620 + }, + { + "entropy": 1.8804067566990852, + "epoch": 0.5258381582179843, + "grad_norm": 9.382166862487793, + "learning_rate": 3.4887344244896004e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8627091318368911, + "num_tokens": 204097655.0, + "step": 169630 + }, + { + "entropy": 1.9133615478873254, + "epoch": 0.525869157343034, + "grad_norm": 3.766981363296509, + "learning_rate": 3.488631594779449e-06, + "loss": 0.5307, + "mean_token_accuracy": 0.8391897663474083, + "num_tokens": 204109163.0, + "step": 169640 + }, + { + "entropy": 1.8695868149399757, + "epoch": 0.5259001564680836, + "grad_norm": 5.005122184753418, + "learning_rate": 3.488528774161413e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8512502536177635, + "num_tokens": 204121920.0, + "step": 169650 + }, + { + "entropy": 1.9205554395914077, + "epoch": 0.5259311555931334, + "grad_norm": 7.43350076675415, + "learning_rate": 3.4884259626341523e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8556781709194183, + "num_tokens": 204133982.0, + "step": 169660 + }, + { + "entropy": 1.9717912420630455, + "epoch": 0.5259621547181831, + "grad_norm": 8.707015037536621, + "learning_rate": 3.488323160196329e-06, + "loss": 0.517, + "mean_token_accuracy": 0.8443710133433342, + "num_tokens": 204145583.0, + "step": 169670 + }, + { + "entropy": 1.8160235270857812, + "epoch": 0.5259931538432328, + "grad_norm": 4.329787254333496, + "learning_rate": 3.488220366846603e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8737678155303001, + "num_tokens": 204158620.0, + "step": 169680 + }, + { + "entropy": 1.8438635244965553, + "epoch": 0.5260241529682824, + "grad_norm": 6.801762580871582, + "learning_rate": 3.488117582583636e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8483135282993317, + "num_tokens": 204172257.0, + "step": 169690 + }, + { + "entropy": 1.883940464258194, + "epoch": 0.5260551520933322, + "grad_norm": 7.7768120765686035, + "learning_rate": 3.4880148074060883e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8663879573345185, + "num_tokens": 204185159.0, + "step": 169700 + }, + { + "entropy": 1.8030682742595672, + "epoch": 0.5260861512183819, + "grad_norm": 4.2130584716796875, + "learning_rate": 3.4879120413126217e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8646674558520318, + "num_tokens": 204198490.0, + "step": 169710 + }, + { + "entropy": 1.9641460493206977, + "epoch": 0.5261171503434315, + "grad_norm": 7.377951622009277, + "learning_rate": 3.487809284301899e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8550163015723229, + "num_tokens": 204209459.0, + "step": 169720 + }, + { + "entropy": 1.9644610643386842, + "epoch": 0.5261481494684812, + "grad_norm": 8.131065368652344, + "learning_rate": 3.487706536372582e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8526425957679749, + "num_tokens": 204220685.0, + "step": 169730 + }, + { + "entropy": 1.8255484610795976, + "epoch": 0.526179148593531, + "grad_norm": 2.616626739501953, + "learning_rate": 3.4876037975233325e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.867869146168232, + "num_tokens": 204233514.0, + "step": 169740 + }, + { + "entropy": 1.9307437628507613, + "epoch": 0.5262101477185807, + "grad_norm": 8.157958984375, + "learning_rate": 3.487501067752813e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8660890907049179, + "num_tokens": 204244621.0, + "step": 169750 + }, + { + "entropy": 1.923675388097763, + "epoch": 0.5262411468436303, + "grad_norm": 9.466991424560547, + "learning_rate": 3.4873983470596878e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8543407127261162, + "num_tokens": 204256714.0, + "step": 169760 + }, + { + "entropy": 1.9599072575569152, + "epoch": 0.52627214596868, + "grad_norm": 8.29330062866211, + "learning_rate": 3.487295635442619e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8472807705402374, + "num_tokens": 204267567.0, + "step": 169770 + }, + { + "entropy": 1.924948987364769, + "epoch": 0.5263031450937298, + "grad_norm": 7.503832817077637, + "learning_rate": 3.4871929329002707e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.850759707391262, + "num_tokens": 204279403.0, + "step": 169780 + }, + { + "entropy": 1.9135427996516228, + "epoch": 0.5263341442187794, + "grad_norm": 7.390564918518066, + "learning_rate": 3.487090239431306e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8542888939380646, + "num_tokens": 204290947.0, + "step": 169790 + }, + { + "entropy": 1.8421156138181687, + "epoch": 0.5263651433438291, + "grad_norm": 3.895979642868042, + "learning_rate": 3.4869875550343913e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8799378126859665, + "num_tokens": 204304029.0, + "step": 169800 + }, + { + "entropy": 1.8601937487721443, + "epoch": 0.5263961424688788, + "grad_norm": 8.522631645202637, + "learning_rate": 3.4868848797081872e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8664719671010971, + "num_tokens": 204315905.0, + "step": 169810 + }, + { + "entropy": 1.9008443534374238, + "epoch": 0.5264271415939286, + "grad_norm": 8.575570106506348, + "learning_rate": 3.4867822134513614e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8516224265098572, + "num_tokens": 204327709.0, + "step": 169820 + }, + { + "entropy": 1.8827106207609177, + "epoch": 0.5264581407189782, + "grad_norm": 7.430328369140625, + "learning_rate": 3.486679556262577e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8505603596568108, + "num_tokens": 204339706.0, + "step": 169830 + }, + { + "entropy": 1.9500410586595536, + "epoch": 0.5264891398440279, + "grad_norm": 8.332409858703613, + "learning_rate": 3.4865769081405006e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8455384686589241, + "num_tokens": 204351742.0, + "step": 169840 + }, + { + "entropy": 1.9368303149938584, + "epoch": 0.5265201389690776, + "grad_norm": 7.815258979797363, + "learning_rate": 3.4864742690837967e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8535314634442329, + "num_tokens": 204363119.0, + "step": 169850 + }, + { + "entropy": 1.8521654710173607, + "epoch": 0.5265511380941273, + "grad_norm": 9.0866117477417, + "learning_rate": 3.4863716390911313e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8530202284455299, + "num_tokens": 204374754.0, + "step": 169860 + }, + { + "entropy": 1.89563407599926, + "epoch": 0.526582137219177, + "grad_norm": 7.82810640335083, + "learning_rate": 3.4862690181611697e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8638562858104706, + "num_tokens": 204386610.0, + "step": 169870 + }, + { + "entropy": 1.873375302553177, + "epoch": 0.5266131363442267, + "grad_norm": 8.360272407531738, + "learning_rate": 3.4861664062925797e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8664838880300522, + "num_tokens": 204398916.0, + "step": 169880 + }, + { + "entropy": 1.9053056687116623, + "epoch": 0.5266441354692764, + "grad_norm": 7.953768730163574, + "learning_rate": 3.486063803484026e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8378211975097656, + "num_tokens": 204410801.0, + "step": 169890 + }, + { + "entropy": 2.0088341891765595, + "epoch": 0.526675134594326, + "grad_norm": 8.043516159057617, + "learning_rate": 3.4859612097341776e-06, + "loss": 0.5323, + "mean_token_accuracy": 0.8447351723909378, + "num_tokens": 204421350.0, + "step": 169900 + }, + { + "entropy": 1.8681996390223503, + "epoch": 0.5267061337193758, + "grad_norm": 7.973627090454102, + "learning_rate": 3.4858586250417004e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8473279893398284, + "num_tokens": 204434186.0, + "step": 169910 + }, + { + "entropy": 1.8609192743897438, + "epoch": 0.5267371328444255, + "grad_norm": 7.692805290222168, + "learning_rate": 3.485756049405261e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8609997630119324, + "num_tokens": 204446234.0, + "step": 169920 + }, + { + "entropy": 1.920175838470459, + "epoch": 0.5267681319694751, + "grad_norm": 8.081625938415527, + "learning_rate": 3.485653482823528e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8617697909474373, + "num_tokens": 204457418.0, + "step": 169930 + }, + { + "entropy": 1.9051961615681647, + "epoch": 0.5267991310945248, + "grad_norm": 8.327408790588379, + "learning_rate": 3.48555092529517e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8548000529408455, + "num_tokens": 204468969.0, + "step": 169940 + }, + { + "entropy": 1.9939599066972733, + "epoch": 0.5268301302195746, + "grad_norm": 7.189398288726807, + "learning_rate": 3.485448376818854e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8490493327379227, + "num_tokens": 204480095.0, + "step": 169950 + }, + { + "entropy": 1.883024525642395, + "epoch": 0.5268611293446243, + "grad_norm": 7.902504920959473, + "learning_rate": 3.4853458373932486e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8574581429362297, + "num_tokens": 204492166.0, + "step": 169960 + }, + { + "entropy": 1.8119531005620957, + "epoch": 0.5268921284696739, + "grad_norm": 8.166264533996582, + "learning_rate": 3.4852433070170234e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8615056142210961, + "num_tokens": 204505151.0, + "step": 169970 + }, + { + "entropy": 1.8452864043414592, + "epoch": 0.5269231275947236, + "grad_norm": 7.124696254730225, + "learning_rate": 3.4851407856888465e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8681139811873436, + "num_tokens": 204517998.0, + "step": 169980 + }, + { + "entropy": 1.9364143311977386, + "epoch": 0.5269541267197734, + "grad_norm": 7.2239813804626465, + "learning_rate": 3.485038273407387e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8534222990274429, + "num_tokens": 204529222.0, + "step": 169990 + }, + { + "entropy": 1.9091769382357597, + "epoch": 0.526985125844823, + "grad_norm": 7.802124977111816, + "learning_rate": 3.4849357701713165e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8642896711826324, + "num_tokens": 204540170.0, + "step": 170000 + }, + { + "entropy": 1.8714770466089248, + "epoch": 0.5270161249698727, + "grad_norm": 9.276057243347168, + "learning_rate": 3.484833275979302e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8525351405143737, + "num_tokens": 204552616.0, + "step": 170010 + }, + { + "entropy": 1.857717652618885, + "epoch": 0.5270471240949224, + "grad_norm": 8.349593162536621, + "learning_rate": 3.4847307908300156e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8563280582427979, + "num_tokens": 204565267.0, + "step": 170020 + }, + { + "entropy": 1.7349850058555603, + "epoch": 0.5270781232199722, + "grad_norm": 7.835326194763184, + "learning_rate": 3.484628314722127e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.871354128420353, + "num_tokens": 204579353.0, + "step": 170030 + }, + { + "entropy": 1.8870081350207328, + "epoch": 0.5271091223450218, + "grad_norm": 9.670953750610352, + "learning_rate": 3.484525847654307e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8451803371310234, + "num_tokens": 204592415.0, + "step": 170040 + }, + { + "entropy": 1.8382927820086479, + "epoch": 0.5271401214700715, + "grad_norm": 8.578036308288574, + "learning_rate": 3.4844233896252264e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8661419838666916, + "num_tokens": 204605425.0, + "step": 170050 + }, + { + "entropy": 1.969618821144104, + "epoch": 0.5271711205951212, + "grad_norm": 7.408189296722412, + "learning_rate": 3.484320940633557e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8454100415110588, + "num_tokens": 204616440.0, + "step": 170060 + }, + { + "entropy": 1.9854531973600387, + "epoch": 0.527202119720171, + "grad_norm": 8.708170890808105, + "learning_rate": 3.484218500677969e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.8439430460333824, + "num_tokens": 204627483.0, + "step": 170070 + }, + { + "entropy": 1.8998574405908584, + "epoch": 0.5272331188452206, + "grad_norm": 8.767289161682129, + "learning_rate": 3.4841160697571356e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8521378070116044, + "num_tokens": 204639576.0, + "step": 170080 + }, + { + "entropy": 1.7341085240244865, + "epoch": 0.5272641179702703, + "grad_norm": 2.5850956439971924, + "learning_rate": 3.484013647869728e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8609752714633941, + "num_tokens": 204654034.0, + "step": 170090 + }, + { + "entropy": 1.897045449912548, + "epoch": 0.52729511709532, + "grad_norm": 8.623618125915527, + "learning_rate": 3.483911235014418e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8511906012892723, + "num_tokens": 204665912.0, + "step": 170100 + }, + { + "entropy": 1.9212136805057525, + "epoch": 0.5273261162203697, + "grad_norm": 9.318014144897461, + "learning_rate": 3.4838088311898806e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8414152905344963, + "num_tokens": 204677184.0, + "step": 170110 + }, + { + "entropy": 1.8723152995109558, + "epoch": 0.5273571153454194, + "grad_norm": 3.7983345985412598, + "learning_rate": 3.483706436394786e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8477897018194198, + "num_tokens": 204689567.0, + "step": 170120 + }, + { + "entropy": 1.8677892670035363, + "epoch": 0.5273881144704691, + "grad_norm": 4.092134952545166, + "learning_rate": 3.4836040506278078e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8498900100588799, + "num_tokens": 204701815.0, + "step": 170130 + }, + { + "entropy": 1.9073769941926002, + "epoch": 0.5274191135955187, + "grad_norm": 7.181079387664795, + "learning_rate": 3.4835016738876204e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8678811907768249, + "num_tokens": 204712717.0, + "step": 170140 + }, + { + "entropy": 1.9618832275271416, + "epoch": 0.5274501127205684, + "grad_norm": 6.790325164794922, + "learning_rate": 3.483399306172897e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8506255120038986, + "num_tokens": 204724239.0, + "step": 170150 + }, + { + "entropy": 1.8595173746347426, + "epoch": 0.5274811118456182, + "grad_norm": 7.266312599182129, + "learning_rate": 3.483296947482312e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8559166491031647, + "num_tokens": 204736308.0, + "step": 170160 + }, + { + "entropy": 1.9057711243629456, + "epoch": 0.5275121109706679, + "grad_norm": 8.606921195983887, + "learning_rate": 3.4831945978145384e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8584994554519654, + "num_tokens": 204747595.0, + "step": 170170 + }, + { + "entropy": 1.9293906196951867, + "epoch": 0.5275431100957175, + "grad_norm": 6.8501386642456055, + "learning_rate": 3.483092257168251e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8433649852871895, + "num_tokens": 204758718.0, + "step": 170180 + }, + { + "entropy": 1.8679754436016083, + "epoch": 0.5275741092207672, + "grad_norm": 9.489760398864746, + "learning_rate": 3.4829899255421258e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8598893627524375, + "num_tokens": 204770993.0, + "step": 170190 + }, + { + "entropy": 1.8623951964080334, + "epoch": 0.527605108345817, + "grad_norm": 3.54229998588562, + "learning_rate": 3.4828876029348362e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.860218308866024, + "num_tokens": 204784022.0, + "step": 170200 + }, + { + "entropy": 1.8672780320048332, + "epoch": 0.5276361074708666, + "grad_norm": 8.450098991394043, + "learning_rate": 3.482785289345059e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8674737766385079, + "num_tokens": 204795531.0, + "step": 170210 + }, + { + "entropy": 1.820541125535965, + "epoch": 0.5276671065959163, + "grad_norm": 4.2782416343688965, + "learning_rate": 3.4826829847714694e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8648440659046173, + "num_tokens": 204808516.0, + "step": 170220 + }, + { + "entropy": 1.9438726305961609, + "epoch": 0.527698105720966, + "grad_norm": 8.147907257080078, + "learning_rate": 3.482580689212742e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.842696090042591, + "num_tokens": 204819677.0, + "step": 170230 + }, + { + "entropy": 2.027736449241638, + "epoch": 0.5277291048460158, + "grad_norm": 8.2122802734375, + "learning_rate": 3.482478402667555e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.8460215464234352, + "num_tokens": 204830585.0, + "step": 170240 + }, + { + "entropy": 1.800948679447174, + "epoch": 0.5277601039710654, + "grad_norm": 3.8614985942840576, + "learning_rate": 3.4823761251345834e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8599292874336243, + "num_tokens": 204843549.0, + "step": 170250 + }, + { + "entropy": 1.8939184978604318, + "epoch": 0.5277911030961151, + "grad_norm": 7.87742805480957, + "learning_rate": 3.482273856612504e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8504303067922592, + "num_tokens": 204855126.0, + "step": 170260 + }, + { + "entropy": 1.983650615811348, + "epoch": 0.5278221022211648, + "grad_norm": 10.45950984954834, + "learning_rate": 3.482171597099994e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8400062531232834, + "num_tokens": 204866558.0, + "step": 170270 + }, + { + "entropy": 1.8704144909977913, + "epoch": 0.5278531013462145, + "grad_norm": 8.008955955505371, + "learning_rate": 3.48206934659573e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8669389069080353, + "num_tokens": 204878894.0, + "step": 170280 + }, + { + "entropy": 1.9509722471237183, + "epoch": 0.5278841004712642, + "grad_norm": 9.869590759277344, + "learning_rate": 3.4819671050983904e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.8404587954282761, + "num_tokens": 204889942.0, + "step": 170290 + }, + { + "entropy": 1.8514474794268607, + "epoch": 0.5279150995963139, + "grad_norm": 6.360729694366455, + "learning_rate": 3.4818648726066523e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8630643576383591, + "num_tokens": 204902937.0, + "step": 170300 + }, + { + "entropy": 1.9012500569224358, + "epoch": 0.5279460987213636, + "grad_norm": 4.0904059410095215, + "learning_rate": 3.4817626491191943e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8642749860882759, + "num_tokens": 204914895.0, + "step": 170310 + }, + { + "entropy": 1.847599171102047, + "epoch": 0.5279770978464133, + "grad_norm": 9.81718921661377, + "learning_rate": 3.4816604346346944e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8613127902150154, + "num_tokens": 204926771.0, + "step": 170320 + }, + { + "entropy": 1.9407305240631103, + "epoch": 0.528008096971463, + "grad_norm": 7.33196496963501, + "learning_rate": 3.4815582291518313e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8560499280691147, + "num_tokens": 204938238.0, + "step": 170330 + }, + { + "entropy": 1.8835730522871017, + "epoch": 0.5280390960965127, + "grad_norm": 7.579594612121582, + "learning_rate": 3.4814560326692833e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8645493924617768, + "num_tokens": 204949767.0, + "step": 170340 + }, + { + "entropy": 1.8464823752641677, + "epoch": 0.5280700952215623, + "grad_norm": 4.027227401733398, + "learning_rate": 3.4813538451857298e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8526124745607376, + "num_tokens": 204962686.0, + "step": 170350 + }, + { + "entropy": 1.7890559926629066, + "epoch": 0.5281010943466121, + "grad_norm": 2.017467498779297, + "learning_rate": 3.4812516666998506e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8606483578681946, + "num_tokens": 204976198.0, + "step": 170360 + }, + { + "entropy": 1.922431980073452, + "epoch": 0.5281320934716618, + "grad_norm": 8.555486679077148, + "learning_rate": 3.4811494972103254e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8491716757416725, + "num_tokens": 204987681.0, + "step": 170370 + }, + { + "entropy": 1.943414855003357, + "epoch": 0.5281630925967115, + "grad_norm": 9.159770011901855, + "learning_rate": 3.481047336715833e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8497003749012947, + "num_tokens": 204999270.0, + "step": 170380 + }, + { + "entropy": 1.9107708364725113, + "epoch": 0.5281940917217611, + "grad_norm": 7.628677845001221, + "learning_rate": 3.4809451852150548e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8630807980895042, + "num_tokens": 205010971.0, + "step": 170390 + }, + { + "entropy": 1.863230326771736, + "epoch": 0.5282250908468108, + "grad_norm": 8.68991756439209, + "learning_rate": 3.4808430427066714e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8568514421582222, + "num_tokens": 205023252.0, + "step": 170400 + }, + { + "entropy": 1.8994240581989288, + "epoch": 0.5282560899718606, + "grad_norm": 4.405247211456299, + "learning_rate": 3.4807409091893627e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8561380252242088, + "num_tokens": 205035045.0, + "step": 170410 + }, + { + "entropy": 1.8784004136919976, + "epoch": 0.5282870890969102, + "grad_norm": 7.923066139221191, + "learning_rate": 3.48063878466181e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8491328284144402, + "num_tokens": 205047400.0, + "step": 170420 + }, + { + "entropy": 1.850473153591156, + "epoch": 0.5283180882219599, + "grad_norm": 8.640925407409668, + "learning_rate": 3.480536669122695e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8576840028166771, + "num_tokens": 205060199.0, + "step": 170430 + }, + { + "entropy": 1.8898152530193328, + "epoch": 0.5283490873470096, + "grad_norm": 9.438566207885742, + "learning_rate": 3.480434562570698e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8480508312582969, + "num_tokens": 205072684.0, + "step": 170440 + }, + { + "entropy": 1.8964637443423271, + "epoch": 0.5283800864720594, + "grad_norm": 7.4299798011779785, + "learning_rate": 3.4803324650045023e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.851615709066391, + "num_tokens": 205084596.0, + "step": 170450 + }, + { + "entropy": 1.928575037419796, + "epoch": 0.528411085597109, + "grad_norm": 9.050699234008789, + "learning_rate": 3.480230376422789e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8607550710439682, + "num_tokens": 205096396.0, + "step": 170460 + }, + { + "entropy": 1.9804482892155648, + "epoch": 0.5284420847221587, + "grad_norm": 11.58686637878418, + "learning_rate": 3.4801282968242415e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8415035635232926, + "num_tokens": 205107901.0, + "step": 170470 + }, + { + "entropy": 1.965294747054577, + "epoch": 0.5284730838472084, + "grad_norm": 3.7001168727874756, + "learning_rate": 3.4800262262075415e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8423107132315636, + "num_tokens": 205119244.0, + "step": 170480 + }, + { + "entropy": 1.9720071524381637, + "epoch": 0.5285040829722581, + "grad_norm": 7.256556034088135, + "learning_rate": 3.4799241645713715e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8573978587985038, + "num_tokens": 205129854.0, + "step": 170490 + }, + { + "entropy": 1.9305579409003257, + "epoch": 0.5285350820973078, + "grad_norm": 6.5054097175598145, + "learning_rate": 3.479822111914416e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8480049282312393, + "num_tokens": 205141601.0, + "step": 170500 + }, + { + "entropy": 1.7918709874153138, + "epoch": 0.5285660812223575, + "grad_norm": 8.549609184265137, + "learning_rate": 3.479720068235358e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8719558849930763, + "num_tokens": 205154051.0, + "step": 170510 + }, + { + "entropy": 1.921235775947571, + "epoch": 0.5285970803474072, + "grad_norm": 4.8535895347595215, + "learning_rate": 3.4796180335328818e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8587585180997849, + "num_tokens": 205166626.0, + "step": 170520 + }, + { + "entropy": 1.836370651423931, + "epoch": 0.5286280794724569, + "grad_norm": 7.811720371246338, + "learning_rate": 3.47951600780567e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8683404237031936, + "num_tokens": 205180061.0, + "step": 170530 + }, + { + "entropy": 1.9387354284524918, + "epoch": 0.5286590785975066, + "grad_norm": 10.000458717346191, + "learning_rate": 3.4794139910524073e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8442668840289116, + "num_tokens": 205190252.0, + "step": 170540 + }, + { + "entropy": 1.9495687514543534, + "epoch": 0.5286900777225563, + "grad_norm": 8.926695823669434, + "learning_rate": 3.479311983271778e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8437932714819908, + "num_tokens": 205200850.0, + "step": 170550 + }, + { + "entropy": 1.8872013121843338, + "epoch": 0.5287210768476059, + "grad_norm": 8.04249382019043, + "learning_rate": 3.4792099844624676e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8474228546023369, + "num_tokens": 205212588.0, + "step": 170560 + }, + { + "entropy": 1.8189583510160445, + "epoch": 0.5287520759726557, + "grad_norm": 6.7188568115234375, + "learning_rate": 3.479107994623162e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8674964413046837, + "num_tokens": 205225588.0, + "step": 170570 + }, + { + "entropy": 1.8325736671686172, + "epoch": 0.5287830750977054, + "grad_norm": 3.9094064235687256, + "learning_rate": 3.4790060137525445e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8595153912901878, + "num_tokens": 205238340.0, + "step": 170580 + }, + { + "entropy": 1.7467509105801582, + "epoch": 0.5288140742227551, + "grad_norm": 4.745372295379639, + "learning_rate": 3.478904041849302e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8667123362421989, + "num_tokens": 205252508.0, + "step": 170590 + }, + { + "entropy": 1.8727618649601936, + "epoch": 0.5288450733478047, + "grad_norm": 8.714336395263672, + "learning_rate": 3.4788020789121196e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8614105626940727, + "num_tokens": 205264806.0, + "step": 170600 + }, + { + "entropy": 1.9451875492930413, + "epoch": 0.5288760724728545, + "grad_norm": 7.6290388107299805, + "learning_rate": 3.478700124939684e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8461343124508858, + "num_tokens": 205276133.0, + "step": 170610 + }, + { + "entropy": 1.9926445066928864, + "epoch": 0.5289070715979042, + "grad_norm": 8.265352249145508, + "learning_rate": 3.4785981799306824e-06, + "loss": 0.5201, + "mean_token_accuracy": 0.8461702167987823, + "num_tokens": 205286443.0, + "step": 170620 + }, + { + "entropy": 1.9504462242126466, + "epoch": 0.5289380707229538, + "grad_norm": 6.742199420928955, + "learning_rate": 3.4784962438838e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.843754306435585, + "num_tokens": 205297534.0, + "step": 170630 + }, + { + "entropy": 1.8863413706421852, + "epoch": 0.5289690698480035, + "grad_norm": 7.461973667144775, + "learning_rate": 3.478394316797724e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8559864416718483, + "num_tokens": 205310114.0, + "step": 170640 + }, + { + "entropy": 1.9195245072245597, + "epoch": 0.5290000689730532, + "grad_norm": 7.091519355773926, + "learning_rate": 3.4782923986711427e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.861282479763031, + "num_tokens": 205322178.0, + "step": 170650 + }, + { + "entropy": 1.9424390122294426, + "epoch": 0.529031068098103, + "grad_norm": 7.431682586669922, + "learning_rate": 3.4781904895027417e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8677595168352127, + "num_tokens": 205333899.0, + "step": 170660 + }, + { + "entropy": 1.9619128614664079, + "epoch": 0.5290620672231526, + "grad_norm": 7.763944625854492, + "learning_rate": 3.4780885892912114e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8576542928814888, + "num_tokens": 205345194.0, + "step": 170670 + }, + { + "entropy": 1.813578100502491, + "epoch": 0.5290930663482023, + "grad_norm": 9.515689849853516, + "learning_rate": 3.4779866980352376e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8588124215602875, + "num_tokens": 205358327.0, + "step": 170680 + }, + { + "entropy": 1.9403289988636971, + "epoch": 0.529124065473252, + "grad_norm": 4.967763423919678, + "learning_rate": 3.477884815733509e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8475630313158036, + "num_tokens": 205370299.0, + "step": 170690 + }, + { + "entropy": 1.9086318165063858, + "epoch": 0.5291550645983017, + "grad_norm": 4.940814018249512, + "learning_rate": 3.4777829423847153e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8549073994159698, + "num_tokens": 205381624.0, + "step": 170700 + }, + { + "entropy": 1.870780572295189, + "epoch": 0.5291860637233514, + "grad_norm": 8.352327346801758, + "learning_rate": 3.4776810779875443e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8714735135436058, + "num_tokens": 205394297.0, + "step": 170710 + }, + { + "entropy": 1.930020920932293, + "epoch": 0.5292170628484011, + "grad_norm": 7.697651386260986, + "learning_rate": 3.477579222540686e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.857844527065754, + "num_tokens": 205406236.0, + "step": 170720 + }, + { + "entropy": 1.8871884644031525, + "epoch": 0.5292480619734508, + "grad_norm": 8.5154447555542, + "learning_rate": 3.4774773760428288e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8678113892674446, + "num_tokens": 205417888.0, + "step": 170730 + }, + { + "entropy": 1.882606066763401, + "epoch": 0.5292790610985005, + "grad_norm": 9.096325874328613, + "learning_rate": 3.4773755384926622e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8506225869059563, + "num_tokens": 205429795.0, + "step": 170740 + }, + { + "entropy": 1.810667596757412, + "epoch": 0.5293100602235502, + "grad_norm": 9.180743217468262, + "learning_rate": 3.477273709888877e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8561224222183228, + "num_tokens": 205442704.0, + "step": 170750 + }, + { + "entropy": 1.815290132164955, + "epoch": 0.5293410593485999, + "grad_norm": 3.778643846511841, + "learning_rate": 3.4771718902301632e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8607045948505402, + "num_tokens": 205455503.0, + "step": 170760 + }, + { + "entropy": 1.8823131069540977, + "epoch": 0.5293720584736495, + "grad_norm": 4.358188152313232, + "learning_rate": 3.4770700795152113e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.857579217851162, + "num_tokens": 205467961.0, + "step": 170770 + }, + { + "entropy": 1.9376216232776642, + "epoch": 0.5294030575986993, + "grad_norm": 8.354575157165527, + "learning_rate": 3.4769682777427115e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8583876609802246, + "num_tokens": 205479799.0, + "step": 170780 + }, + { + "entropy": 2.025956559181213, + "epoch": 0.529434056723749, + "grad_norm": 8.568845748901367, + "learning_rate": 3.4768664849113554e-06, + "loss": 0.554, + "mean_token_accuracy": 0.834122110903263, + "num_tokens": 205490649.0, + "step": 170790 + }, + { + "entropy": 1.87187303006649, + "epoch": 0.5294650558487987, + "grad_norm": 9.696044921875, + "learning_rate": 3.4767647010198333e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8556722313165664, + "num_tokens": 205503385.0, + "step": 170800 + }, + { + "entropy": 1.916469356417656, + "epoch": 0.5294960549738483, + "grad_norm": 8.421268463134766, + "learning_rate": 3.476662926066838e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8445528611540795, + "num_tokens": 205515716.0, + "step": 170810 + }, + { + "entropy": 2.0089281469583513, + "epoch": 0.5295270540988981, + "grad_norm": 8.422050476074219, + "learning_rate": 3.476561160051061e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.8411181181669235, + "num_tokens": 205526612.0, + "step": 170820 + }, + { + "entropy": 1.8615966871380807, + "epoch": 0.5295580532239478, + "grad_norm": 8.690731048583984, + "learning_rate": 3.4764594029711927e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8632576271891594, + "num_tokens": 205538945.0, + "step": 170830 + }, + { + "entropy": 1.9792701214551927, + "epoch": 0.5295890523489974, + "grad_norm": 7.975484371185303, + "learning_rate": 3.476357654825928e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8475763037800789, + "num_tokens": 205549884.0, + "step": 170840 + }, + { + "entropy": 1.880912221968174, + "epoch": 0.5296200514740471, + "grad_norm": 8.423057556152344, + "learning_rate": 3.476255915613958e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8568846389651299, + "num_tokens": 205562367.0, + "step": 170850 + }, + { + "entropy": 1.898339208960533, + "epoch": 0.5296510505990969, + "grad_norm": 3.246166706085205, + "learning_rate": 3.4761541853339758e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8588922902941704, + "num_tokens": 205574760.0, + "step": 170860 + }, + { + "entropy": 1.9326817393302917, + "epoch": 0.5296820497241466, + "grad_norm": 7.557738304138184, + "learning_rate": 3.476052463984674e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8551507011055947, + "num_tokens": 205586611.0, + "step": 170870 + }, + { + "entropy": 1.9366946816444397, + "epoch": 0.5297130488491962, + "grad_norm": 9.150696754455566, + "learning_rate": 3.475950751564748e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.852847796678543, + "num_tokens": 205598159.0, + "step": 170880 + }, + { + "entropy": 1.8673755928874016, + "epoch": 0.5297440479742459, + "grad_norm": 8.274737358093262, + "learning_rate": 3.4758490480728883e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.859563185274601, + "num_tokens": 205610695.0, + "step": 170890 + }, + { + "entropy": 1.9312998801469803, + "epoch": 0.5297750470992956, + "grad_norm": 4.422121524810791, + "learning_rate": 3.4757473535077918e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8544506028294563, + "num_tokens": 205623043.0, + "step": 170900 + }, + { + "entropy": 1.878605456650257, + "epoch": 0.5298060462243454, + "grad_norm": 8.475115776062012, + "learning_rate": 3.475645667868151e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8737639874219895, + "num_tokens": 205635875.0, + "step": 170910 + }, + { + "entropy": 1.8959810853004455, + "epoch": 0.529837045349395, + "grad_norm": 7.229968070983887, + "learning_rate": 3.4755439911526608e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8536716759204864, + "num_tokens": 205648414.0, + "step": 170920 + }, + { + "entropy": 1.9027748540043832, + "epoch": 0.5298680444744447, + "grad_norm": 7.403899669647217, + "learning_rate": 3.4754423233600166e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8609417542815209, + "num_tokens": 205660124.0, + "step": 170930 + }, + { + "entropy": 1.8189100459218026, + "epoch": 0.5298990435994944, + "grad_norm": 7.083561420440674, + "learning_rate": 3.475340664488912e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.860698975622654, + "num_tokens": 205672963.0, + "step": 170940 + }, + { + "entropy": 1.9738568156957625, + "epoch": 0.5299300427245441, + "grad_norm": 4.123296737670898, + "learning_rate": 3.4752390145380434e-06, + "loss": 0.469, + "mean_token_accuracy": 0.851046659052372, + "num_tokens": 205683908.0, + "step": 170950 + }, + { + "entropy": 1.9216079100966454, + "epoch": 0.5299610418495938, + "grad_norm": 7.863401412963867, + "learning_rate": 3.4751373735061063e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8537676095962524, + "num_tokens": 205695636.0, + "step": 170960 + }, + { + "entropy": 1.9529218971729279, + "epoch": 0.5299920409746435, + "grad_norm": 4.082332611083984, + "learning_rate": 3.4750357413917956e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8507506966590881, + "num_tokens": 205707912.0, + "step": 170970 + }, + { + "entropy": 1.9960295543074609, + "epoch": 0.5300230400996931, + "grad_norm": 9.555853843688965, + "learning_rate": 3.4749341181938086e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8483745753765106, + "num_tokens": 205719170.0, + "step": 170980 + }, + { + "entropy": 1.9430689826607703, + "epoch": 0.5300540392247429, + "grad_norm": 6.543044567108154, + "learning_rate": 3.4748325039108404e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8665653809905052, + "num_tokens": 205730921.0, + "step": 170990 + }, + { + "entropy": 1.7677809298038483, + "epoch": 0.5300850383497926, + "grad_norm": 4.4991536140441895, + "learning_rate": 3.4747308985415882e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8721923053264617, + "num_tokens": 205745399.0, + "step": 171000 + }, + { + "entropy": 1.8735228538513184, + "epoch": 0.5301160374748423, + "grad_norm": 7.107239723205566, + "learning_rate": 3.4746293020847494e-06, + "loss": 0.436, + "mean_token_accuracy": 0.856609933078289, + "num_tokens": 205758439.0, + "step": 171010 + }, + { + "entropy": 1.9389600038528443, + "epoch": 0.5301470365998919, + "grad_norm": 8.752920150756836, + "learning_rate": 3.4745277145390203e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8403217494487762, + "num_tokens": 205770191.0, + "step": 171020 + }, + { + "entropy": 1.9211769714951514, + "epoch": 0.5301780357249417, + "grad_norm": 8.575197219848633, + "learning_rate": 3.474426135903099e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8504500105977059, + "num_tokens": 205782618.0, + "step": 171030 + }, + { + "entropy": 1.9171297043561935, + "epoch": 0.5302090348499914, + "grad_norm": 7.678973197937012, + "learning_rate": 3.4743245661756834e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8509926497936249, + "num_tokens": 205794273.0, + "step": 171040 + }, + { + "entropy": 1.945462729036808, + "epoch": 0.530240033975041, + "grad_norm": 6.8942060470581055, + "learning_rate": 3.4742230053554697e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8553547367453576, + "num_tokens": 205805838.0, + "step": 171050 + }, + { + "entropy": 1.9018363282084465, + "epoch": 0.5302710331000907, + "grad_norm": 7.0610504150390625, + "learning_rate": 3.474121453441158e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8616586282849312, + "num_tokens": 205818642.0, + "step": 171060 + }, + { + "entropy": 1.9222232773900032, + "epoch": 0.5303020322251405, + "grad_norm": 6.2709784507751465, + "learning_rate": 3.474019910431446e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8556934267282486, + "num_tokens": 205831069.0, + "step": 171070 + }, + { + "entropy": 1.8316721200942994, + "epoch": 0.5303330313501902, + "grad_norm": 3.8591957092285156, + "learning_rate": 3.4739183763250324e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8605335086584092, + "num_tokens": 205844403.0, + "step": 171080 + }, + { + "entropy": 1.9160492144525052, + "epoch": 0.5303640304752398, + "grad_norm": 2.401305913925171, + "learning_rate": 3.4738168511206166e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8519866108894348, + "num_tokens": 205857195.0, + "step": 171090 + }, + { + "entropy": 1.9100564211606978, + "epoch": 0.5303950296002895, + "grad_norm": 9.531085968017578, + "learning_rate": 3.4737153348168974e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8574813723564148, + "num_tokens": 205868676.0, + "step": 171100 + }, + { + "entropy": 1.892510113120079, + "epoch": 0.5304260287253393, + "grad_norm": 8.419562339782715, + "learning_rate": 3.4736138274125736e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8523040831089019, + "num_tokens": 205880267.0, + "step": 171110 + }, + { + "entropy": 1.930445173382759, + "epoch": 0.530457027850389, + "grad_norm": 8.468448638916016, + "learning_rate": 3.473512328906347e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8609248101711273, + "num_tokens": 205891258.0, + "step": 171120 + }, + { + "entropy": 1.933561460673809, + "epoch": 0.5304880269754386, + "grad_norm": 9.11067008972168, + "learning_rate": 3.473410839296916e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8550457268953323, + "num_tokens": 205902734.0, + "step": 171130 + }, + { + "entropy": 1.8673659279942512, + "epoch": 0.5305190261004883, + "grad_norm": 7.698315143585205, + "learning_rate": 3.4733093585829824e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8555175766348839, + "num_tokens": 205915319.0, + "step": 171140 + }, + { + "entropy": 1.87417384237051, + "epoch": 0.530550025225538, + "grad_norm": 9.63538646697998, + "learning_rate": 3.4732078867632454e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8585587859153747, + "num_tokens": 205927617.0, + "step": 171150 + }, + { + "entropy": 1.7987977162003517, + "epoch": 0.5305810243505877, + "grad_norm": 8.212528228759766, + "learning_rate": 3.473106423836406e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8556270152330399, + "num_tokens": 205941685.0, + "step": 171160 + }, + { + "entropy": 1.9743974149227141, + "epoch": 0.5306120234756374, + "grad_norm": 8.22183609008789, + "learning_rate": 3.4730049698011663e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8619926169514656, + "num_tokens": 205952726.0, + "step": 171170 + }, + { + "entropy": 1.8682279139757156, + "epoch": 0.5306430226006871, + "grad_norm": 8.998403549194336, + "learning_rate": 3.4729035246562272e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8567379638552666, + "num_tokens": 205965714.0, + "step": 171180 + }, + { + "entropy": 1.999984535574913, + "epoch": 0.5306740217257367, + "grad_norm": 10.236671447753906, + "learning_rate": 3.4728020884002904e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8585654407739639, + "num_tokens": 205976447.0, + "step": 171190 + }, + { + "entropy": 1.9566071853041649, + "epoch": 0.5307050208507865, + "grad_norm": 8.674802780151367, + "learning_rate": 3.4727006610320568e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8497566193342209, + "num_tokens": 205987680.0, + "step": 171200 + }, + { + "entropy": 1.9564196184277534, + "epoch": 0.5307360199758362, + "grad_norm": 9.1547269821167, + "learning_rate": 3.4725992425502305e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8570850923657417, + "num_tokens": 205999031.0, + "step": 171210 + }, + { + "entropy": 1.8233773186802864, + "epoch": 0.5307670191008859, + "grad_norm": 3.746918201446533, + "learning_rate": 3.4724978329535126e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.868577241897583, + "num_tokens": 206012684.0, + "step": 171220 + }, + { + "entropy": 1.9081631690263747, + "epoch": 0.5307980182259355, + "grad_norm": 8.837114334106445, + "learning_rate": 3.472396432240606e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8688212335109711, + "num_tokens": 206024510.0, + "step": 171230 + }, + { + "entropy": 1.9076814904808999, + "epoch": 0.5308290173509853, + "grad_norm": 3.593170642852783, + "learning_rate": 3.4722950404102142e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8619635835289955, + "num_tokens": 206036479.0, + "step": 171240 + }, + { + "entropy": 1.9302921831607818, + "epoch": 0.530860016476035, + "grad_norm": 9.049263000488281, + "learning_rate": 3.4721936574610398e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8401584982872009, + "num_tokens": 206048513.0, + "step": 171250 + }, + { + "entropy": 1.9466217041015625, + "epoch": 0.5308910156010846, + "grad_norm": 8.891075134277344, + "learning_rate": 3.4720922833917873e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8489014402031898, + "num_tokens": 206059987.0, + "step": 171260 + }, + { + "entropy": 1.9733015805482865, + "epoch": 0.5309220147261343, + "grad_norm": 8.178411483764648, + "learning_rate": 3.471990918201159e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8565971910953522, + "num_tokens": 206071045.0, + "step": 171270 + }, + { + "entropy": 1.9010334610939026, + "epoch": 0.5309530138511841, + "grad_norm": 4.258671283721924, + "learning_rate": 3.4718895618878607e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8510370135307312, + "num_tokens": 206083794.0, + "step": 171280 + }, + { + "entropy": 1.9572122678160668, + "epoch": 0.5309840129762338, + "grad_norm": 6.8072285652160645, + "learning_rate": 3.4717882144505954e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8524574264883995, + "num_tokens": 206095828.0, + "step": 171290 + }, + { + "entropy": 1.7896913141012192, + "epoch": 0.5310150121012834, + "grad_norm": 8.507625579833984, + "learning_rate": 3.471686875888068e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8647554531693459, + "num_tokens": 206109338.0, + "step": 171300 + }, + { + "entropy": 1.9450671926140786, + "epoch": 0.5310460112263331, + "grad_norm": 9.12865161895752, + "learning_rate": 3.471585546198984e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8636203348636627, + "num_tokens": 206120543.0, + "step": 171310 + }, + { + "entropy": 2.017956680059433, + "epoch": 0.5310770103513829, + "grad_norm": 7.720823287963867, + "learning_rate": 3.4714842253820474e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.8404148548841477, + "num_tokens": 206131003.0, + "step": 171320 + }, + { + "entropy": 1.9527254745364189, + "epoch": 0.5311080094764326, + "grad_norm": 8.367618560791016, + "learning_rate": 3.4713829134359637e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8573798596858978, + "num_tokens": 206142871.0, + "step": 171330 + }, + { + "entropy": 1.8782565340399742, + "epoch": 0.5311390086014822, + "grad_norm": 11.827756881713867, + "learning_rate": 3.47128161035944e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8391523391008378, + "num_tokens": 206155200.0, + "step": 171340 + }, + { + "entropy": 1.8876243814826013, + "epoch": 0.5311700077265319, + "grad_norm": 2.4594743251800537, + "learning_rate": 3.471180316151181e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8383335024118423, + "num_tokens": 206167298.0, + "step": 171350 + }, + { + "entropy": 1.9841479808092117, + "epoch": 0.5312010068515816, + "grad_norm": 7.097223281860352, + "learning_rate": 3.4710790308098923e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8582739785313607, + "num_tokens": 206178070.0, + "step": 171360 + }, + { + "entropy": 1.7915344461798668, + "epoch": 0.5312320059766313, + "grad_norm": 6.845920562744141, + "learning_rate": 3.4709777543342814e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8660350710153579, + "num_tokens": 206191483.0, + "step": 171370 + }, + { + "entropy": 1.9542489141225814, + "epoch": 0.531263005101681, + "grad_norm": 5.8323211669921875, + "learning_rate": 3.470876486723055e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8544400915503502, + "num_tokens": 206202729.0, + "step": 171380 + }, + { + "entropy": 1.9344114646315576, + "epoch": 0.5312940042267307, + "grad_norm": 7.813360214233398, + "learning_rate": 3.4707752279749196e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8451777279376984, + "num_tokens": 206214813.0, + "step": 171390 + }, + { + "entropy": 1.8977814212441444, + "epoch": 0.5313250033517803, + "grad_norm": 7.615082263946533, + "learning_rate": 3.470673978088583e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8473282709717751, + "num_tokens": 206226984.0, + "step": 171400 + }, + { + "entropy": 1.835694682598114, + "epoch": 0.5313560024768301, + "grad_norm": 7.679605960845947, + "learning_rate": 3.470572737062751e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.86816466152668, + "num_tokens": 206239730.0, + "step": 171410 + }, + { + "entropy": 1.909173347055912, + "epoch": 0.5313870016018798, + "grad_norm": 7.6606245040893555, + "learning_rate": 3.470471504896134e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.860865730047226, + "num_tokens": 206251566.0, + "step": 171420 + }, + { + "entropy": 1.9146757140755652, + "epoch": 0.5314180007269295, + "grad_norm": 9.230483055114746, + "learning_rate": 3.470370281587438e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8539326146245003, + "num_tokens": 206264185.0, + "step": 171430 + }, + { + "entropy": 1.965253323316574, + "epoch": 0.5314489998519791, + "grad_norm": 8.606255531311035, + "learning_rate": 3.4702690671353717e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8522212520241738, + "num_tokens": 206275569.0, + "step": 171440 + }, + { + "entropy": 1.9642528668045998, + "epoch": 0.5314799989770289, + "grad_norm": 4.579861640930176, + "learning_rate": 3.470167861538645e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8538163334131241, + "num_tokens": 206286986.0, + "step": 171450 + }, + { + "entropy": 1.9098300486803055, + "epoch": 0.5315109981020786, + "grad_norm": 8.911893844604492, + "learning_rate": 3.4700666647959643e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8599412739276886, + "num_tokens": 206298944.0, + "step": 171460 + }, + { + "entropy": 1.8672787815332412, + "epoch": 0.5315419972271282, + "grad_norm": 9.113754272460938, + "learning_rate": 3.4699654769060397e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8529917195439338, + "num_tokens": 206311131.0, + "step": 171470 + }, + { + "entropy": 1.91919025182724, + "epoch": 0.5315729963521779, + "grad_norm": 3.830425977706909, + "learning_rate": 3.4698642978675815e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8582732066512108, + "num_tokens": 206322833.0, + "step": 171480 + }, + { + "entropy": 1.9081533446907997, + "epoch": 0.5316039954772277, + "grad_norm": 9.601178169250488, + "learning_rate": 3.469763127679298e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8536054447293282, + "num_tokens": 206334813.0, + "step": 171490 + }, + { + "entropy": 1.929221235215664, + "epoch": 0.5316349946022774, + "grad_norm": 10.570503234863281, + "learning_rate": 3.4696619663399002e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8487908989191055, + "num_tokens": 206346558.0, + "step": 171500 + }, + { + "entropy": 1.8757613226771355, + "epoch": 0.531665993727327, + "grad_norm": 8.72884464263916, + "learning_rate": 3.4695608138480966e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.84766735881567, + "num_tokens": 206360134.0, + "step": 171510 + }, + { + "entropy": 1.9297456085681914, + "epoch": 0.5316969928523767, + "grad_norm": 3.734219551086426, + "learning_rate": 3.4694596702026e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8514823019504547, + "num_tokens": 206371913.0, + "step": 171520 + }, + { + "entropy": 1.9066060423851012, + "epoch": 0.5317279919774265, + "grad_norm": 8.339156150817871, + "learning_rate": 3.4693585354021194e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8531105428934097, + "num_tokens": 206385110.0, + "step": 171530 + }, + { + "entropy": 1.9329568713903427, + "epoch": 0.5317589911024762, + "grad_norm": 8.051816940307617, + "learning_rate": 3.469257409445365e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8447334930300713, + "num_tokens": 206396309.0, + "step": 171540 + }, + { + "entropy": 1.9102501556277276, + "epoch": 0.5317899902275258, + "grad_norm": 3.6929969787597656, + "learning_rate": 3.46915629233105e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.857056412100792, + "num_tokens": 206408121.0, + "step": 171550 + }, + { + "entropy": 1.8484255477786065, + "epoch": 0.5318209893525755, + "grad_norm": 7.808130264282227, + "learning_rate": 3.469055184057884e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8478391453623771, + "num_tokens": 206421278.0, + "step": 171560 + }, + { + "entropy": 1.7564107075333595, + "epoch": 0.5318519884776253, + "grad_norm": 4.053802013397217, + "learning_rate": 3.46895408462458e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8702838957309723, + "num_tokens": 206435223.0, + "step": 171570 + }, + { + "entropy": 1.904933924973011, + "epoch": 0.5318829876026749, + "grad_norm": 7.180025577545166, + "learning_rate": 3.4688529940298493e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8612485557794571, + "num_tokens": 206446956.0, + "step": 171580 + }, + { + "entropy": 1.9514161705970765, + "epoch": 0.5319139867277246, + "grad_norm": 8.409560203552246, + "learning_rate": 3.4687519122724043e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8485655844211578, + "num_tokens": 206458061.0, + "step": 171590 + }, + { + "entropy": 1.912647745013237, + "epoch": 0.5319449858527743, + "grad_norm": 3.6584439277648926, + "learning_rate": 3.4686508393509575e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8545682623982429, + "num_tokens": 206470372.0, + "step": 171600 + }, + { + "entropy": 1.895480439066887, + "epoch": 0.531975984977824, + "grad_norm": 7.713948726654053, + "learning_rate": 3.4685497752642215e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8665553316473961, + "num_tokens": 206481643.0, + "step": 171610 + }, + { + "entropy": 1.78747378885746, + "epoch": 0.5320069841028737, + "grad_norm": 9.084161758422852, + "learning_rate": 3.4684487200109096e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8678223341703415, + "num_tokens": 206495368.0, + "step": 171620 + }, + { + "entropy": 2.0036447823047636, + "epoch": 0.5320379832279234, + "grad_norm": 8.734299659729004, + "learning_rate": 3.468347673589735e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8604890152812004, + "num_tokens": 206505972.0, + "step": 171630 + }, + { + "entropy": 1.8867813602089882, + "epoch": 0.5320689823529731, + "grad_norm": 4.2584147453308105, + "learning_rate": 3.468246635999411e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8592818707227707, + "num_tokens": 206517895.0, + "step": 171640 + }, + { + "entropy": 1.8816465973854064, + "epoch": 0.5320999814780227, + "grad_norm": 3.920475721359253, + "learning_rate": 3.468145607238652e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.844399020075798, + "num_tokens": 206530527.0, + "step": 171650 + }, + { + "entropy": 1.887511445581913, + "epoch": 0.5321309806030725, + "grad_norm": 7.801617622375488, + "learning_rate": 3.4680445873061712e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8650444522500038, + "num_tokens": 206542889.0, + "step": 171660 + }, + { + "entropy": 1.9158253505825997, + "epoch": 0.5321619797281222, + "grad_norm": 9.402112007141113, + "learning_rate": 3.4679435762006837e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8618099376559257, + "num_tokens": 206555988.0, + "step": 171670 + }, + { + "entropy": 1.87806778550148, + "epoch": 0.5321929788531718, + "grad_norm": 9.117574691772461, + "learning_rate": 3.4678425739209038e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8550878211855888, + "num_tokens": 206569026.0, + "step": 171680 + }, + { + "entropy": 1.9359776824712753, + "epoch": 0.5322239779782215, + "grad_norm": 8.080842971801758, + "learning_rate": 3.467741580465545e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8594881877303123, + "num_tokens": 206580849.0, + "step": 171690 + }, + { + "entropy": 1.8874040782451629, + "epoch": 0.5322549771032713, + "grad_norm": 9.077346801757812, + "learning_rate": 3.4676405958333254e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8619496509432792, + "num_tokens": 206592399.0, + "step": 171700 + }, + { + "entropy": 1.9352701306343079, + "epoch": 0.532285976228321, + "grad_norm": 9.528621673583984, + "learning_rate": 3.4675396200229575e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.844129042327404, + "num_tokens": 206604443.0, + "step": 171710 + }, + { + "entropy": 1.930326594412327, + "epoch": 0.5323169753533706, + "grad_norm": 6.972187519073486, + "learning_rate": 3.4674386530331596e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8594956189393997, + "num_tokens": 206615898.0, + "step": 171720 + }, + { + "entropy": 1.8591390900313853, + "epoch": 0.5323479744784203, + "grad_norm": 8.316756248474121, + "learning_rate": 3.4673376948626446e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8666254863142967, + "num_tokens": 206628209.0, + "step": 171730 + }, + { + "entropy": 1.8329302787780761, + "epoch": 0.5323789736034701, + "grad_norm": 8.091080665588379, + "learning_rate": 3.467236745510131e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8599769771099091, + "num_tokens": 206640092.0, + "step": 171740 + }, + { + "entropy": 1.910070639848709, + "epoch": 0.5324099727285198, + "grad_norm": 7.389466762542725, + "learning_rate": 3.467135804974334e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8566348433494568, + "num_tokens": 206652537.0, + "step": 171750 + }, + { + "entropy": 1.8269345745444299, + "epoch": 0.5324409718535694, + "grad_norm": 7.980764389038086, + "learning_rate": 3.467034873253971e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8637316823005676, + "num_tokens": 206666095.0, + "step": 171760 + }, + { + "entropy": 1.8664949864149094, + "epoch": 0.5324719709786191, + "grad_norm": 7.793969631195068, + "learning_rate": 3.466933950347759e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8574715122580528, + "num_tokens": 206679078.0, + "step": 171770 + }, + { + "entropy": 1.980468001961708, + "epoch": 0.5325029701036689, + "grad_norm": 11.13428783416748, + "learning_rate": 3.466833036254414e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8566422179341316, + "num_tokens": 206689658.0, + "step": 171780 + }, + { + "entropy": 1.9217968866229058, + "epoch": 0.5325339692287185, + "grad_norm": 8.130260467529297, + "learning_rate": 3.4667321309726548e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8584070265293121, + "num_tokens": 206701510.0, + "step": 171790 + }, + { + "entropy": 1.9040216952562332, + "epoch": 0.5325649683537682, + "grad_norm": 9.502899169921875, + "learning_rate": 3.4666312345011983e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8540240123867988, + "num_tokens": 206713670.0, + "step": 171800 + }, + { + "entropy": 1.9251901611685753, + "epoch": 0.5325959674788179, + "grad_norm": 4.581042289733887, + "learning_rate": 3.4665303468387633e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8467323318123817, + "num_tokens": 206726091.0, + "step": 171810 + }, + { + "entropy": 1.9404249116778374, + "epoch": 0.5326269666038677, + "grad_norm": 9.953166007995605, + "learning_rate": 3.4664294679840674e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8406923890113831, + "num_tokens": 206737652.0, + "step": 171820 + }, + { + "entropy": 1.8462284199893475, + "epoch": 0.5326579657289173, + "grad_norm": 9.736047744750977, + "learning_rate": 3.466328597935829e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8614635065197944, + "num_tokens": 206751383.0, + "step": 171830 + }, + { + "entropy": 1.8233645886182785, + "epoch": 0.532688964853967, + "grad_norm": 2.792520046234131, + "learning_rate": 3.4662277366927677e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8697629243135452, + "num_tokens": 206764520.0, + "step": 171840 + }, + { + "entropy": 1.9205037295818328, + "epoch": 0.5327199639790167, + "grad_norm": 7.479151725769043, + "learning_rate": 3.4661268842536015e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.856606675684452, + "num_tokens": 206777062.0, + "step": 171850 + }, + { + "entropy": 1.8721492350101472, + "epoch": 0.5327509631040663, + "grad_norm": 7.801178455352783, + "learning_rate": 3.4660260406170507e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8510833352804184, + "num_tokens": 206789262.0, + "step": 171860 + }, + { + "entropy": 1.7701141953468322, + "epoch": 0.5327819622291161, + "grad_norm": 2.5856385231018066, + "learning_rate": 3.4659252057818343e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8556995436549186, + "num_tokens": 206803454.0, + "step": 171870 + }, + { + "entropy": 1.89136783182621, + "epoch": 0.5328129613541658, + "grad_norm": 3.5028934478759766, + "learning_rate": 3.4658243797466718e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8560910046100616, + "num_tokens": 206815864.0, + "step": 171880 + }, + { + "entropy": 1.8578203037381171, + "epoch": 0.5328439604792155, + "grad_norm": 7.930136203765869, + "learning_rate": 3.465723562510284e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8557386100292206, + "num_tokens": 206828998.0, + "step": 171890 + }, + { + "entropy": 1.883947178721428, + "epoch": 0.5328749596042651, + "grad_norm": 3.5195116996765137, + "learning_rate": 3.4656227540713905e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8585299208760262, + "num_tokens": 206841080.0, + "step": 171900 + }, + { + "entropy": 1.9098535567522048, + "epoch": 0.5329059587293149, + "grad_norm": 3.2760863304138184, + "learning_rate": 3.465521954428713e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8506251364946366, + "num_tokens": 206853482.0, + "step": 171910 + }, + { + "entropy": 1.9376119673252106, + "epoch": 0.5329369578543646, + "grad_norm": 3.709351062774658, + "learning_rate": 3.465421163580971e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8610507115721703, + "num_tokens": 206865078.0, + "step": 171920 + }, + { + "entropy": 1.9255750745534896, + "epoch": 0.5329679569794142, + "grad_norm": 8.195679664611816, + "learning_rate": 3.4653203815268865e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8562889978289604, + "num_tokens": 206877650.0, + "step": 171930 + }, + { + "entropy": 1.9404271885752677, + "epoch": 0.5329989561044639, + "grad_norm": 6.643172740936279, + "learning_rate": 3.46521960826518e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8628787726163865, + "num_tokens": 206888757.0, + "step": 171940 + }, + { + "entropy": 1.8620061218738555, + "epoch": 0.5330299552295137, + "grad_norm": 7.816823482513428, + "learning_rate": 3.465118843794575e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8594163581728935, + "num_tokens": 206901369.0, + "step": 171950 + }, + { + "entropy": 1.8807374939322472, + "epoch": 0.5330609543545634, + "grad_norm": 8.188911437988281, + "learning_rate": 3.465018088113791e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8510372474789619, + "num_tokens": 206913816.0, + "step": 171960 + }, + { + "entropy": 1.819548812508583, + "epoch": 0.533091953479613, + "grad_norm": 4.415698051452637, + "learning_rate": 3.464917341221552e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8620653316378594, + "num_tokens": 206927026.0, + "step": 171970 + }, + { + "entropy": 1.8535535261034966, + "epoch": 0.5331229526046627, + "grad_norm": 8.980545043945312, + "learning_rate": 3.4648166031165797e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8751401022076607, + "num_tokens": 206939349.0, + "step": 171980 + }, + { + "entropy": 1.9402846857905387, + "epoch": 0.5331539517297125, + "grad_norm": 8.309432983398438, + "learning_rate": 3.4647158737975966e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8573802545666694, + "num_tokens": 206951605.0, + "step": 171990 + }, + { + "entropy": 1.8485840871930121, + "epoch": 0.5331849508547621, + "grad_norm": 9.220359802246094, + "learning_rate": 3.4646151532633265e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8622194632887841, + "num_tokens": 206964468.0, + "step": 172000 + }, + { + "entropy": 1.9720681741833688, + "epoch": 0.5332159499798118, + "grad_norm": 8.475946426391602, + "learning_rate": 3.4645144415124916e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8575832083821296, + "num_tokens": 206975496.0, + "step": 172010 + }, + { + "entropy": 1.9544428601861, + "epoch": 0.5332469491048615, + "grad_norm": 8.449519157409668, + "learning_rate": 3.4644137385438166e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8478255704045295, + "num_tokens": 206987103.0, + "step": 172020 + }, + { + "entropy": 1.9222390592098235, + "epoch": 0.5332779482299113, + "grad_norm": 7.751585960388184, + "learning_rate": 3.4643130443560222e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8567695304751396, + "num_tokens": 206998633.0, + "step": 172030 + }, + { + "entropy": 1.8945856779813766, + "epoch": 0.5333089473549609, + "grad_norm": 8.54149055480957, + "learning_rate": 3.4642123589478366e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.85653665214777, + "num_tokens": 207011016.0, + "step": 172040 + }, + { + "entropy": 1.9423924744129182, + "epoch": 0.5333399464800106, + "grad_norm": 9.315898895263672, + "learning_rate": 3.464111682317981e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8562673062086106, + "num_tokens": 207022489.0, + "step": 172050 + }, + { + "entropy": 1.8690737947821616, + "epoch": 0.5333709456050603, + "grad_norm": 3.2628121376037598, + "learning_rate": 3.4640110144651813e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8605872854590416, + "num_tokens": 207035182.0, + "step": 172060 + }, + { + "entropy": 1.954857885837555, + "epoch": 0.53340194473011, + "grad_norm": 7.77451229095459, + "learning_rate": 3.463910355388162e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8507927462458611, + "num_tokens": 207046525.0, + "step": 172070 + }, + { + "entropy": 1.878305557370186, + "epoch": 0.5334329438551597, + "grad_norm": 4.307947158813477, + "learning_rate": 3.463809705085648e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8610593438148498, + "num_tokens": 207059144.0, + "step": 172080 + }, + { + "entropy": 1.893868698179722, + "epoch": 0.5334639429802094, + "grad_norm": 8.05561351776123, + "learning_rate": 3.4637090635563638e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8604981452226639, + "num_tokens": 207071120.0, + "step": 172090 + }, + { + "entropy": 1.87116037607193, + "epoch": 0.533494942105259, + "grad_norm": 2.938767194747925, + "learning_rate": 3.4636084307990363e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8630983516573906, + "num_tokens": 207084331.0, + "step": 172100 + }, + { + "entropy": 1.9578429192304612, + "epoch": 0.5335259412303087, + "grad_norm": 8.522046089172363, + "learning_rate": 3.4635078068123907e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8521770656108856, + "num_tokens": 207094960.0, + "step": 172110 + }, + { + "entropy": 1.8961300045251845, + "epoch": 0.5335569403553585, + "grad_norm": 9.013341903686523, + "learning_rate": 3.463407191595153e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8604053586721421, + "num_tokens": 207107582.0, + "step": 172120 + }, + { + "entropy": 1.9070754051208496, + "epoch": 0.5335879394804082, + "grad_norm": 6.7184929847717285, + "learning_rate": 3.463306585146049e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8622305661439895, + "num_tokens": 207119530.0, + "step": 172130 + }, + { + "entropy": 1.8905790030956269, + "epoch": 0.5336189386054578, + "grad_norm": 7.1780619621276855, + "learning_rate": 3.4632059874638064e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8592176213860512, + "num_tokens": 207131721.0, + "step": 172140 + }, + { + "entropy": 1.899731382727623, + "epoch": 0.5336499377305075, + "grad_norm": 7.558929443359375, + "learning_rate": 3.4631053985471513e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8633270218968392, + "num_tokens": 207143338.0, + "step": 172150 + }, + { + "entropy": 1.8563763827085495, + "epoch": 0.5336809368555573, + "grad_norm": 4.571922302246094, + "learning_rate": 3.4630048183948106e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8659122556447982, + "num_tokens": 207155624.0, + "step": 172160 + }, + { + "entropy": 1.8730071052908897, + "epoch": 0.533711935980607, + "grad_norm": 8.322432518005371, + "learning_rate": 3.462904247005513e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8556889921426774, + "num_tokens": 207167987.0, + "step": 172170 + }, + { + "entropy": 1.9720443457365036, + "epoch": 0.5337429351056566, + "grad_norm": 9.21069622039795, + "learning_rate": 3.4628036843779846e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.8485893458127975, + "num_tokens": 207179298.0, + "step": 172180 + }, + { + "entropy": 1.8745362177491187, + "epoch": 0.5337739342307063, + "grad_norm": 7.4704461097717285, + "learning_rate": 3.462703130510953e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8695771753787994, + "num_tokens": 207191249.0, + "step": 172190 + }, + { + "entropy": 1.9133777856826781, + "epoch": 0.5338049333557561, + "grad_norm": 9.01170539855957, + "learning_rate": 3.4626025854031475e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8624561950564384, + "num_tokens": 207202988.0, + "step": 172200 + }, + { + "entropy": 1.9582080647349358, + "epoch": 0.5338359324808057, + "grad_norm": 8.62938404083252, + "learning_rate": 3.462502049053296e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8430709898471832, + "num_tokens": 207214250.0, + "step": 172210 + }, + { + "entropy": 1.8965184196829796, + "epoch": 0.5338669316058554, + "grad_norm": 4.407395362854004, + "learning_rate": 3.462401521460128e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8566911399364472, + "num_tokens": 207227351.0, + "step": 172220 + }, + { + "entropy": 1.9487920999526978, + "epoch": 0.5338979307309051, + "grad_norm": 8.692533493041992, + "learning_rate": 3.462301002622371e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8509125307202339, + "num_tokens": 207238828.0, + "step": 172230 + }, + { + "entropy": 1.923832182586193, + "epoch": 0.5339289298559549, + "grad_norm": 8.191177368164062, + "learning_rate": 3.4622004925387546e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8531102791428566, + "num_tokens": 207251198.0, + "step": 172240 + }, + { + "entropy": 1.8550199180841447, + "epoch": 0.5339599289810045, + "grad_norm": 4.443090438842773, + "learning_rate": 3.4620999912080088e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8594103202223777, + "num_tokens": 207264801.0, + "step": 172250 + }, + { + "entropy": 1.9658312529325486, + "epoch": 0.5339909281060542, + "grad_norm": 7.00798225402832, + "learning_rate": 3.461999498628862e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.855456392467022, + "num_tokens": 207276463.0, + "step": 172260 + }, + { + "entropy": 1.9467895850539207, + "epoch": 0.5340219272311039, + "grad_norm": 10.077486038208008, + "learning_rate": 3.4618990148000458e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.844844251871109, + "num_tokens": 207288481.0, + "step": 172270 + }, + { + "entropy": 1.8941056042909623, + "epoch": 0.5340529263561536, + "grad_norm": 7.274338245391846, + "learning_rate": 3.4617985397202896e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8619851514697074, + "num_tokens": 207301250.0, + "step": 172280 + }, + { + "entropy": 1.9818316102027893, + "epoch": 0.5340839254812033, + "grad_norm": 4.565423965454102, + "learning_rate": 3.4616980733883243e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.847413894534111, + "num_tokens": 207312358.0, + "step": 172290 + }, + { + "entropy": 1.9006615951657295, + "epoch": 0.534114924606253, + "grad_norm": 4.181703090667725, + "learning_rate": 3.461597615802879e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8480461463332176, + "num_tokens": 207324643.0, + "step": 172300 + }, + { + "entropy": 1.949682556092739, + "epoch": 0.5341459237313027, + "grad_norm": 3.784156322479248, + "learning_rate": 3.4614971669626863e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8603693306446075, + "num_tokens": 207336517.0, + "step": 172310 + }, + { + "entropy": 1.9486548736691476, + "epoch": 0.5341769228563524, + "grad_norm": 6.828722953796387, + "learning_rate": 3.461396726866477e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8525962918996811, + "num_tokens": 207347650.0, + "step": 172320 + }, + { + "entropy": 1.9545435726642608, + "epoch": 0.5342079219814021, + "grad_norm": 7.952531337738037, + "learning_rate": 3.4612962955129826e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8619145333766938, + "num_tokens": 207359091.0, + "step": 172330 + }, + { + "entropy": 1.889906283468008, + "epoch": 0.5342389211064518, + "grad_norm": 7.539053440093994, + "learning_rate": 3.461195872900934e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8641923710703849, + "num_tokens": 207371795.0, + "step": 172340 + }, + { + "entropy": 1.9179092735052108, + "epoch": 0.5342699202315014, + "grad_norm": 3.5443851947784424, + "learning_rate": 3.461095459029065e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8606422588229179, + "num_tokens": 207383403.0, + "step": 172350 + }, + { + "entropy": 2.0034004122018816, + "epoch": 0.5343009193565511, + "grad_norm": 8.226675033569336, + "learning_rate": 3.460995053896107e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8465193957090378, + "num_tokens": 207394658.0, + "step": 172360 + }, + { + "entropy": 1.9629688054323196, + "epoch": 0.5343319184816009, + "grad_norm": 7.675604820251465, + "learning_rate": 3.4608946575007906e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8677529841661453, + "num_tokens": 207405234.0, + "step": 172370 + }, + { + "entropy": 1.9701706409454345, + "epoch": 0.5343629176066506, + "grad_norm": 7.960362434387207, + "learning_rate": 3.4607942698418523e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8556162640452385, + "num_tokens": 207416438.0, + "step": 172380 + }, + { + "entropy": 1.8850088074803353, + "epoch": 0.5343939167317002, + "grad_norm": 7.914768218994141, + "learning_rate": 3.4606938909180217e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8754544615745544, + "num_tokens": 207428510.0, + "step": 172390 + }, + { + "entropy": 1.8988526239991188, + "epoch": 0.5344249158567499, + "grad_norm": 9.443257331848145, + "learning_rate": 3.460593520728034e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8607255682349205, + "num_tokens": 207439902.0, + "step": 172400 + }, + { + "entropy": 1.8619684666395186, + "epoch": 0.5344559149817997, + "grad_norm": 8.755477905273438, + "learning_rate": 3.460493159270622e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8674828916788101, + "num_tokens": 207452303.0, + "step": 172410 + }, + { + "entropy": 1.9530357331037522, + "epoch": 0.5344869141068493, + "grad_norm": 7.649057388305664, + "learning_rate": 3.4603928065445197e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8552047446370125, + "num_tokens": 207463895.0, + "step": 172420 + }, + { + "entropy": 1.9203311637043954, + "epoch": 0.534517913231899, + "grad_norm": 9.419485092163086, + "learning_rate": 3.460292462548462e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8501846611499786, + "num_tokens": 207475641.0, + "step": 172430 + }, + { + "entropy": 1.965132975578308, + "epoch": 0.5345489123569487, + "grad_norm": 7.676466464996338, + "learning_rate": 3.4601921272811813e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8473555102944375, + "num_tokens": 207487141.0, + "step": 172440 + }, + { + "entropy": 1.9162400186061859, + "epoch": 0.5345799114819985, + "grad_norm": 8.840492248535156, + "learning_rate": 3.4600918007414135e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8445732891559601, + "num_tokens": 207500343.0, + "step": 172450 + }, + { + "entropy": 1.9381729751825332, + "epoch": 0.5346109106070481, + "grad_norm": 7.45047664642334, + "learning_rate": 3.4599914829278934e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8518120244145393, + "num_tokens": 207511612.0, + "step": 172460 + }, + { + "entropy": 1.967686542868614, + "epoch": 0.5346419097320978, + "grad_norm": 8.611394882202148, + "learning_rate": 3.459891173839356e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8540784746408463, + "num_tokens": 207523508.0, + "step": 172470 + }, + { + "entropy": 1.9352397471666336, + "epoch": 0.5346729088571475, + "grad_norm": 3.897338390350342, + "learning_rate": 3.459790873474536e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8492213904857635, + "num_tokens": 207535586.0, + "step": 172480 + }, + { + "entropy": 1.9683047980070114, + "epoch": 0.5347039079821972, + "grad_norm": 8.10174560546875, + "learning_rate": 3.45969058183217e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8603872656822205, + "num_tokens": 207546228.0, + "step": 172490 + }, + { + "entropy": 1.823877865076065, + "epoch": 0.5347349071072469, + "grad_norm": 7.564563751220703, + "learning_rate": 3.459590298910993e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8649493157863617, + "num_tokens": 207559795.0, + "step": 172500 + }, + { + "entropy": 1.8830648839473725, + "epoch": 0.5347659062322966, + "grad_norm": 7.6251301765441895, + "learning_rate": 3.459490024709742e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8656977102160454, + "num_tokens": 207572213.0, + "step": 172510 + }, + { + "entropy": 1.8875678315758706, + "epoch": 0.5347969053573463, + "grad_norm": 3.419621229171753, + "learning_rate": 3.4593897592271515e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8647677347064018, + "num_tokens": 207584322.0, + "step": 172520 + }, + { + "entropy": 1.9262285679578781, + "epoch": 0.534827904482396, + "grad_norm": 8.029248237609863, + "learning_rate": 3.4592895024619606e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8527608752250672, + "num_tokens": 207595700.0, + "step": 172530 + }, + { + "entropy": 1.9239438846707344, + "epoch": 0.5348589036074457, + "grad_norm": 9.767742156982422, + "learning_rate": 3.4591892544129045e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8469107151031494, + "num_tokens": 207607393.0, + "step": 172540 + }, + { + "entropy": 1.9097449451684951, + "epoch": 0.5348899027324954, + "grad_norm": 7.202256679534912, + "learning_rate": 3.459089015078721e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8678121000528336, + "num_tokens": 207619018.0, + "step": 172550 + }, + { + "entropy": 1.87255839407444, + "epoch": 0.534920901857545, + "grad_norm": 8.831064224243164, + "learning_rate": 3.4589887844581472e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8524145618081093, + "num_tokens": 207631371.0, + "step": 172560 + }, + { + "entropy": 1.8829514354467392, + "epoch": 0.5349519009825948, + "grad_norm": 9.208990097045898, + "learning_rate": 3.4588885625499207e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8572834596037865, + "num_tokens": 207643443.0, + "step": 172570 + }, + { + "entropy": 1.7449659064412117, + "epoch": 0.5349829001076445, + "grad_norm": 8.625064849853516, + "learning_rate": 3.45878834935278e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8635283961892128, + "num_tokens": 207657645.0, + "step": 172580 + }, + { + "entropy": 1.9483749613165855, + "epoch": 0.5350138992326942, + "grad_norm": 9.903924942016602, + "learning_rate": 3.458688144865463e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8563203975558281, + "num_tokens": 207669310.0, + "step": 172590 + }, + { + "entropy": 1.8220311045646667, + "epoch": 0.5350448983577438, + "grad_norm": 3.884104013442993, + "learning_rate": 3.4585879490867074e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8625266656279564, + "num_tokens": 207682331.0, + "step": 172600 + }, + { + "entropy": 1.7253479383885861, + "epoch": 0.5350758974827935, + "grad_norm": 8.76376724243164, + "learning_rate": 3.458487762015253e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8723403230309487, + "num_tokens": 207697659.0, + "step": 172610 + }, + { + "entropy": 1.9204844385385513, + "epoch": 0.5351068966078433, + "grad_norm": 8.889963150024414, + "learning_rate": 3.4583875836498375e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8568612888455391, + "num_tokens": 207709348.0, + "step": 172620 + }, + { + "entropy": 1.8080897092819215, + "epoch": 0.5351378957328929, + "grad_norm": 7.010310173034668, + "learning_rate": 3.458287413989201e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8615889981389045, + "num_tokens": 207722545.0, + "step": 172630 + }, + { + "entropy": 1.8895937889814376, + "epoch": 0.5351688948579426, + "grad_norm": 10.335844993591309, + "learning_rate": 3.4581872530320827e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8531122609972954, + "num_tokens": 207734311.0, + "step": 172640 + }, + { + "entropy": 1.979372352361679, + "epoch": 0.5351998939829923, + "grad_norm": 8.217429161071777, + "learning_rate": 3.4580871007772215e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8483906969428062, + "num_tokens": 207744931.0, + "step": 172650 + }, + { + "entropy": 1.8208840578794478, + "epoch": 0.5352308931080421, + "grad_norm": 4.37178897857666, + "learning_rate": 3.4579869572233585e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8537062719464302, + "num_tokens": 207757648.0, + "step": 172660 + }, + { + "entropy": 1.9177382573485375, + "epoch": 0.5352618922330917, + "grad_norm": 8.044947624206543, + "learning_rate": 3.457886822369234e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8472369626164437, + "num_tokens": 207769899.0, + "step": 172670 + }, + { + "entropy": 1.8201391249895096, + "epoch": 0.5352928913581414, + "grad_norm": 3.541053533554077, + "learning_rate": 3.4577866962135876e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8720339864492417, + "num_tokens": 207782654.0, + "step": 172680 + }, + { + "entropy": 1.8746508598327636, + "epoch": 0.5353238904831911, + "grad_norm": 4.510343551635742, + "learning_rate": 3.4576865787551605e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8468729451298713, + "num_tokens": 207794735.0, + "step": 172690 + }, + { + "entropy": 1.9637598276138306, + "epoch": 0.5353548896082408, + "grad_norm": 9.965831756591797, + "learning_rate": 3.4575864699926935e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8475473329424859, + "num_tokens": 207805438.0, + "step": 172700 + }, + { + "entropy": 1.8837266564369202, + "epoch": 0.5353858887332905, + "grad_norm": 4.100901126861572, + "learning_rate": 3.4574863699249277e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8563615873456001, + "num_tokens": 207817902.0, + "step": 172710 + }, + { + "entropy": 1.8690167635679245, + "epoch": 0.5354168878583402, + "grad_norm": 8.928622245788574, + "learning_rate": 3.457386278550605e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8597008213400841, + "num_tokens": 207829494.0, + "step": 172720 + }, + { + "entropy": 1.8866547778248788, + "epoch": 0.5354478869833899, + "grad_norm": 8.530135154724121, + "learning_rate": 3.4572861958684666e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8589810565114021, + "num_tokens": 207841741.0, + "step": 172730 + }, + { + "entropy": 1.874574901163578, + "epoch": 0.5354788861084396, + "grad_norm": 8.313958168029785, + "learning_rate": 3.457186121877255e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.852715365588665, + "num_tokens": 207853896.0, + "step": 172740 + }, + { + "entropy": 1.8388253033161164, + "epoch": 0.5355098852334893, + "grad_norm": 3.2290029525756836, + "learning_rate": 3.4570860565757115e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8671393066644668, + "num_tokens": 207867006.0, + "step": 172750 + }, + { + "entropy": 1.9595274776220322, + "epoch": 0.535540884358539, + "grad_norm": 8.689278602600098, + "learning_rate": 3.45698599996258e-06, + "loss": 0.5341, + "mean_token_accuracy": 0.8384348094463349, + "num_tokens": 207878408.0, + "step": 172760 + }, + { + "entropy": 1.8406404912471772, + "epoch": 0.5355718834835886, + "grad_norm": 8.577286720275879, + "learning_rate": 3.456885952036602e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8525245815515519, + "num_tokens": 207891429.0, + "step": 172770 + }, + { + "entropy": 1.8390149533748628, + "epoch": 0.5356028826086384, + "grad_norm": 2.493551731109619, + "learning_rate": 3.4567859127965212e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8665968149900436, + "num_tokens": 207904908.0, + "step": 172780 + }, + { + "entropy": 1.9711681693792342, + "epoch": 0.5356338817336881, + "grad_norm": 8.279610633850098, + "learning_rate": 3.4566858822410814e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8502449646592141, + "num_tokens": 207916097.0, + "step": 172790 + }, + { + "entropy": 1.8366296276450158, + "epoch": 0.5356648808587378, + "grad_norm": 3.318671226501465, + "learning_rate": 3.4565858603690243e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8724953427910804, + "num_tokens": 207928142.0, + "step": 172800 + }, + { + "entropy": 1.866967648267746, + "epoch": 0.5356958799837874, + "grad_norm": 8.332340240478516, + "learning_rate": 3.4564858471790957e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8624767050147056, + "num_tokens": 207940178.0, + "step": 172810 + }, + { + "entropy": 1.911839447915554, + "epoch": 0.5357268791088372, + "grad_norm": 8.774737358093262, + "learning_rate": 3.456385842670038e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8479789897799492, + "num_tokens": 207952013.0, + "step": 172820 + }, + { + "entropy": 1.8767947524785995, + "epoch": 0.5357578782338869, + "grad_norm": 7.686695098876953, + "learning_rate": 3.4562858468405963e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8593664765357971, + "num_tokens": 207964510.0, + "step": 172830 + }, + { + "entropy": 1.8312649622559547, + "epoch": 0.5357888773589365, + "grad_norm": 3.4589436054229736, + "learning_rate": 3.456185859689516e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8600029453635216, + "num_tokens": 207977857.0, + "step": 172840 + }, + { + "entropy": 1.7913474388420583, + "epoch": 0.5358198764839862, + "grad_norm": 6.77054500579834, + "learning_rate": 3.45608588121554e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8663949474692345, + "num_tokens": 207991073.0, + "step": 172850 + }, + { + "entropy": 1.8621043905615806, + "epoch": 0.5358508756090359, + "grad_norm": 8.442105293273926, + "learning_rate": 3.455985911417414e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8533388510346412, + "num_tokens": 208003843.0, + "step": 172860 + }, + { + "entropy": 1.876605623960495, + "epoch": 0.5358818747340857, + "grad_norm": 9.525609016418457, + "learning_rate": 3.455885950293884e-06, + "loss": 0.5398, + "mean_token_accuracy": 0.8462421506643295, + "num_tokens": 208015803.0, + "step": 172870 + }, + { + "entropy": 1.8648066401481629, + "epoch": 0.5359128738591353, + "grad_norm": 8.97998046875, + "learning_rate": 3.455785997843695e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8536581054329873, + "num_tokens": 208028644.0, + "step": 172880 + }, + { + "entropy": 1.9021985232830048, + "epoch": 0.535943872984185, + "grad_norm": 9.232839584350586, + "learning_rate": 3.4556860540655936e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8553122028708458, + "num_tokens": 208039829.0, + "step": 172890 + }, + { + "entropy": 1.8113704428076745, + "epoch": 0.5359748721092347, + "grad_norm": 6.826096534729004, + "learning_rate": 3.4555861189583244e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8639003306627273, + "num_tokens": 208052850.0, + "step": 172900 + }, + { + "entropy": 1.8847822308540345, + "epoch": 0.5360058712342844, + "grad_norm": 8.075764656066895, + "learning_rate": 3.4554861925206344e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8611627250909806, + "num_tokens": 208065092.0, + "step": 172910 + }, + { + "entropy": 1.9323305204510688, + "epoch": 0.5360368703593341, + "grad_norm": 7.556993007659912, + "learning_rate": 3.4553862747512707e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8483913406729698, + "num_tokens": 208076555.0, + "step": 172920 + }, + { + "entropy": 1.8728545591235162, + "epoch": 0.5360678694843838, + "grad_norm": 7.655262470245361, + "learning_rate": 3.4552863656489795e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8667805373668671, + "num_tokens": 208087801.0, + "step": 172930 + }, + { + "entropy": 1.9701668232679368, + "epoch": 0.5360988686094335, + "grad_norm": 7.848801136016846, + "learning_rate": 3.4551864652125078e-06, + "loss": 0.5238, + "mean_token_accuracy": 0.8293770626187325, + "num_tokens": 208098926.0, + "step": 172940 + }, + { + "entropy": 1.909597432613373, + "epoch": 0.5361298677344832, + "grad_norm": 9.201456069946289, + "learning_rate": 3.4550865734406037e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8472042381763458, + "num_tokens": 208110534.0, + "step": 172950 + }, + { + "entropy": 1.8972481831908226, + "epoch": 0.5361608668595329, + "grad_norm": 8.038980484008789, + "learning_rate": 3.4549866903320134e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8507405325770379, + "num_tokens": 208122410.0, + "step": 172960 + }, + { + "entropy": 1.9153016999363899, + "epoch": 0.5361918659845826, + "grad_norm": 9.517046928405762, + "learning_rate": 3.4548868158854864e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8530256912112236, + "num_tokens": 208133695.0, + "step": 172970 + }, + { + "entropy": 1.883463017642498, + "epoch": 0.5362228651096322, + "grad_norm": 8.928050994873047, + "learning_rate": 3.4547869500997693e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8534481927752495, + "num_tokens": 208146386.0, + "step": 172980 + }, + { + "entropy": 1.8348631024360658, + "epoch": 0.536253864234682, + "grad_norm": 6.866858959197998, + "learning_rate": 3.4546870929736113e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8672243788838386, + "num_tokens": 208159187.0, + "step": 172990 + }, + { + "entropy": 1.9339495077729225, + "epoch": 0.5362848633597317, + "grad_norm": 8.929131507873535, + "learning_rate": 3.4545872445057615e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8579076424241066, + "num_tokens": 208170970.0, + "step": 173000 + }, + { + "entropy": 1.9648241117596625, + "epoch": 0.5363158624847814, + "grad_norm": 9.15043830871582, + "learning_rate": 3.4544874046949674e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8426274269819259, + "num_tokens": 208182432.0, + "step": 173010 + }, + { + "entropy": 1.8830683097243308, + "epoch": 0.536346861609831, + "grad_norm": 8.971197128295898, + "learning_rate": 3.454387573539979e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8599732339382171, + "num_tokens": 208194473.0, + "step": 173020 + }, + { + "entropy": 1.9190237015485763, + "epoch": 0.5363778607348808, + "grad_norm": 8.68978500366211, + "learning_rate": 3.4542877510395453e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8580868989229202, + "num_tokens": 208206024.0, + "step": 173030 + }, + { + "entropy": 1.8665350019931792, + "epoch": 0.5364088598599305, + "grad_norm": 8.050957679748535, + "learning_rate": 3.4541879371924155e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8516592651605606, + "num_tokens": 208218803.0, + "step": 173040 + }, + { + "entropy": 1.9037343636155128, + "epoch": 0.5364398589849801, + "grad_norm": 8.649856567382812, + "learning_rate": 3.4540881319973406e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8579539388418198, + "num_tokens": 208230525.0, + "step": 173050 + }, + { + "entropy": 1.9306460633873939, + "epoch": 0.5364708581100298, + "grad_norm": 8.644009590148926, + "learning_rate": 3.4539883354530695e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8506815403699874, + "num_tokens": 208242480.0, + "step": 173060 + }, + { + "entropy": 1.8853941515088082, + "epoch": 0.5365018572350796, + "grad_norm": 9.751359939575195, + "learning_rate": 3.4538885475583533e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8584751397371292, + "num_tokens": 208253902.0, + "step": 173070 + }, + { + "entropy": 1.9570772036910058, + "epoch": 0.5365328563601293, + "grad_norm": 8.534307479858398, + "learning_rate": 3.453788768311943e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8538262590765953, + "num_tokens": 208265528.0, + "step": 173080 + }, + { + "entropy": 1.9516688704490661, + "epoch": 0.5365638554851789, + "grad_norm": 10.632366180419922, + "learning_rate": 3.4536889977125888e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8635988682508469, + "num_tokens": 208277567.0, + "step": 173090 + }, + { + "entropy": 1.8980008989572525, + "epoch": 0.5365948546102286, + "grad_norm": 9.882356643676758, + "learning_rate": 3.4535892357590418e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8611980080604553, + "num_tokens": 208289458.0, + "step": 173100 + }, + { + "entropy": 1.8366466403007506, + "epoch": 0.5366258537352783, + "grad_norm": 8.65365219116211, + "learning_rate": 3.453489482450053e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8533262208104133, + "num_tokens": 208302479.0, + "step": 173110 + }, + { + "entropy": 1.9177147299051285, + "epoch": 0.536656852860328, + "grad_norm": 7.513286590576172, + "learning_rate": 3.453389737784375e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8542504772543907, + "num_tokens": 208314078.0, + "step": 173120 + }, + { + "entropy": 1.9224371239542961, + "epoch": 0.5366878519853777, + "grad_norm": 6.784999370574951, + "learning_rate": 3.453290001760759e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8514690682291984, + "num_tokens": 208325860.0, + "step": 173130 + }, + { + "entropy": 1.9379079461097717, + "epoch": 0.5367188511104274, + "grad_norm": 7.635251522064209, + "learning_rate": 3.453190274377958e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8511418506503106, + "num_tokens": 208337761.0, + "step": 173140 + }, + { + "entropy": 1.9702737927436829, + "epoch": 0.536749850235477, + "grad_norm": 8.940947532653809, + "learning_rate": 3.4530905556347235e-06, + "loss": 0.5274, + "mean_token_accuracy": 0.842144227027893, + "num_tokens": 208348546.0, + "step": 173150 + }, + { + "entropy": 1.8190948873758317, + "epoch": 0.5367808493605268, + "grad_norm": 8.213035583496094, + "learning_rate": 3.4529908455298076e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8630830571055412, + "num_tokens": 208361404.0, + "step": 173160 + }, + { + "entropy": 1.8803176492452622, + "epoch": 0.5368118484855765, + "grad_norm": 8.948467254638672, + "learning_rate": 3.452891144061965e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8477946504950523, + "num_tokens": 208373438.0, + "step": 173170 + }, + { + "entropy": 1.8883336156606674, + "epoch": 0.5368428476106262, + "grad_norm": 6.753879547119141, + "learning_rate": 3.4527914512299472e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8684131637215614, + "num_tokens": 208385340.0, + "step": 173180 + }, + { + "entropy": 1.890251599252224, + "epoch": 0.5368738467356758, + "grad_norm": 3.4004273414611816, + "learning_rate": 3.452691767032508e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8704697549343109, + "num_tokens": 208397816.0, + "step": 173190 + }, + { + "entropy": 1.8218652233481407, + "epoch": 0.5369048458607256, + "grad_norm": 3.7632715702056885, + "learning_rate": 3.452592091468402e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8653622597455979, + "num_tokens": 208411104.0, + "step": 173200 + }, + { + "entropy": 1.9061686143279075, + "epoch": 0.5369358449857753, + "grad_norm": 9.365750312805176, + "learning_rate": 3.4524924245363815e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8505714625120163, + "num_tokens": 208422977.0, + "step": 173210 + }, + { + "entropy": 1.855700920522213, + "epoch": 0.536966844110825, + "grad_norm": 9.142428398132324, + "learning_rate": 3.4523927662352024e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8599683746695519, + "num_tokens": 208435820.0, + "step": 173220 + }, + { + "entropy": 1.9547921270132065, + "epoch": 0.5369978432358746, + "grad_norm": 9.191194534301758, + "learning_rate": 3.4522931165636174e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8558192774653435, + "num_tokens": 208446473.0, + "step": 173230 + }, + { + "entropy": 1.9289256483316422, + "epoch": 0.5370288423609244, + "grad_norm": 8.622015953063965, + "learning_rate": 3.4521934755203822e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.8463401675224305, + "num_tokens": 208458688.0, + "step": 173240 + }, + { + "entropy": 1.8982299000024796, + "epoch": 0.5370598414859741, + "grad_norm": 3.722247838973999, + "learning_rate": 3.4520938431042513e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8519804239273071, + "num_tokens": 208470509.0, + "step": 173250 + }, + { + "entropy": 1.9269887924194335, + "epoch": 0.5370908406110237, + "grad_norm": 7.334140300750732, + "learning_rate": 3.4519942193139803e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8573471799492836, + "num_tokens": 208481720.0, + "step": 173260 + }, + { + "entropy": 1.8478943184018135, + "epoch": 0.5371218397360734, + "grad_norm": 8.863199234008789, + "learning_rate": 3.451894604148324e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8715490698814392, + "num_tokens": 208493429.0, + "step": 173270 + }, + { + "entropy": 1.916895118355751, + "epoch": 0.5371528388611232, + "grad_norm": 8.353137016296387, + "learning_rate": 3.451794997606039e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8541607096791267, + "num_tokens": 208504592.0, + "step": 173280 + }, + { + "entropy": 1.906629091501236, + "epoch": 0.5371838379861729, + "grad_norm": 6.4494428634643555, + "learning_rate": 3.4516953996858797e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8636928036808967, + "num_tokens": 208517286.0, + "step": 173290 + }, + { + "entropy": 1.909351000189781, + "epoch": 0.5372148371112225, + "grad_norm": 8.884832382202148, + "learning_rate": 3.451595810386603e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8635331153869629, + "num_tokens": 208528781.0, + "step": 173300 + }, + { + "entropy": 1.9376345589756965, + "epoch": 0.5372458362362722, + "grad_norm": 8.363286972045898, + "learning_rate": 3.4514962297069664e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.842518937587738, + "num_tokens": 208540735.0, + "step": 173310 + }, + { + "entropy": 1.8835818395018578, + "epoch": 0.537276835361322, + "grad_norm": 9.146308898925781, + "learning_rate": 3.451396657645725e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8543020248413086, + "num_tokens": 208553020.0, + "step": 173320 + }, + { + "entropy": 1.9691929280757905, + "epoch": 0.5373078344863716, + "grad_norm": 7.7050275802612305, + "learning_rate": 3.451297094201636e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.844947075843811, + "num_tokens": 208564228.0, + "step": 173330 + }, + { + "entropy": 1.8853344082832337, + "epoch": 0.5373388336114213, + "grad_norm": 8.397521018981934, + "learning_rate": 3.4511975393734574e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8540289342403412, + "num_tokens": 208577038.0, + "step": 173340 + }, + { + "entropy": 1.8994945645332337, + "epoch": 0.537369832736471, + "grad_norm": 8.379923820495605, + "learning_rate": 3.4510979931599466e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8555789023637772, + "num_tokens": 208589925.0, + "step": 173350 + }, + { + "entropy": 1.892551527917385, + "epoch": 0.5374008318615207, + "grad_norm": 3.935771942138672, + "learning_rate": 3.4509984555598596e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8619065880775452, + "num_tokens": 208602194.0, + "step": 173360 + }, + { + "entropy": 1.8687604144215584, + "epoch": 0.5374318309865704, + "grad_norm": 9.803389549255371, + "learning_rate": 3.450898926571956e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8642571851611137, + "num_tokens": 208615134.0, + "step": 173370 + }, + { + "entropy": 1.8904517486691474, + "epoch": 0.5374628301116201, + "grad_norm": 8.909818649291992, + "learning_rate": 3.450799406194994e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8571186706423759, + "num_tokens": 208627116.0, + "step": 173380 + }, + { + "entropy": 1.869077640771866, + "epoch": 0.5374938292366698, + "grad_norm": 7.4785284996032715, + "learning_rate": 3.4506998944277306e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8658688947558403, + "num_tokens": 208639356.0, + "step": 173390 + }, + { + "entropy": 1.9270390465855598, + "epoch": 0.5375248283617194, + "grad_norm": 7.729918956756592, + "learning_rate": 3.4506003912689252e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8558061882853508, + "num_tokens": 208651327.0, + "step": 173400 + }, + { + "entropy": 1.9015106394886971, + "epoch": 0.5375558274867692, + "grad_norm": 4.28214168548584, + "learning_rate": 3.450500896717338e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.860819086432457, + "num_tokens": 208664302.0, + "step": 173410 + }, + { + "entropy": 1.8902736604213715, + "epoch": 0.5375868266118189, + "grad_norm": 7.470119953155518, + "learning_rate": 3.4504014107717265e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8502381905913353, + "num_tokens": 208675327.0, + "step": 173420 + }, + { + "entropy": 1.895962157845497, + "epoch": 0.5376178257368686, + "grad_norm": 8.215499877929688, + "learning_rate": 3.45030193343085e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.850038780272007, + "num_tokens": 208687504.0, + "step": 173430 + }, + { + "entropy": 1.9525114193558692, + "epoch": 0.5376488248619182, + "grad_norm": 9.476152420043945, + "learning_rate": 3.450202464693469e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.847514845430851, + "num_tokens": 208698463.0, + "step": 173440 + }, + { + "entropy": 1.8926068410277366, + "epoch": 0.537679823986968, + "grad_norm": 7.753235816955566, + "learning_rate": 3.450103004558344e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.858433011174202, + "num_tokens": 208709860.0, + "step": 173450 + }, + { + "entropy": 1.874835228919983, + "epoch": 0.5377108231120177, + "grad_norm": 8.78343677520752, + "learning_rate": 3.4500035530242337e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8578429788351059, + "num_tokens": 208722252.0, + "step": 173460 + }, + { + "entropy": 1.8854070708155632, + "epoch": 0.5377418222370673, + "grad_norm": 9.277860641479492, + "learning_rate": 3.449904110089899e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8456300452351571, + "num_tokens": 208734913.0, + "step": 173470 + }, + { + "entropy": 1.8384581625461578, + "epoch": 0.537772821362117, + "grad_norm": 7.878200054168701, + "learning_rate": 3.4498046757541017e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8508857935667038, + "num_tokens": 208747257.0, + "step": 173480 + }, + { + "entropy": 1.9341495603322982, + "epoch": 0.5378038204871668, + "grad_norm": 8.864315032958984, + "learning_rate": 3.449705250015601e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8379593536257743, + "num_tokens": 208758938.0, + "step": 173490 + }, + { + "entropy": 1.9576148480176925, + "epoch": 0.5378348196122165, + "grad_norm": 8.191580772399902, + "learning_rate": 3.449605832873159e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8521469473838806, + "num_tokens": 208770439.0, + "step": 173500 + }, + { + "entropy": 1.8926663532853127, + "epoch": 0.5378658187372661, + "grad_norm": 7.8449387550354, + "learning_rate": 3.449506424325537e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8681238234043122, + "num_tokens": 208782915.0, + "step": 173510 + }, + { + "entropy": 1.8926350608468057, + "epoch": 0.5378968178623158, + "grad_norm": 8.02475643157959, + "learning_rate": 3.4494070243714972e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8605057239532471, + "num_tokens": 208795021.0, + "step": 173520 + }, + { + "entropy": 1.9649064928293227, + "epoch": 0.5379278169873656, + "grad_norm": 8.462267875671387, + "learning_rate": 3.449307633009801e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8515180319547653, + "num_tokens": 208805781.0, + "step": 173530 + }, + { + "entropy": 1.8660337910056115, + "epoch": 0.5379588161124153, + "grad_norm": 6.343677520751953, + "learning_rate": 3.4492082502392097e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8579024732112884, + "num_tokens": 208818869.0, + "step": 173540 + }, + { + "entropy": 1.9495095014572144, + "epoch": 0.5379898152374649, + "grad_norm": 7.972060203552246, + "learning_rate": 3.449108876058487e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.851052550971508, + "num_tokens": 208829863.0, + "step": 173550 + }, + { + "entropy": 1.8896791711449623, + "epoch": 0.5380208143625146, + "grad_norm": 8.474870681762695, + "learning_rate": 3.449009510466395e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8639056518673897, + "num_tokens": 208841639.0, + "step": 173560 + }, + { + "entropy": 1.8332088232040404, + "epoch": 0.5380518134875644, + "grad_norm": 3.642097234725952, + "learning_rate": 3.4489101534616973e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.871310307085514, + "num_tokens": 208854694.0, + "step": 173570 + }, + { + "entropy": 1.9200448259711265, + "epoch": 0.538082812612614, + "grad_norm": 8.212307929992676, + "learning_rate": 3.448810805043157e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8595086053013802, + "num_tokens": 208866628.0, + "step": 173580 + }, + { + "entropy": 1.8766262441873551, + "epoch": 0.5381138117376637, + "grad_norm": 4.330519199371338, + "learning_rate": 3.448711465209536e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8583259165287018, + "num_tokens": 208879177.0, + "step": 173590 + }, + { + "entropy": 1.9223135873675345, + "epoch": 0.5381448108627134, + "grad_norm": 7.595412254333496, + "learning_rate": 3.448612133959599e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8525502845644951, + "num_tokens": 208890739.0, + "step": 173600 + }, + { + "entropy": 1.8765450350940227, + "epoch": 0.538175809987763, + "grad_norm": 9.189422607421875, + "learning_rate": 3.448512811292111e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8525310665369034, + "num_tokens": 208903519.0, + "step": 173610 + }, + { + "entropy": 1.9034860491752625, + "epoch": 0.5382068091128128, + "grad_norm": 5.840885639190674, + "learning_rate": 3.448413497205834e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.856965248286724, + "num_tokens": 208914822.0, + "step": 173620 + }, + { + "entropy": 1.9403439238667488, + "epoch": 0.5382378082378625, + "grad_norm": 8.857246398925781, + "learning_rate": 3.448314191699534e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.845775356888771, + "num_tokens": 208926069.0, + "step": 173630 + }, + { + "entropy": 1.8852522909641265, + "epoch": 0.5382688073629122, + "grad_norm": 8.122220039367676, + "learning_rate": 3.448214894771975e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.850862056016922, + "num_tokens": 208938482.0, + "step": 173640 + }, + { + "entropy": 1.838669629395008, + "epoch": 0.5382998064879618, + "grad_norm": 8.469392776489258, + "learning_rate": 3.4481156064219225e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8549366906285286, + "num_tokens": 208951343.0, + "step": 173650 + }, + { + "entropy": 1.8912505745887755, + "epoch": 0.5383308056130116, + "grad_norm": 7.779234409332275, + "learning_rate": 3.4480163266481407e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8542173370718956, + "num_tokens": 208962657.0, + "step": 173660 + }, + { + "entropy": 1.836891622841358, + "epoch": 0.5383618047380613, + "grad_norm": 7.45448637008667, + "learning_rate": 3.447917055449396e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8678934350609779, + "num_tokens": 208975278.0, + "step": 173670 + }, + { + "entropy": 1.899190789461136, + "epoch": 0.538392803863111, + "grad_norm": 8.006275177001953, + "learning_rate": 3.447817792824453e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.856468240916729, + "num_tokens": 208986773.0, + "step": 173680 + }, + { + "entropy": 1.8860803157091142, + "epoch": 0.5384238029881606, + "grad_norm": 8.964807510375977, + "learning_rate": 3.4477185387720796e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8659941121935845, + "num_tokens": 208998891.0, + "step": 173690 + }, + { + "entropy": 1.9246498376131058, + "epoch": 0.5384548021132104, + "grad_norm": 8.570847511291504, + "learning_rate": 3.447619293291039e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8461833164095879, + "num_tokens": 209009880.0, + "step": 173700 + }, + { + "entropy": 1.9233383148908616, + "epoch": 0.5384858012382601, + "grad_norm": 7.748110294342041, + "learning_rate": 3.4475200563801005e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8469515576958656, + "num_tokens": 209021551.0, + "step": 173710 + }, + { + "entropy": 1.9223771378397942, + "epoch": 0.5385168003633097, + "grad_norm": 4.379847526550293, + "learning_rate": 3.4474208280380296e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8564412742853165, + "num_tokens": 209033109.0, + "step": 173720 + }, + { + "entropy": 1.8591162428259849, + "epoch": 0.5385477994883594, + "grad_norm": 8.297298431396484, + "learning_rate": 3.447321608263592e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8554350823163986, + "num_tokens": 209045664.0, + "step": 173730 + }, + { + "entropy": 1.947766014933586, + "epoch": 0.5385787986134092, + "grad_norm": 9.672722816467285, + "learning_rate": 3.447222397055556e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8485858336091041, + "num_tokens": 209056805.0, + "step": 173740 + }, + { + "entropy": 1.9055955439805985, + "epoch": 0.5386097977384589, + "grad_norm": 6.304044723510742, + "learning_rate": 3.4471231944126893e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8486945450305938, + "num_tokens": 209068171.0, + "step": 173750 + }, + { + "entropy": 1.845375980436802, + "epoch": 0.5386407968635085, + "grad_norm": 9.359332084655762, + "learning_rate": 3.447024000333759e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8463126629590988, + "num_tokens": 209080990.0, + "step": 173760 + }, + { + "entropy": 1.9070203810930253, + "epoch": 0.5386717959885582, + "grad_norm": 8.701565742492676, + "learning_rate": 3.4469248148175338e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8570631608366966, + "num_tokens": 209092750.0, + "step": 173770 + }, + { + "entropy": 1.9061894118785858, + "epoch": 0.538702795113608, + "grad_norm": 3.7396252155303955, + "learning_rate": 3.44682563786278e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8583489716053009, + "num_tokens": 209104565.0, + "step": 173780 + }, + { + "entropy": 1.8039513304829597, + "epoch": 0.5387337942386576, + "grad_norm": 3.53255558013916, + "learning_rate": 3.4467264694682684e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8604403004050255, + "num_tokens": 209117953.0, + "step": 173790 + }, + { + "entropy": 1.9707183718681336, + "epoch": 0.5387647933637073, + "grad_norm": 6.6833109855651855, + "learning_rate": 3.446627309632766e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.851428984105587, + "num_tokens": 209129538.0, + "step": 173800 + }, + { + "entropy": 1.8999001562595368, + "epoch": 0.538795792488757, + "grad_norm": 7.964756965637207, + "learning_rate": 3.446528158355042e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8490406572818756, + "num_tokens": 209141533.0, + "step": 173810 + }, + { + "entropy": 1.9331061124801636, + "epoch": 0.5388267916138068, + "grad_norm": 9.25655460357666, + "learning_rate": 3.4464290156338653e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8557355120778084, + "num_tokens": 209153672.0, + "step": 173820 + }, + { + "entropy": 1.904996284842491, + "epoch": 0.5388577907388564, + "grad_norm": 8.72864818572998, + "learning_rate": 3.4463298814680063e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8544986084103584, + "num_tokens": 209166617.0, + "step": 173830 + }, + { + "entropy": 1.8626493886113167, + "epoch": 0.5388887898639061, + "grad_norm": 8.355567932128906, + "learning_rate": 3.4462307558562334e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8615503028035164, + "num_tokens": 209178685.0, + "step": 173840 + }, + { + "entropy": 1.87907382696867, + "epoch": 0.5389197889889558, + "grad_norm": 7.804126262664795, + "learning_rate": 3.4461316387973177e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8576026365160943, + "num_tokens": 209191040.0, + "step": 173850 + }, + { + "entropy": 1.903736950457096, + "epoch": 0.5389507881140054, + "grad_norm": 8.96075439453125, + "learning_rate": 3.446032530290028e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8592004761099815, + "num_tokens": 209203023.0, + "step": 173860 + }, + { + "entropy": 1.850924776494503, + "epoch": 0.5389817872390552, + "grad_norm": 7.536379337310791, + "learning_rate": 3.4459334303331358e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8623146697878837, + "num_tokens": 209214957.0, + "step": 173870 + }, + { + "entropy": 1.7714354708790778, + "epoch": 0.5390127863641049, + "grad_norm": 3.7824769020080566, + "learning_rate": 3.445834338925412e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8654832825064659, + "num_tokens": 209227593.0, + "step": 173880 + }, + { + "entropy": 1.8509420529007912, + "epoch": 0.5390437854891545, + "grad_norm": 8.168303489685059, + "learning_rate": 3.4457352560656255e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8622468948364258, + "num_tokens": 209240043.0, + "step": 173890 + }, + { + "entropy": 1.8068629071116447, + "epoch": 0.5390747846142042, + "grad_norm": 2.740614652633667, + "learning_rate": 3.4456361817525485e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8759689271450043, + "num_tokens": 209252184.0, + "step": 173900 + }, + { + "entropy": 1.8972719386219978, + "epoch": 0.539105783739254, + "grad_norm": 4.083326816558838, + "learning_rate": 3.4455371159849536e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8535422086715698, + "num_tokens": 209264573.0, + "step": 173910 + }, + { + "entropy": 1.8553459897637368, + "epoch": 0.5391367828643037, + "grad_norm": 7.37409782409668, + "learning_rate": 3.445438058761611e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8643639668822288, + "num_tokens": 209276436.0, + "step": 173920 + }, + { + "entropy": 1.8629083037376404, + "epoch": 0.5391677819893533, + "grad_norm": 11.203222274780273, + "learning_rate": 3.4453390100812933e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8535048425197601, + "num_tokens": 209288296.0, + "step": 173930 + }, + { + "entropy": 1.888429582118988, + "epoch": 0.539198781114403, + "grad_norm": 9.21101188659668, + "learning_rate": 3.4452399699427715e-06, + "loss": 0.417, + "mean_token_accuracy": 0.850302429497242, + "num_tokens": 209300674.0, + "step": 173940 + }, + { + "entropy": 1.865077406167984, + "epoch": 0.5392297802394528, + "grad_norm": 6.1955485343933105, + "learning_rate": 3.4451409383448185e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8566244795918465, + "num_tokens": 209312498.0, + "step": 173950 + }, + { + "entropy": 1.835512214899063, + "epoch": 0.5392607793645025, + "grad_norm": 3.3951921463012695, + "learning_rate": 3.4450419152862075e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8619059652090073, + "num_tokens": 209324850.0, + "step": 173960 + }, + { + "entropy": 1.9534437566995622, + "epoch": 0.5392917784895521, + "grad_norm": 7.149500846862793, + "learning_rate": 3.444942900765711e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8548975363373756, + "num_tokens": 209335333.0, + "step": 173970 + }, + { + "entropy": 1.8894205838441849, + "epoch": 0.5393227776146018, + "grad_norm": 3.9499619007110596, + "learning_rate": 3.4448438947821017e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8543545797467231, + "num_tokens": 209347592.0, + "step": 173980 + }, + { + "entropy": 1.9478689044713975, + "epoch": 0.5393537767396516, + "grad_norm": 8.097626686096191, + "learning_rate": 3.4447448973341536e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8446411550045013, + "num_tokens": 209358862.0, + "step": 173990 + }, + { + "entropy": 1.8606567561626435, + "epoch": 0.5393847758647012, + "grad_norm": 7.8725457191467285, + "learning_rate": 3.4446459084206392e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8621124207973481, + "num_tokens": 209370738.0, + "step": 174000 + }, + { + "entropy": 1.9500312462449074, + "epoch": 0.5394157749897509, + "grad_norm": 7.8665289878845215, + "learning_rate": 3.4445469280403334e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8539640560746193, + "num_tokens": 209382325.0, + "step": 174010 + }, + { + "entropy": 1.8706529840826989, + "epoch": 0.5394467741148006, + "grad_norm": 4.0839457511901855, + "learning_rate": 3.4444479561920104e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.853885892033577, + "num_tokens": 209394402.0, + "step": 174020 + }, + { + "entropy": 1.8229328334331512, + "epoch": 0.5394777732398504, + "grad_norm": 2.62138032913208, + "learning_rate": 3.4443489928744434e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8603856071829796, + "num_tokens": 209407115.0, + "step": 174030 + }, + { + "entropy": 1.9071577087044715, + "epoch": 0.5395087723649, + "grad_norm": 7.644775390625, + "learning_rate": 3.444250038086408e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8491267457604408, + "num_tokens": 209418495.0, + "step": 174040 + }, + { + "entropy": 1.8522107481956482, + "epoch": 0.5395397714899497, + "grad_norm": 8.107036590576172, + "learning_rate": 3.444151091826679e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.849695211648941, + "num_tokens": 209430988.0, + "step": 174050 + }, + { + "entropy": 1.891443131864071, + "epoch": 0.5395707706149994, + "grad_norm": 10.422231674194336, + "learning_rate": 3.444052154094031e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8538878068327904, + "num_tokens": 209442507.0, + "step": 174060 + }, + { + "entropy": 1.8737448453903198, + "epoch": 0.5396017697400491, + "grad_norm": 7.765069961547852, + "learning_rate": 3.4439532248872388e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8545178517699241, + "num_tokens": 209454958.0, + "step": 174070 + }, + { + "entropy": 1.9368214011192322, + "epoch": 0.5396327688650988, + "grad_norm": 7.202656269073486, + "learning_rate": 3.4438543042050785e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8466168284416199, + "num_tokens": 209466512.0, + "step": 174080 + }, + { + "entropy": 1.8895224526524543, + "epoch": 0.5396637679901485, + "grad_norm": 3.6564581394195557, + "learning_rate": 3.4437553920463267e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8483851253986359, + "num_tokens": 209479335.0, + "step": 174090 + }, + { + "entropy": 1.7815810799598695, + "epoch": 0.5396947671151981, + "grad_norm": 3.281292676925659, + "learning_rate": 3.443656488409759e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8650498852133751, + "num_tokens": 209493538.0, + "step": 174100 + }, + { + "entropy": 1.9042472764849663, + "epoch": 0.5397257662402478, + "grad_norm": 6.953648567199707, + "learning_rate": 3.443557593294151e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8552184790372849, + "num_tokens": 209504818.0, + "step": 174110 + }, + { + "entropy": 1.8446774929761887, + "epoch": 0.5397567653652976, + "grad_norm": 7.378088474273682, + "learning_rate": 3.443458706698279e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8741657555103302, + "num_tokens": 209517016.0, + "step": 174120 + }, + { + "entropy": 1.8744469970464706, + "epoch": 0.5397877644903473, + "grad_norm": 8.403420448303223, + "learning_rate": 3.443359828620922e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8557930126786232, + "num_tokens": 209529296.0, + "step": 174130 + }, + { + "entropy": 1.8760189965367318, + "epoch": 0.5398187636153969, + "grad_norm": 7.711986064910889, + "learning_rate": 3.4432609590608547e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8661058858036995, + "num_tokens": 209541437.0, + "step": 174140 + }, + { + "entropy": 1.8844467535614968, + "epoch": 0.5398497627404466, + "grad_norm": 7.68595552444458, + "learning_rate": 3.443162098016855e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8724717631936073, + "num_tokens": 209552945.0, + "step": 174150 + }, + { + "entropy": 1.9028114691376685, + "epoch": 0.5398807618654964, + "grad_norm": 8.077682495117188, + "learning_rate": 3.443063245487701e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8684661194682122, + "num_tokens": 209563980.0, + "step": 174160 + }, + { + "entropy": 1.904925413429737, + "epoch": 0.539911760990546, + "grad_norm": 4.101963520050049, + "learning_rate": 3.44296440147217e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8549754232168197, + "num_tokens": 209575474.0, + "step": 174170 + }, + { + "entropy": 1.8629205331206322, + "epoch": 0.5399427601155957, + "grad_norm": 9.179095268249512, + "learning_rate": 3.4428655659690396e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8535078182816506, + "num_tokens": 209587292.0, + "step": 174180 + }, + { + "entropy": 1.9449424147605896, + "epoch": 0.5399737592406454, + "grad_norm": 8.293116569519043, + "learning_rate": 3.4427667389770895e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8479105710983277, + "num_tokens": 209598002.0, + "step": 174190 + }, + { + "entropy": 1.9391576394438743, + "epoch": 0.5400047583656952, + "grad_norm": 6.534260272979736, + "learning_rate": 3.442667920495097e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8563640967011452, + "num_tokens": 209609592.0, + "step": 174200 + }, + { + "entropy": 1.8575612157583237, + "epoch": 0.5400357574907448, + "grad_norm": 7.172982692718506, + "learning_rate": 3.4425691105218407e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8640211433172226, + "num_tokens": 209621671.0, + "step": 174210 + }, + { + "entropy": 1.8971868574619293, + "epoch": 0.5400667566157945, + "grad_norm": 7.444050312042236, + "learning_rate": 3.4424703090561005e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8565149545669556, + "num_tokens": 209634034.0, + "step": 174220 + }, + { + "entropy": 1.8702701404690742, + "epoch": 0.5400977557408442, + "grad_norm": 8.514378547668457, + "learning_rate": 3.442371516096655e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8600099131464958, + "num_tokens": 209646181.0, + "step": 174230 + }, + { + "entropy": 1.9420946687459946, + "epoch": 0.540128754865894, + "grad_norm": 7.077751159667969, + "learning_rate": 3.4422727316422843e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8432377234101296, + "num_tokens": 209658564.0, + "step": 174240 + }, + { + "entropy": 1.719575546681881, + "epoch": 0.5401597539909436, + "grad_norm": 3.8743057250976562, + "learning_rate": 3.442173955691767e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8714294508099556, + "num_tokens": 209673328.0, + "step": 174250 + }, + { + "entropy": 1.8225284963846207, + "epoch": 0.5401907531159933, + "grad_norm": 3.381141424179077, + "learning_rate": 3.442075188243884e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8631697997450829, + "num_tokens": 209686284.0, + "step": 174260 + }, + { + "entropy": 1.8965171083807946, + "epoch": 0.540221752241043, + "grad_norm": 7.4435529708862305, + "learning_rate": 3.4419764292974155e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8570987716317177, + "num_tokens": 209697671.0, + "step": 174270 + }, + { + "entropy": 1.9002371713519097, + "epoch": 0.5402527513660927, + "grad_norm": 7.851629734039307, + "learning_rate": 3.4418776788511416e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8583559781312943, + "num_tokens": 209709310.0, + "step": 174280 + }, + { + "entropy": 1.9418524980545044, + "epoch": 0.5402837504911424, + "grad_norm": 9.440665245056152, + "learning_rate": 3.441778936903844e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8463272243738175, + "num_tokens": 209720450.0, + "step": 174290 + }, + { + "entropy": 1.8744873508810997, + "epoch": 0.5403147496161921, + "grad_norm": 8.730061531066895, + "learning_rate": 3.4416802034543018e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8531945258378982, + "num_tokens": 209733006.0, + "step": 174300 + }, + { + "entropy": 1.9549224942922592, + "epoch": 0.5403457487412417, + "grad_norm": 8.060844421386719, + "learning_rate": 3.441581478501298e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8591006144881248, + "num_tokens": 209743610.0, + "step": 174310 + }, + { + "entropy": 1.9375012516975403, + "epoch": 0.5403767478662915, + "grad_norm": 8.074051856994629, + "learning_rate": 3.4414827620436124e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8519991576671601, + "num_tokens": 209755113.0, + "step": 174320 + }, + { + "entropy": 1.908737349510193, + "epoch": 0.5404077469913412, + "grad_norm": 3.8884499073028564, + "learning_rate": 3.4413840540800287e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8561335727572441, + "num_tokens": 209767582.0, + "step": 174330 + }, + { + "entropy": 1.882502131164074, + "epoch": 0.5404387461163909, + "grad_norm": 5.514358997344971, + "learning_rate": 3.441285354609327e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8450815051794052, + "num_tokens": 209780073.0, + "step": 174340 + }, + { + "entropy": 1.936092458665371, + "epoch": 0.5404697452414405, + "grad_norm": 8.400111198425293, + "learning_rate": 3.4411866636302905e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8598876744508743, + "num_tokens": 209791202.0, + "step": 174350 + }, + { + "entropy": 1.789114499092102, + "epoch": 0.5405007443664902, + "grad_norm": 3.711569309234619, + "learning_rate": 3.4410879811417013e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8579269453883172, + "num_tokens": 209804460.0, + "step": 174360 + }, + { + "entropy": 1.90018672645092, + "epoch": 0.54053174349154, + "grad_norm": 3.735109567642212, + "learning_rate": 3.4409893071423422e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8549308374524116, + "num_tokens": 209815948.0, + "step": 174370 + }, + { + "entropy": 1.9148787215352059, + "epoch": 0.5405627426165897, + "grad_norm": 4.6859354972839355, + "learning_rate": 3.440890641630996e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8568407163023949, + "num_tokens": 209828424.0, + "step": 174380 + }, + { + "entropy": 1.9087970286607743, + "epoch": 0.5405937417416393, + "grad_norm": 3.260207414627075, + "learning_rate": 3.440791984606446e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8594276770949364, + "num_tokens": 209839992.0, + "step": 174390 + }, + { + "entropy": 1.892660665512085, + "epoch": 0.540624740866689, + "grad_norm": 8.223847389221191, + "learning_rate": 3.440693336067476e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8555561438202858, + "num_tokens": 209851621.0, + "step": 174400 + }, + { + "entropy": 1.9226069450378418, + "epoch": 0.5406557399917388, + "grad_norm": 9.147815704345703, + "learning_rate": 3.4405946960128685e-06, + "loss": 0.484, + "mean_token_accuracy": 0.838910260796547, + "num_tokens": 209863542.0, + "step": 174410 + }, + { + "entropy": 1.8764439657330514, + "epoch": 0.5406867391167884, + "grad_norm": 10.320169448852539, + "learning_rate": 3.440496064441408e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8501550763845444, + "num_tokens": 209876332.0, + "step": 174420 + }, + { + "entropy": 1.932776327431202, + "epoch": 0.5407177382418381, + "grad_norm": 3.934086561203003, + "learning_rate": 3.4403974413518787e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8536558374762535, + "num_tokens": 209887319.0, + "step": 174430 + }, + { + "entropy": 1.9584178015589715, + "epoch": 0.5407487373668878, + "grad_norm": 9.40331745147705, + "learning_rate": 3.4402988267430653e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8565890833735466, + "num_tokens": 209898463.0, + "step": 174440 + }, + { + "entropy": 1.8962328165769577, + "epoch": 0.5407797364919376, + "grad_norm": 7.782931327819824, + "learning_rate": 3.4402002206137525e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8547718808054924, + "num_tokens": 209910866.0, + "step": 174450 + }, + { + "entropy": 2.017945593595505, + "epoch": 0.5408107356169872, + "grad_norm": 6.7005486488342285, + "learning_rate": 3.440101622962724e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8474322855472565, + "num_tokens": 209921348.0, + "step": 174460 + }, + { + "entropy": 1.9669252157211303, + "epoch": 0.5408417347420369, + "grad_norm": 9.323506355285645, + "learning_rate": 3.440003033788766e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8428177922964096, + "num_tokens": 209932134.0, + "step": 174470 + }, + { + "entropy": 1.8594336844980717, + "epoch": 0.5408727338670866, + "grad_norm": 11.331262588500977, + "learning_rate": 3.4399044530906633e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8466810330748558, + "num_tokens": 209945678.0, + "step": 174480 + }, + { + "entropy": 1.9158946216106414, + "epoch": 0.5409037329921363, + "grad_norm": 8.143959999084473, + "learning_rate": 3.439805880867202e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8595825538039208, + "num_tokens": 209957030.0, + "step": 174490 + }, + { + "entropy": 1.9205025285482407, + "epoch": 0.540934732117186, + "grad_norm": 7.518352031707764, + "learning_rate": 3.439707317117168e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8524655893445015, + "num_tokens": 209968372.0, + "step": 174500 + }, + { + "entropy": 1.8929289489984513, + "epoch": 0.5409657312422357, + "grad_norm": 9.195374488830566, + "learning_rate": 3.439608761839347e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8569901049137115, + "num_tokens": 209980375.0, + "step": 174510 + }, + { + "entropy": 1.8607401132583619, + "epoch": 0.5409967303672854, + "grad_norm": 8.270475387573242, + "learning_rate": 3.439510215032525e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8566593199968338, + "num_tokens": 209992780.0, + "step": 174520 + }, + { + "entropy": 1.9021028637886048, + "epoch": 0.5410277294923351, + "grad_norm": 8.019796371459961, + "learning_rate": 3.4394116766954886e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8580415681004524, + "num_tokens": 210004429.0, + "step": 174530 + }, + { + "entropy": 1.9144439026713371, + "epoch": 0.5410587286173848, + "grad_norm": 9.35071086883545, + "learning_rate": 3.439313146827026e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8612898230552674, + "num_tokens": 210016007.0, + "step": 174540 + }, + { + "entropy": 1.9227073967456818, + "epoch": 0.5410897277424345, + "grad_norm": 8.214776992797852, + "learning_rate": 3.439214625425923e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8489612191915512, + "num_tokens": 210027331.0, + "step": 174550 + }, + { + "entropy": 1.8217739909887314, + "epoch": 0.5411207268674841, + "grad_norm": 4.1948957443237305, + "learning_rate": 3.439116112490967e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8614114180207253, + "num_tokens": 210040443.0, + "step": 174560 + }, + { + "entropy": 1.9101142331957817, + "epoch": 0.5411517259925338, + "grad_norm": 7.208160877227783, + "learning_rate": 3.4390176080209454e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8595498502254486, + "num_tokens": 210052159.0, + "step": 174570 + }, + { + "entropy": 1.8564433738589288, + "epoch": 0.5411827251175836, + "grad_norm": 8.189614295959473, + "learning_rate": 3.4389191120146466e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8517980024218559, + "num_tokens": 210064171.0, + "step": 174580 + }, + { + "entropy": 1.7955839857459068, + "epoch": 0.5412137242426333, + "grad_norm": 9.149805068969727, + "learning_rate": 3.438820624470858e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8595570862293244, + "num_tokens": 210077852.0, + "step": 174590 + }, + { + "entropy": 1.9039988040924072, + "epoch": 0.5412447233676829, + "grad_norm": 8.685386657714844, + "learning_rate": 3.4387221453883684e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8574944108724594, + "num_tokens": 210090031.0, + "step": 174600 + }, + { + "entropy": 1.943969477713108, + "epoch": 0.5412757224927326, + "grad_norm": 9.362236976623535, + "learning_rate": 3.438623674765967e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8577531427145004, + "num_tokens": 210101413.0, + "step": 174610 + }, + { + "entropy": 1.8930824875831604, + "epoch": 0.5413067216177824, + "grad_norm": 3.906763792037964, + "learning_rate": 3.4385252126024405e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8613180920481682, + "num_tokens": 210115101.0, + "step": 174620 + }, + { + "entropy": 1.8995563462376595, + "epoch": 0.541337720742832, + "grad_norm": 6.614351749420166, + "learning_rate": 3.4384267588965796e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.855283860862255, + "num_tokens": 210127312.0, + "step": 174630 + }, + { + "entropy": 1.8280969649553298, + "epoch": 0.5413687198678817, + "grad_norm": 7.801238059997559, + "learning_rate": 3.438328313647173e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8650396257638931, + "num_tokens": 210140420.0, + "step": 174640 + }, + { + "entropy": 1.81494547277689, + "epoch": 0.5413997189929314, + "grad_norm": 3.7451062202453613, + "learning_rate": 3.4382298768530104e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8647580876946449, + "num_tokens": 210153529.0, + "step": 174650 + }, + { + "entropy": 1.7520886182785034, + "epoch": 0.5414307181179812, + "grad_norm": 2.5675580501556396, + "learning_rate": 3.438131448512881e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8710595235228539, + "num_tokens": 210167218.0, + "step": 174660 + }, + { + "entropy": 1.8860586032271385, + "epoch": 0.5414617172430308, + "grad_norm": 3.851811647415161, + "learning_rate": 3.438033028625575e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8687780782580375, + "num_tokens": 210178719.0, + "step": 174670 + }, + { + "entropy": 1.9037866786122322, + "epoch": 0.5414927163680805, + "grad_norm": 8.392757415771484, + "learning_rate": 3.4379346171898826e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8572928667068481, + "num_tokens": 210190726.0, + "step": 174680 + }, + { + "entropy": 1.8577393546700478, + "epoch": 0.5415237154931302, + "grad_norm": 8.268177032470703, + "learning_rate": 3.437836214204595e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8722563162446022, + "num_tokens": 210203752.0, + "step": 174690 + }, + { + "entropy": 1.8598589971661568, + "epoch": 0.5415547146181799, + "grad_norm": 10.199872016906738, + "learning_rate": 3.437737819668502e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.856185057759285, + "num_tokens": 210215977.0, + "step": 174700 + }, + { + "entropy": 1.8862924322485923, + "epoch": 0.5415857137432296, + "grad_norm": 3.774386167526245, + "learning_rate": 3.437639433580395e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.852426928281784, + "num_tokens": 210228793.0, + "step": 174710 + }, + { + "entropy": 1.9758263260126114, + "epoch": 0.5416167128682793, + "grad_norm": 7.43900728225708, + "learning_rate": 3.4375410559390653e-06, + "loss": 0.5577, + "mean_token_accuracy": 0.8290947854518891, + "num_tokens": 210240226.0, + "step": 174720 + }, + { + "entropy": 1.774106466770172, + "epoch": 0.541647711993329, + "grad_norm": 2.82202410697937, + "learning_rate": 3.437442686743304e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8689462915062904, + "num_tokens": 210254170.0, + "step": 174730 + }, + { + "entropy": 1.891635850071907, + "epoch": 0.5416787111183787, + "grad_norm": 7.973023891448975, + "learning_rate": 3.4373443259919025e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.852353821694851, + "num_tokens": 210267073.0, + "step": 174740 + }, + { + "entropy": 1.9281975954771042, + "epoch": 0.5417097102434284, + "grad_norm": 8.363383293151855, + "learning_rate": 3.437245973683653e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8436614751815796, + "num_tokens": 210278959.0, + "step": 174750 + }, + { + "entropy": 1.9063287645578384, + "epoch": 0.5417407093684781, + "grad_norm": 7.292370319366455, + "learning_rate": 3.4371476298173482e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8531043663620949, + "num_tokens": 210290779.0, + "step": 174760 + }, + { + "entropy": 1.8370270490646363, + "epoch": 0.5417717084935277, + "grad_norm": 7.736352443695068, + "learning_rate": 3.43704929439178e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8688179135322571, + "num_tokens": 210303485.0, + "step": 174770 + }, + { + "entropy": 1.930588774383068, + "epoch": 0.5418027076185775, + "grad_norm": 10.174417495727539, + "learning_rate": 3.436950967405741e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8459608346223831, + "num_tokens": 210314659.0, + "step": 174780 + }, + { + "entropy": 1.7679610848426819, + "epoch": 0.5418337067436272, + "grad_norm": 6.358749866485596, + "learning_rate": 3.436852648858024e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8780291527509689, + "num_tokens": 210328185.0, + "step": 174790 + }, + { + "entropy": 1.8495957791805266, + "epoch": 0.5418647058686769, + "grad_norm": 3.853734016418457, + "learning_rate": 3.436754338747422e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8549610197544097, + "num_tokens": 210341316.0, + "step": 174800 + }, + { + "entropy": 1.9119720757007599, + "epoch": 0.5418957049937265, + "grad_norm": 9.63703441619873, + "learning_rate": 3.4366560370727294e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8599133908748626, + "num_tokens": 210353173.0, + "step": 174810 + }, + { + "entropy": 1.8423039808869361, + "epoch": 0.5419267041187762, + "grad_norm": 7.737087726593018, + "learning_rate": 3.436557743832738e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8715093210339546, + "num_tokens": 210365171.0, + "step": 174820 + }, + { + "entropy": 1.936174413561821, + "epoch": 0.541957703243826, + "grad_norm": 9.276434898376465, + "learning_rate": 3.436459459026243e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8483540773391723, + "num_tokens": 210376844.0, + "step": 174830 + }, + { + "entropy": 1.8812786877155303, + "epoch": 0.5419887023688756, + "grad_norm": 8.049873352050781, + "learning_rate": 3.436361182652038e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8652190431952477, + "num_tokens": 210389173.0, + "step": 174840 + }, + { + "entropy": 1.8744924083352088, + "epoch": 0.5420197014939253, + "grad_norm": 7.033712863922119, + "learning_rate": 3.436262914708918e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.868674422800541, + "num_tokens": 210401296.0, + "step": 174850 + }, + { + "entropy": 1.909075213968754, + "epoch": 0.542050700618975, + "grad_norm": 9.055830001831055, + "learning_rate": 3.436164655195677e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.848932571709156, + "num_tokens": 210412989.0, + "step": 174860 + }, + { + "entropy": 1.907054753601551, + "epoch": 0.5420816997440248, + "grad_norm": 8.009981155395508, + "learning_rate": 3.4360664041111097e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.855173397064209, + "num_tokens": 210424168.0, + "step": 174870 + }, + { + "entropy": 1.855896058678627, + "epoch": 0.5421126988690744, + "grad_norm": 4.083925247192383, + "learning_rate": 3.435968161454011e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8597793817520142, + "num_tokens": 210436327.0, + "step": 174880 + }, + { + "entropy": 1.7705978021025657, + "epoch": 0.5421436979941241, + "grad_norm": 3.9541049003601074, + "learning_rate": 3.4358699272231765e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8635337948799133, + "num_tokens": 210449816.0, + "step": 174890 + }, + { + "entropy": 1.8276181071996689, + "epoch": 0.5421746971191738, + "grad_norm": 4.227304458618164, + "learning_rate": 3.435771701417402e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8660426765680314, + "num_tokens": 210462975.0, + "step": 174900 + }, + { + "entropy": 1.7664773181080817, + "epoch": 0.5422056962442235, + "grad_norm": 4.43179988861084, + "learning_rate": 3.435673484035484e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8715872049331665, + "num_tokens": 210477054.0, + "step": 174910 + }, + { + "entropy": 1.901937472820282, + "epoch": 0.5422366953692732, + "grad_norm": 4.020650863647461, + "learning_rate": 3.435575275076216e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8585090190172195, + "num_tokens": 210487847.0, + "step": 174920 + }, + { + "entropy": 1.7410590797662735, + "epoch": 0.5422676944943229, + "grad_norm": 10.307069778442383, + "learning_rate": 3.4354770745383966e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8655442446470261, + "num_tokens": 210502071.0, + "step": 174930 + }, + { + "entropy": 1.9489874988794327, + "epoch": 0.5422986936193726, + "grad_norm": 7.642431259155273, + "learning_rate": 3.4353788824208217e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.846542339026928, + "num_tokens": 210513613.0, + "step": 174940 + }, + { + "entropy": 1.9443811848759651, + "epoch": 0.5423296927444223, + "grad_norm": 6.863345623016357, + "learning_rate": 3.4352806987222875e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8550491377711296, + "num_tokens": 210524857.0, + "step": 174950 + }, + { + "entropy": 1.950060898065567, + "epoch": 0.542360691869472, + "grad_norm": 10.906983375549316, + "learning_rate": 3.435182523441591e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8554666265845299, + "num_tokens": 210535796.0, + "step": 174960 + }, + { + "entropy": 1.7573295474052428, + "epoch": 0.5423916909945217, + "grad_norm": 5.128137111663818, + "learning_rate": 3.4350843565775303e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8658852711319923, + "num_tokens": 210549961.0, + "step": 174970 + }, + { + "entropy": 1.9221460312604903, + "epoch": 0.5424226901195713, + "grad_norm": 8.352112770080566, + "learning_rate": 3.4349861981289017e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8526023477315903, + "num_tokens": 210561178.0, + "step": 174980 + }, + { + "entropy": 1.9498736828565597, + "epoch": 0.5424536892446211, + "grad_norm": 7.754190921783447, + "learning_rate": 3.434888048094504e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8518366366624832, + "num_tokens": 210571601.0, + "step": 174990 + }, + { + "entropy": 1.8772225648164749, + "epoch": 0.5424846883696708, + "grad_norm": 9.029406547546387, + "learning_rate": 3.4347899064731345e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8565710127353668, + "num_tokens": 210583068.0, + "step": 175000 + }, + { + "entropy": 1.8856910184025764, + "epoch": 0.5425156874947205, + "grad_norm": 8.368382453918457, + "learning_rate": 3.4346917732635916e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.853217662870884, + "num_tokens": 210595557.0, + "step": 175010 + }, + { + "entropy": 1.8868588835000992, + "epoch": 0.5425466866197701, + "grad_norm": 8.443853378295898, + "learning_rate": 3.4345936484646737e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.846336479485035, + "num_tokens": 210606720.0, + "step": 175020 + }, + { + "entropy": 1.8998877912759782, + "epoch": 0.5425776857448199, + "grad_norm": 3.720510482788086, + "learning_rate": 3.4344955320751795e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8508727207779885, + "num_tokens": 210618725.0, + "step": 175030 + }, + { + "entropy": 1.8702963769435883, + "epoch": 0.5426086848698696, + "grad_norm": 7.714414596557617, + "learning_rate": 3.4343974240939077e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.855791012942791, + "num_tokens": 210631454.0, + "step": 175040 + }, + { + "entropy": 1.9237717658281326, + "epoch": 0.5426396839949192, + "grad_norm": 7.232386112213135, + "learning_rate": 3.4342993245196576e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8514825507998467, + "num_tokens": 210643027.0, + "step": 175050 + }, + { + "entropy": 1.9704272598028183, + "epoch": 0.5426706831199689, + "grad_norm": 8.11324691772461, + "learning_rate": 3.434201233351228e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8477542325854301, + "num_tokens": 210654106.0, + "step": 175060 + }, + { + "entropy": 1.9354610219597816, + "epoch": 0.5427016822450186, + "grad_norm": 9.615270614624023, + "learning_rate": 3.43410315058742e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.862465412914753, + "num_tokens": 210665825.0, + "step": 175070 + }, + { + "entropy": 1.8135445401072503, + "epoch": 0.5427326813700684, + "grad_norm": 4.195128917694092, + "learning_rate": 3.4340050762270326e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8759765163064003, + "num_tokens": 210678215.0, + "step": 175080 + }, + { + "entropy": 1.8600399553775788, + "epoch": 0.542763680495118, + "grad_norm": 7.729010581970215, + "learning_rate": 3.4339070102688653e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8706375062465668, + "num_tokens": 210690496.0, + "step": 175090 + }, + { + "entropy": 1.84743193089962, + "epoch": 0.5427946796201677, + "grad_norm": 3.6065328121185303, + "learning_rate": 3.433808952711719e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8589259907603264, + "num_tokens": 210703213.0, + "step": 175100 + }, + { + "entropy": 1.8852507174015045, + "epoch": 0.5428256787452174, + "grad_norm": 7.954366683959961, + "learning_rate": 3.433710903554394e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8529264152050018, + "num_tokens": 210715201.0, + "step": 175110 + }, + { + "entropy": 1.8923664793372155, + "epoch": 0.5428566778702671, + "grad_norm": 7.592563152313232, + "learning_rate": 3.433612862795692e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8424620926380157, + "num_tokens": 210727901.0, + "step": 175120 + }, + { + "entropy": 1.9149468883872032, + "epoch": 0.5428876769953168, + "grad_norm": 7.934050559997559, + "learning_rate": 3.4335148304344136e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8597536578774452, + "num_tokens": 210738953.0, + "step": 175130 + }, + { + "entropy": 1.8892448142170906, + "epoch": 0.5429186761203665, + "grad_norm": 8.11552619934082, + "learning_rate": 3.43341680646936e-06, + "loss": 0.42, + "mean_token_accuracy": 0.862630070745945, + "num_tokens": 210751021.0, + "step": 175140 + }, + { + "entropy": 1.958571094274521, + "epoch": 0.5429496752454162, + "grad_norm": 8.172052383422852, + "learning_rate": 3.433318790899332e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8557330414652824, + "num_tokens": 210761979.0, + "step": 175150 + }, + { + "entropy": 1.8148248717188835, + "epoch": 0.5429806743704659, + "grad_norm": 7.902945041656494, + "learning_rate": 3.4332207837231325e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8547403365373611, + "num_tokens": 210775482.0, + "step": 175160 + }, + { + "entropy": 1.862857685983181, + "epoch": 0.5430116734955156, + "grad_norm": 11.147188186645508, + "learning_rate": 3.433122784939563e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8630274266004563, + "num_tokens": 210788441.0, + "step": 175170 + }, + { + "entropy": 1.923531498014927, + "epoch": 0.5430426726205653, + "grad_norm": 9.402071952819824, + "learning_rate": 3.433024794547426e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8527959004044533, + "num_tokens": 210799409.0, + "step": 175180 + }, + { + "entropy": 1.8989013388752938, + "epoch": 0.5430736717456149, + "grad_norm": 8.027140617370605, + "learning_rate": 3.432926812545524e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8585892334580422, + "num_tokens": 210811487.0, + "step": 175190 + }, + { + "entropy": 1.8911652371287346, + "epoch": 0.5431046708706647, + "grad_norm": 8.578186988830566, + "learning_rate": 3.4328288389326593e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8501900911331177, + "num_tokens": 210824056.0, + "step": 175200 + }, + { + "entropy": 1.902529464662075, + "epoch": 0.5431356699957144, + "grad_norm": 7.3062896728515625, + "learning_rate": 3.4327308737076353e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8597478345036507, + "num_tokens": 210836589.0, + "step": 175210 + }, + { + "entropy": 1.9480405256152153, + "epoch": 0.5431666691207641, + "grad_norm": 7.592820644378662, + "learning_rate": 3.4326329168692556e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8596141144633294, + "num_tokens": 210848034.0, + "step": 175220 + }, + { + "entropy": 1.8504156574606896, + "epoch": 0.5431976682458137, + "grad_norm": 8.794576644897461, + "learning_rate": 3.432534968416322e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8582611083984375, + "num_tokens": 210860350.0, + "step": 175230 + }, + { + "entropy": 1.8938454136252403, + "epoch": 0.5432286673708635, + "grad_norm": 8.417749404907227, + "learning_rate": 3.4324370283476405e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8565092936158181, + "num_tokens": 210872269.0, + "step": 175240 + }, + { + "entropy": 1.9709706351161003, + "epoch": 0.5432596664959132, + "grad_norm": 10.069972038269043, + "learning_rate": 3.4323390966620135e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8481551617383957, + "num_tokens": 210883417.0, + "step": 175250 + }, + { + "entropy": 1.9295583993196488, + "epoch": 0.5432906656209628, + "grad_norm": 8.450238227844238, + "learning_rate": 3.4322411733582455e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.840464486181736, + "num_tokens": 210894817.0, + "step": 175260 + }, + { + "entropy": 1.9116766616702079, + "epoch": 0.5433216647460125, + "grad_norm": 8.815253257751465, + "learning_rate": 3.432143258435141e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8442767933011055, + "num_tokens": 210907004.0, + "step": 175270 + }, + { + "entropy": 1.80950618237257, + "epoch": 0.5433526638710623, + "grad_norm": 8.208104133605957, + "learning_rate": 3.432045351891505e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8676191523671151, + "num_tokens": 210920640.0, + "step": 175280 + }, + { + "entropy": 1.8785382106900215, + "epoch": 0.543383662996112, + "grad_norm": 9.787672996520996, + "learning_rate": 3.4319474537261416e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8466727659106255, + "num_tokens": 210933513.0, + "step": 175290 + }, + { + "entropy": 1.9406921476125718, + "epoch": 0.5434146621211616, + "grad_norm": 8.736618995666504, + "learning_rate": 3.4318495639378563e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.857597853243351, + "num_tokens": 210944604.0, + "step": 175300 + }, + { + "entropy": 1.9792003378272056, + "epoch": 0.5434456612462113, + "grad_norm": 9.496161460876465, + "learning_rate": 3.431751682525455e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.8454146191477776, + "num_tokens": 210955527.0, + "step": 175310 + }, + { + "entropy": 1.838035662472248, + "epoch": 0.543476660371261, + "grad_norm": 4.342799186706543, + "learning_rate": 3.431653809487742e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8712288469076157, + "num_tokens": 210967578.0, + "step": 175320 + }, + { + "entropy": 1.8781815335154532, + "epoch": 0.5435076594963107, + "grad_norm": 9.674783706665039, + "learning_rate": 3.4315559448235246e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8497504383325577, + "num_tokens": 210979425.0, + "step": 175330 + }, + { + "entropy": 1.9270706087350846, + "epoch": 0.5435386586213604, + "grad_norm": 8.616156578063965, + "learning_rate": 3.431458088531609e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.852242237329483, + "num_tokens": 210990495.0, + "step": 175340 + }, + { + "entropy": 1.890567271411419, + "epoch": 0.5435696577464101, + "grad_norm": 8.244331359863281, + "learning_rate": 3.4313602406107998e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8536026954650879, + "num_tokens": 211002476.0, + "step": 175350 + }, + { + "entropy": 1.9067375272512437, + "epoch": 0.5436006568714598, + "grad_norm": 3.6741037368774414, + "learning_rate": 3.4312624010599042e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8597292706370354, + "num_tokens": 211013821.0, + "step": 175360 + }, + { + "entropy": 1.8930379897356033, + "epoch": 0.5436316559965095, + "grad_norm": 7.21564245223999, + "learning_rate": 3.43116456987773e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8594876885414123, + "num_tokens": 211025851.0, + "step": 175370 + }, + { + "entropy": 1.823582974076271, + "epoch": 0.5436626551215592, + "grad_norm": 9.629420280456543, + "learning_rate": 3.431066747063083e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8635643497109413, + "num_tokens": 211039218.0, + "step": 175380 + }, + { + "entropy": 1.878513753414154, + "epoch": 0.5436936542466089, + "grad_norm": 8.077526092529297, + "learning_rate": 3.4309689326147717e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8498103663325309, + "num_tokens": 211051346.0, + "step": 175390 + }, + { + "entropy": 1.9916471809148788, + "epoch": 0.5437246533716585, + "grad_norm": 8.912239074707031, + "learning_rate": 3.430871126531603e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8641511112451553, + "num_tokens": 211061719.0, + "step": 175400 + }, + { + "entropy": 1.8105146452784537, + "epoch": 0.5437556524967083, + "grad_norm": 2.607809066772461, + "learning_rate": 3.430773328812384e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.857674777507782, + "num_tokens": 211075875.0, + "step": 175410 + }, + { + "entropy": 1.9355085432529449, + "epoch": 0.543786651621758, + "grad_norm": 7.287834167480469, + "learning_rate": 3.4306755394559235e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8538926064968109, + "num_tokens": 211086848.0, + "step": 175420 + }, + { + "entropy": 1.9138874024152757, + "epoch": 0.5438176507468077, + "grad_norm": 7.868054389953613, + "learning_rate": 3.4305777584610297e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8511651039123536, + "num_tokens": 211098638.0, + "step": 175430 + }, + { + "entropy": 1.8980089619755744, + "epoch": 0.5438486498718573, + "grad_norm": 3.7297019958496094, + "learning_rate": 3.4304799858265103e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8571511924266815, + "num_tokens": 211110170.0, + "step": 175440 + }, + { + "entropy": 1.9528877034783363, + "epoch": 0.5438796489969071, + "grad_norm": 3.9986767768859863, + "learning_rate": 3.430382221551175e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8474042981863021, + "num_tokens": 211121931.0, + "step": 175450 + }, + { + "entropy": 1.9155747339129447, + "epoch": 0.5439106481219568, + "grad_norm": 4.115497589111328, + "learning_rate": 3.4302844656338325e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8570854663848877, + "num_tokens": 211134004.0, + "step": 175460 + }, + { + "entropy": 1.9859370857477188, + "epoch": 0.5439416472470064, + "grad_norm": 8.700798034667969, + "learning_rate": 3.4301867180732913e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8435089200735092, + "num_tokens": 211144638.0, + "step": 175470 + }, + { + "entropy": 1.8553397893905639, + "epoch": 0.5439726463720561, + "grad_norm": 8.085160255432129, + "learning_rate": 3.4300889788683617e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8684851303696632, + "num_tokens": 211156786.0, + "step": 175480 + }, + { + "entropy": 1.8702120378613472, + "epoch": 0.5440036454971059, + "grad_norm": 8.793665885925293, + "learning_rate": 3.429991248017853e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8538280829787255, + "num_tokens": 211168883.0, + "step": 175490 + }, + { + "entropy": 1.8888561949133873, + "epoch": 0.5440346446221556, + "grad_norm": 3.943817138671875, + "learning_rate": 3.429893525520575e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8594100803136826, + "num_tokens": 211180344.0, + "step": 175500 + }, + { + "entropy": 1.9499733239412307, + "epoch": 0.5440656437472052, + "grad_norm": 8.029459953308105, + "learning_rate": 3.4297958113753376e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8499315157532692, + "num_tokens": 211191260.0, + "step": 175510 + }, + { + "entropy": 1.9230097450315953, + "epoch": 0.5440966428722549, + "grad_norm": 8.456263542175293, + "learning_rate": 3.4296981055809513e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8554961159825325, + "num_tokens": 211202649.0, + "step": 175520 + }, + { + "entropy": 1.8276287920773029, + "epoch": 0.5441276419973047, + "grad_norm": 5.568211078643799, + "learning_rate": 3.429600408136228e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8503857553005219, + "num_tokens": 211216056.0, + "step": 175530 + }, + { + "entropy": 1.957040660083294, + "epoch": 0.5441586411223543, + "grad_norm": 8.783206939697266, + "learning_rate": 3.429502719039976e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.8379690006375313, + "num_tokens": 211227214.0, + "step": 175540 + }, + { + "entropy": 1.8432976469397544, + "epoch": 0.544189640247404, + "grad_norm": 4.159402847290039, + "learning_rate": 3.4294050382910083e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8681906953454017, + "num_tokens": 211240465.0, + "step": 175550 + }, + { + "entropy": 1.8789378896355629, + "epoch": 0.5442206393724537, + "grad_norm": 6.207590103149414, + "learning_rate": 3.4293073658881355e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8544572427868843, + "num_tokens": 211252444.0, + "step": 175560 + }, + { + "entropy": 1.82925843000412, + "epoch": 0.5442516384975034, + "grad_norm": 4.576752662658691, + "learning_rate": 3.429209701830169e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8618702232837677, + "num_tokens": 211265393.0, + "step": 175570 + }, + { + "entropy": 1.920812802016735, + "epoch": 0.5442826376225531, + "grad_norm": 7.142611503601074, + "learning_rate": 3.429112046115921e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8376043096184731, + "num_tokens": 211276859.0, + "step": 175580 + }, + { + "entropy": 1.8717590168118476, + "epoch": 0.5443136367476028, + "grad_norm": 7.412079334259033, + "learning_rate": 3.4290143987442045e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8562609612941742, + "num_tokens": 211289884.0, + "step": 175590 + }, + { + "entropy": 1.8445143483579158, + "epoch": 0.5443446358726525, + "grad_norm": 9.400655746459961, + "learning_rate": 3.4289167597138296e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8686357572674751, + "num_tokens": 211302516.0, + "step": 175600 + }, + { + "entropy": 1.8606164276599884, + "epoch": 0.5443756349977021, + "grad_norm": 8.446548461914062, + "learning_rate": 3.4288191290236093e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8458468541502953, + "num_tokens": 211315066.0, + "step": 175610 + }, + { + "entropy": 1.8773271977901458, + "epoch": 0.5444066341227519, + "grad_norm": 8.17170524597168, + "learning_rate": 3.428721506672358e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.863808399438858, + "num_tokens": 211327052.0, + "step": 175620 + }, + { + "entropy": 1.8070940539240836, + "epoch": 0.5444376332478016, + "grad_norm": 2.7977521419525146, + "learning_rate": 3.4286238926588865e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8716510728001594, + "num_tokens": 211340446.0, + "step": 175630 + }, + { + "entropy": 1.8379455134272575, + "epoch": 0.5444686323728513, + "grad_norm": 3.4940185546875, + "learning_rate": 3.4285262869820103e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8602952301502228, + "num_tokens": 211352351.0, + "step": 175640 + }, + { + "entropy": 1.8413410797715186, + "epoch": 0.5444996314979009, + "grad_norm": 3.8549675941467285, + "learning_rate": 3.428428689640541e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8687475189566612, + "num_tokens": 211364643.0, + "step": 175650 + }, + { + "entropy": 1.9668766289949418, + "epoch": 0.5445306306229507, + "grad_norm": 7.134676456451416, + "learning_rate": 3.4283311006332927e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.85071252733469, + "num_tokens": 211376493.0, + "step": 175660 + }, + { + "entropy": 1.8720296204090119, + "epoch": 0.5445616297480004, + "grad_norm": 8.542624473571777, + "learning_rate": 3.42823351995908e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8693165212869645, + "num_tokens": 211388683.0, + "step": 175670 + }, + { + "entropy": 1.997011986374855, + "epoch": 0.54459262887305, + "grad_norm": 8.116767883300781, + "learning_rate": 3.4281359476167157e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.8428088009357453, + "num_tokens": 211399341.0, + "step": 175680 + }, + { + "entropy": 1.9090626895427705, + "epoch": 0.5446236279980997, + "grad_norm": 7.649209022521973, + "learning_rate": 3.428038383605016e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8594685822725296, + "num_tokens": 211411098.0, + "step": 175690 + }, + { + "entropy": 1.9082271412014962, + "epoch": 0.5446546271231495, + "grad_norm": 7.6990838050842285, + "learning_rate": 3.427940827922794e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8626911997795105, + "num_tokens": 211422535.0, + "step": 175700 + }, + { + "entropy": 1.8366917654871942, + "epoch": 0.5446856262481992, + "grad_norm": 3.823190689086914, + "learning_rate": 3.4278432805688655e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8561408147215843, + "num_tokens": 211434911.0, + "step": 175710 + }, + { + "entropy": 1.8189725920557975, + "epoch": 0.5447166253732488, + "grad_norm": 7.969996929168701, + "learning_rate": 3.4277457415420452e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8670765489339829, + "num_tokens": 211446980.0, + "step": 175720 + }, + { + "entropy": 1.8865886434912682, + "epoch": 0.5447476244982985, + "grad_norm": 12.047924995422363, + "learning_rate": 3.4276482108411475e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8559050917625427, + "num_tokens": 211459883.0, + "step": 175730 + }, + { + "entropy": 1.908797726035118, + "epoch": 0.5447786236233483, + "grad_norm": 9.070921897888184, + "learning_rate": 3.42755068846499e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8463908329606056, + "num_tokens": 211470730.0, + "step": 175740 + }, + { + "entropy": 1.9791014522314072, + "epoch": 0.544809622748398, + "grad_norm": 7.4087066650390625, + "learning_rate": 3.427453174412387e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8555495351552963, + "num_tokens": 211481448.0, + "step": 175750 + }, + { + "entropy": 1.8049452945590019, + "epoch": 0.5448406218734476, + "grad_norm": 9.106644630432129, + "learning_rate": 3.4273556686821547e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8545056328177452, + "num_tokens": 211494686.0, + "step": 175760 + }, + { + "entropy": 1.8799980938434602, + "epoch": 0.5448716209984973, + "grad_norm": 9.619363784790039, + "learning_rate": 3.4272581712731103e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8563680201768875, + "num_tokens": 211507092.0, + "step": 175770 + }, + { + "entropy": 1.9642152100801469, + "epoch": 0.5449026201235471, + "grad_norm": 9.396842002868652, + "learning_rate": 3.427160682184069e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8502561032772065, + "num_tokens": 211517813.0, + "step": 175780 + }, + { + "entropy": 1.896457839012146, + "epoch": 0.5449336192485967, + "grad_norm": 8.072653770446777, + "learning_rate": 3.427063201413849e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8577140867710114, + "num_tokens": 211529323.0, + "step": 175790 + }, + { + "entropy": 1.8286516055464745, + "epoch": 0.5449646183736464, + "grad_norm": 3.8755486011505127, + "learning_rate": 3.4269657289612652e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8703928470611573, + "num_tokens": 211543025.0, + "step": 175800 + }, + { + "entropy": 1.915991945564747, + "epoch": 0.5449956174986961, + "grad_norm": 7.096658229827881, + "learning_rate": 3.4268682648251365e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.862190768122673, + "num_tokens": 211554435.0, + "step": 175810 + }, + { + "entropy": 1.9708215206861497, + "epoch": 0.5450266166237457, + "grad_norm": 9.02193832397461, + "learning_rate": 3.42677080900428e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8442001223564148, + "num_tokens": 211565658.0, + "step": 175820 + }, + { + "entropy": 1.8491972595453263, + "epoch": 0.5450576157487955, + "grad_norm": 7.748945713043213, + "learning_rate": 3.426673361497514e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8688301354646683, + "num_tokens": 211577795.0, + "step": 175830 + }, + { + "entropy": 1.7991741731762887, + "epoch": 0.5450886148738452, + "grad_norm": 4.371496677398682, + "learning_rate": 3.426575922303655e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8593105405569077, + "num_tokens": 211590921.0, + "step": 175840 + }, + { + "entropy": 1.888825187087059, + "epoch": 0.5451196139988949, + "grad_norm": 8.067505836486816, + "learning_rate": 3.4264784914215223e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8572896763682365, + "num_tokens": 211602924.0, + "step": 175850 + }, + { + "entropy": 1.926749548316002, + "epoch": 0.5451506131239445, + "grad_norm": 7.764368534088135, + "learning_rate": 3.4263810688499335e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8423927545547485, + "num_tokens": 211614393.0, + "step": 175860 + }, + { + "entropy": 1.876363991200924, + "epoch": 0.5451816122489943, + "grad_norm": 8.829561233520508, + "learning_rate": 3.4262836545877082e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8627447575330734, + "num_tokens": 211626201.0, + "step": 175870 + }, + { + "entropy": 1.8496167272329331, + "epoch": 0.545212611374044, + "grad_norm": 4.570156574249268, + "learning_rate": 3.426186248633664e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8569568589329719, + "num_tokens": 211638710.0, + "step": 175880 + }, + { + "entropy": 1.9157718800008297, + "epoch": 0.5452436104990936, + "grad_norm": 8.592246055603027, + "learning_rate": 3.4260888509866207e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8535981699824333, + "num_tokens": 211650495.0, + "step": 175890 + }, + { + "entropy": 1.9073502153158188, + "epoch": 0.5452746096241433, + "grad_norm": 8.481475830078125, + "learning_rate": 3.425991461645398e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8480674445629119, + "num_tokens": 211662190.0, + "step": 175900 + }, + { + "entropy": 1.8318237490952014, + "epoch": 0.5453056087491931, + "grad_norm": 7.986082077026367, + "learning_rate": 3.4258940806088153e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8599618077278137, + "num_tokens": 211674333.0, + "step": 175910 + }, + { + "entropy": 1.8723966658115387, + "epoch": 0.5453366078742428, + "grad_norm": 4.092220783233643, + "learning_rate": 3.425796707875692e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8486889436841011, + "num_tokens": 211686356.0, + "step": 175920 + }, + { + "entropy": 1.9107608869671822, + "epoch": 0.5453676069992924, + "grad_norm": 7.9507222175598145, + "learning_rate": 3.4256993434448477e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8596844106912613, + "num_tokens": 211697780.0, + "step": 175930 + }, + { + "entropy": 1.8922893926501274, + "epoch": 0.5453986061243421, + "grad_norm": 7.145480155944824, + "learning_rate": 3.4256019873151043e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.869250500202179, + "num_tokens": 211709704.0, + "step": 175940 + }, + { + "entropy": 1.7841238886117936, + "epoch": 0.5454296052493919, + "grad_norm": 6.294280529022217, + "learning_rate": 3.425504639485281e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8691599667072296, + "num_tokens": 211723529.0, + "step": 175950 + }, + { + "entropy": 1.892759595811367, + "epoch": 0.5454606043744415, + "grad_norm": 9.486949920654297, + "learning_rate": 3.425407299954198e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.852388060092926, + "num_tokens": 211736158.0, + "step": 175960 + }, + { + "entropy": 1.8831520915031432, + "epoch": 0.5454916034994912, + "grad_norm": 3.6384406089782715, + "learning_rate": 3.4253099687206783e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8568282395601272, + "num_tokens": 211748342.0, + "step": 175970 + }, + { + "entropy": 1.9316042974591254, + "epoch": 0.5455226026245409, + "grad_norm": 8.037728309631348, + "learning_rate": 3.425212645783541e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.865487614274025, + "num_tokens": 211759533.0, + "step": 175980 + }, + { + "entropy": 1.9403535202145576, + "epoch": 0.5455536017495907, + "grad_norm": 8.629094123840332, + "learning_rate": 3.425115331141609e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8469337105751038, + "num_tokens": 211771568.0, + "step": 175990 + }, + { + "entropy": 1.931604516506195, + "epoch": 0.5455846008746403, + "grad_norm": 8.681797981262207, + "learning_rate": 3.4250180247937037e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.858528833091259, + "num_tokens": 211783261.0, + "step": 176000 + }, + { + "entropy": 1.8988345205783843, + "epoch": 0.54561559999969, + "grad_norm": 9.937957763671875, + "learning_rate": 3.424920726738647e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8475104600191117, + "num_tokens": 211795458.0, + "step": 176010 + }, + { + "entropy": 1.9326810777187347, + "epoch": 0.5456465991247397, + "grad_norm": 9.935134887695312, + "learning_rate": 3.42482343697526e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8590706020593644, + "num_tokens": 211806886.0, + "step": 176020 + }, + { + "entropy": 1.8864729851484299, + "epoch": 0.5456775982497895, + "grad_norm": 2.4956181049346924, + "learning_rate": 3.424726155502366e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8551818341016769, + "num_tokens": 211819657.0, + "step": 176030 + }, + { + "entropy": 1.9797317951917648, + "epoch": 0.5457085973748391, + "grad_norm": 7.554134845733643, + "learning_rate": 3.4246288823187878e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8407960489392281, + "num_tokens": 211831292.0, + "step": 176040 + }, + { + "entropy": 1.8862545937299728, + "epoch": 0.5457395964998888, + "grad_norm": 4.226990222930908, + "learning_rate": 3.4245316174233486e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8609147578477859, + "num_tokens": 211842562.0, + "step": 176050 + }, + { + "entropy": 1.9039891347289086, + "epoch": 0.5457705956249385, + "grad_norm": 6.957450866699219, + "learning_rate": 3.4244343608148693e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.854132778942585, + "num_tokens": 211854877.0, + "step": 176060 + }, + { + "entropy": 1.9222405552864075, + "epoch": 0.5458015947499881, + "grad_norm": 7.754835605621338, + "learning_rate": 3.4243371124921765e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8527779713273048, + "num_tokens": 211866852.0, + "step": 176070 + }, + { + "entropy": 1.85050872862339, + "epoch": 0.5458325938750379, + "grad_norm": 8.02645206451416, + "learning_rate": 3.424239872454091e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8586536258459091, + "num_tokens": 211880242.0, + "step": 176080 + }, + { + "entropy": 2.002450078725815, + "epoch": 0.5458635930000876, + "grad_norm": 7.224414825439453, + "learning_rate": 3.4241426406994375e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.848207288980484, + "num_tokens": 211891539.0, + "step": 176090 + }, + { + "entropy": 1.9079863592982291, + "epoch": 0.5458945921251372, + "grad_norm": 6.804903507232666, + "learning_rate": 3.424045417227041e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8677893698215484, + "num_tokens": 211903998.0, + "step": 176100 + }, + { + "entropy": 1.9110632047057152, + "epoch": 0.5459255912501869, + "grad_norm": 8.103818893432617, + "learning_rate": 3.4239482020357246e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8578624501824379, + "num_tokens": 211915738.0, + "step": 176110 + }, + { + "entropy": 1.9270017936825752, + "epoch": 0.5459565903752367, + "grad_norm": 8.514951705932617, + "learning_rate": 3.423850995124313e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8561173945665359, + "num_tokens": 211927936.0, + "step": 176120 + }, + { + "entropy": 1.8232133775949477, + "epoch": 0.5459875895002864, + "grad_norm": 3.2978906631469727, + "learning_rate": 3.4237537964916305e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8648741155862808, + "num_tokens": 211941066.0, + "step": 176130 + }, + { + "entropy": 2.0083392202854156, + "epoch": 0.546018588625336, + "grad_norm": 8.977083206176758, + "learning_rate": 3.4236566061365035e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8511565178632736, + "num_tokens": 211951856.0, + "step": 176140 + }, + { + "entropy": 1.9292023211717606, + "epoch": 0.5460495877503857, + "grad_norm": 8.130477905273438, + "learning_rate": 3.4235594240577557e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8568751186132431, + "num_tokens": 211963564.0, + "step": 176150 + }, + { + "entropy": 1.9371128112077713, + "epoch": 0.5460805868754355, + "grad_norm": 7.703921318054199, + "learning_rate": 3.423462250254213e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8611638352274895, + "num_tokens": 211974754.0, + "step": 176160 + }, + { + "entropy": 1.9029362380504609, + "epoch": 0.5461115860004851, + "grad_norm": 7.6146111488342285, + "learning_rate": 3.423365084724701e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8572221979498863, + "num_tokens": 211986813.0, + "step": 176170 + }, + { + "entropy": 1.8494912847876548, + "epoch": 0.5461425851255348, + "grad_norm": 8.22357177734375, + "learning_rate": 3.4232679274680464e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8618330404162406, + "num_tokens": 211999495.0, + "step": 176180 + }, + { + "entropy": 1.8791470482945443, + "epoch": 0.5461735842505845, + "grad_norm": 9.076931953430176, + "learning_rate": 3.423170778483074e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.854033924639225, + "num_tokens": 212011756.0, + "step": 176190 + }, + { + "entropy": 1.8349644854664802, + "epoch": 0.5462045833756343, + "grad_norm": 9.453182220458984, + "learning_rate": 3.4230736377686115e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8578223511576653, + "num_tokens": 212024516.0, + "step": 176200 + }, + { + "entropy": 1.873240815103054, + "epoch": 0.5462355825006839, + "grad_norm": 3.723876714706421, + "learning_rate": 3.4229765053234847e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8600479438900948, + "num_tokens": 212036668.0, + "step": 176210 + }, + { + "entropy": 1.881936539709568, + "epoch": 0.5462665816257336, + "grad_norm": 3.735361099243164, + "learning_rate": 3.42287938114652e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8556406930088997, + "num_tokens": 212049504.0, + "step": 176220 + }, + { + "entropy": 1.8623495429754258, + "epoch": 0.5462975807507833, + "grad_norm": 7.01729154586792, + "learning_rate": 3.422782265236545e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8536556616425515, + "num_tokens": 212061955.0, + "step": 176230 + }, + { + "entropy": 1.8628789693117143, + "epoch": 0.546328579875833, + "grad_norm": 10.14441967010498, + "learning_rate": 3.422685157592387e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8602953940629959, + "num_tokens": 212074670.0, + "step": 176240 + }, + { + "entropy": 1.8812679126858711, + "epoch": 0.5463595790008827, + "grad_norm": 7.10945987701416, + "learning_rate": 3.422588058212874e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8632075861096382, + "num_tokens": 212086210.0, + "step": 176250 + }, + { + "entropy": 1.891566054522991, + "epoch": 0.5463905781259324, + "grad_norm": 9.948281288146973, + "learning_rate": 3.422490967096833e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8480950236320496, + "num_tokens": 212098498.0, + "step": 176260 + }, + { + "entropy": 1.897965356707573, + "epoch": 0.5464215772509821, + "grad_norm": 8.800344467163086, + "learning_rate": 3.422393884243092e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8485878229141235, + "num_tokens": 212110580.0, + "step": 176270 + }, + { + "entropy": 1.944313894212246, + "epoch": 0.5464525763760318, + "grad_norm": 7.566771984100342, + "learning_rate": 3.422296809650479e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8603140458464622, + "num_tokens": 212121926.0, + "step": 176280 + }, + { + "entropy": 1.8406140804290771, + "epoch": 0.5464835755010815, + "grad_norm": 3.525524377822876, + "learning_rate": 3.4221997433178233e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.867090655863285, + "num_tokens": 212134194.0, + "step": 176290 + }, + { + "entropy": 1.8961115300655365, + "epoch": 0.5465145746261312, + "grad_norm": 8.181693077087402, + "learning_rate": 3.4221026852439532e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8568276822566986, + "num_tokens": 212146986.0, + "step": 176300 + }, + { + "entropy": 1.9399204954504967, + "epoch": 0.5465455737511808, + "grad_norm": 10.183042526245117, + "learning_rate": 3.4220056354276985e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8448749095201492, + "num_tokens": 212157998.0, + "step": 176310 + }, + { + "entropy": 1.841617615520954, + "epoch": 0.5465765728762305, + "grad_norm": 8.120635986328125, + "learning_rate": 3.4219085938678864e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.853398184478283, + "num_tokens": 212170935.0, + "step": 176320 + }, + { + "entropy": 1.9700021624565125, + "epoch": 0.5466075720012803, + "grad_norm": 8.018360137939453, + "learning_rate": 3.4218115605633472e-06, + "loss": 0.519, + "mean_token_accuracy": 0.8460304543375969, + "num_tokens": 212181943.0, + "step": 176330 + }, + { + "entropy": 1.9688458293676376, + "epoch": 0.54663857112633, + "grad_norm": 8.161321640014648, + "learning_rate": 3.4217145355129107e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8540339127182961, + "num_tokens": 212192813.0, + "step": 176340 + }, + { + "entropy": 1.8571071356534958, + "epoch": 0.5466695702513796, + "grad_norm": 9.385038375854492, + "learning_rate": 3.421617518715407e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.857005886733532, + "num_tokens": 212205062.0, + "step": 176350 + }, + { + "entropy": 1.9261850699782372, + "epoch": 0.5467005693764293, + "grad_norm": 7.309939861297607, + "learning_rate": 3.4215205101696656e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8574215292930603, + "num_tokens": 212216611.0, + "step": 176360 + }, + { + "entropy": 1.8523739635944367, + "epoch": 0.5467315685014791, + "grad_norm": 9.52919864654541, + "learning_rate": 3.4214235098745175e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8608949258923531, + "num_tokens": 212228523.0, + "step": 176370 + }, + { + "entropy": 1.9219273343682288, + "epoch": 0.5467625676265288, + "grad_norm": 4.339505672454834, + "learning_rate": 3.4213265178287926e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8524685263633728, + "num_tokens": 212240103.0, + "step": 176380 + }, + { + "entropy": 1.8322541788220406, + "epoch": 0.5467935667515784, + "grad_norm": 8.57495403289795, + "learning_rate": 3.4212295340313217e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8736605256795883, + "num_tokens": 212252653.0, + "step": 176390 + }, + { + "entropy": 1.9648008421063423, + "epoch": 0.5468245658766281, + "grad_norm": 9.930930137634277, + "learning_rate": 3.4211325584809363e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8485071301460266, + "num_tokens": 212264077.0, + "step": 176400 + }, + { + "entropy": 1.867118813097477, + "epoch": 0.5468555650016779, + "grad_norm": 7.651360511779785, + "learning_rate": 3.421035591176467e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8613616541028023, + "num_tokens": 212275869.0, + "step": 176410 + }, + { + "entropy": 1.8624306157231332, + "epoch": 0.5468865641267275, + "grad_norm": 9.126770973205566, + "learning_rate": 3.420938632116746e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8574478432536126, + "num_tokens": 212288650.0, + "step": 176420 + }, + { + "entropy": 1.9164914295077324, + "epoch": 0.5469175632517772, + "grad_norm": 4.5546464920043945, + "learning_rate": 3.420841681300604e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8624847665429115, + "num_tokens": 212300163.0, + "step": 176430 + }, + { + "entropy": 1.9037815898656845, + "epoch": 0.5469485623768269, + "grad_norm": 7.759820938110352, + "learning_rate": 3.4207447387268737e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8627464339137078, + "num_tokens": 212311103.0, + "step": 176440 + }, + { + "entropy": 1.7071588724851607, + "epoch": 0.5469795615018767, + "grad_norm": 2.4559144973754883, + "learning_rate": 3.4206478043943875e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8709375485777855, + "num_tokens": 212325146.0, + "step": 176450 + }, + { + "entropy": 1.8931735321879386, + "epoch": 0.5470105606269263, + "grad_norm": 9.125152587890625, + "learning_rate": 3.4205508783019776e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8564422413706779, + "num_tokens": 212337549.0, + "step": 176460 + }, + { + "entropy": 1.9534398928284644, + "epoch": 0.547041559751976, + "grad_norm": 7.810306072235107, + "learning_rate": 3.4204539604484755e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8494616970419884, + "num_tokens": 212349414.0, + "step": 176470 + }, + { + "entropy": 1.910618807375431, + "epoch": 0.5470725588770257, + "grad_norm": 3.4500370025634766, + "learning_rate": 3.4203570508327157e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8558408632874489, + "num_tokens": 212361418.0, + "step": 176480 + }, + { + "entropy": 1.8364159688353539, + "epoch": 0.5471035580020754, + "grad_norm": 8.173087120056152, + "learning_rate": 3.4202601494535305e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8700931832194329, + "num_tokens": 212374620.0, + "step": 176490 + }, + { + "entropy": 1.8050838321447373, + "epoch": 0.5471345571271251, + "grad_norm": 8.026020050048828, + "learning_rate": 3.420163256309753e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8587026596069336, + "num_tokens": 212388166.0, + "step": 176500 + }, + { + "entropy": 1.8974261716008187, + "epoch": 0.5471655562521748, + "grad_norm": 10.081825256347656, + "learning_rate": 3.420066371400217e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8566187128424645, + "num_tokens": 212400535.0, + "step": 176510 + }, + { + "entropy": 1.9240003824234009, + "epoch": 0.5471965553772244, + "grad_norm": 9.594392776489258, + "learning_rate": 3.4199694947237566e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8579287111759186, + "num_tokens": 212412419.0, + "step": 176520 + }, + { + "entropy": 1.9196303203701972, + "epoch": 0.5472275545022742, + "grad_norm": 9.45782470703125, + "learning_rate": 3.4198726262792054e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8646796032786369, + "num_tokens": 212423898.0, + "step": 176530 + }, + { + "entropy": 1.825406238436699, + "epoch": 0.5472585536273239, + "grad_norm": 7.693227291107178, + "learning_rate": 3.419775766065398e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8612094387412071, + "num_tokens": 212436667.0, + "step": 176540 + }, + { + "entropy": 1.9636890798807145, + "epoch": 0.5472895527523736, + "grad_norm": 8.634857177734375, + "learning_rate": 3.4196789140811686e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8505135089159012, + "num_tokens": 212447404.0, + "step": 176550 + }, + { + "entropy": 1.981712308526039, + "epoch": 0.5473205518774232, + "grad_norm": 8.67991828918457, + "learning_rate": 3.419582070325352e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8396664500236511, + "num_tokens": 212458756.0, + "step": 176560 + }, + { + "entropy": 1.9482917726039886, + "epoch": 0.5473515510024729, + "grad_norm": 8.68653392791748, + "learning_rate": 3.419485234796783e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8521886184811592, + "num_tokens": 212469914.0, + "step": 176570 + }, + { + "entropy": 1.8495470777153968, + "epoch": 0.5473825501275227, + "grad_norm": 8.767205238342285, + "learning_rate": 3.4193884074942973e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8421885713934898, + "num_tokens": 212482617.0, + "step": 176580 + }, + { + "entropy": 1.925833135843277, + "epoch": 0.5474135492525724, + "grad_norm": 9.569941520690918, + "learning_rate": 3.419291588416729e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8548354566097259, + "num_tokens": 212493736.0, + "step": 176590 + }, + { + "entropy": 1.927579800784588, + "epoch": 0.547444548377622, + "grad_norm": 6.636186122894287, + "learning_rate": 3.419194777562916e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8604348719120025, + "num_tokens": 212505362.0, + "step": 176600 + }, + { + "entropy": 1.884330753982067, + "epoch": 0.5474755475026717, + "grad_norm": 3.8287642002105713, + "learning_rate": 3.4190979749316914e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8639156118035316, + "num_tokens": 212517532.0, + "step": 176610 + }, + { + "entropy": 1.8772628799080848, + "epoch": 0.5475065466277215, + "grad_norm": 3.413010835647583, + "learning_rate": 3.419001180521894e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8584465265274048, + "num_tokens": 212529948.0, + "step": 176620 + }, + { + "entropy": 1.788725607097149, + "epoch": 0.5475375457527711, + "grad_norm": 3.859731674194336, + "learning_rate": 3.418904394332358e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8587246060371398, + "num_tokens": 212543917.0, + "step": 176630 + }, + { + "entropy": 2.0069529950618743, + "epoch": 0.5475685448778208, + "grad_norm": 9.134621620178223, + "learning_rate": 3.418807616361922e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8446884289383888, + "num_tokens": 212554218.0, + "step": 176640 + }, + { + "entropy": 1.9287556931376457, + "epoch": 0.5475995440028705, + "grad_norm": 7.816536903381348, + "learning_rate": 3.41871084660942e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.856015394628048, + "num_tokens": 212565960.0, + "step": 176650 + }, + { + "entropy": 1.9495777159929275, + "epoch": 0.5476305431279203, + "grad_norm": 8.957378387451172, + "learning_rate": 3.418614085073691e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8514742866158486, + "num_tokens": 212576897.0, + "step": 176660 + }, + { + "entropy": 1.9367633134126663, + "epoch": 0.5476615422529699, + "grad_norm": 7.440584659576416, + "learning_rate": 3.4185173317535724e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8516129940748215, + "num_tokens": 212588931.0, + "step": 176670 + }, + { + "entropy": 1.826993039250374, + "epoch": 0.5476925413780196, + "grad_norm": 5.0734968185424805, + "learning_rate": 3.4184205866479007e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8655605405569077, + "num_tokens": 212601793.0, + "step": 176680 + }, + { + "entropy": 1.8689114913344382, + "epoch": 0.5477235405030693, + "grad_norm": 7.733332633972168, + "learning_rate": 3.418323849755514e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8604257687926292, + "num_tokens": 212613657.0, + "step": 176690 + }, + { + "entropy": 1.8902839928865434, + "epoch": 0.547754539628119, + "grad_norm": 9.017712593078613, + "learning_rate": 3.4182271210752506e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8590465992689132, + "num_tokens": 212625786.0, + "step": 176700 + }, + { + "entropy": 1.832987940311432, + "epoch": 0.5477855387531687, + "grad_norm": 4.72005033493042, + "learning_rate": 3.418130400605948e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8583321020007133, + "num_tokens": 212638893.0, + "step": 176710 + }, + { + "entropy": 1.9117455899715423, + "epoch": 0.5478165378782184, + "grad_norm": 9.520256996154785, + "learning_rate": 3.418033688346445e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8495288237929344, + "num_tokens": 212650833.0, + "step": 176720 + }, + { + "entropy": 1.9174165919423103, + "epoch": 0.547847537003268, + "grad_norm": 9.004129409790039, + "learning_rate": 3.41793698429558e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.855744905769825, + "num_tokens": 212662564.0, + "step": 176730 + }, + { + "entropy": 1.7828210145235062, + "epoch": 0.5478785361283178, + "grad_norm": 3.5656747817993164, + "learning_rate": 3.417840288452193e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8767985865473747, + "num_tokens": 212675271.0, + "step": 176740 + }, + { + "entropy": 1.968538075685501, + "epoch": 0.5479095352533675, + "grad_norm": 8.913453102111816, + "learning_rate": 3.417743600815121e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8396434724330902, + "num_tokens": 212686870.0, + "step": 176750 + }, + { + "entropy": 1.8434936612844468, + "epoch": 0.5479405343784172, + "grad_norm": 7.429874897003174, + "learning_rate": 3.417646921383205e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.859374138712883, + "num_tokens": 212699255.0, + "step": 176760 + }, + { + "entropy": 1.8811804696917533, + "epoch": 0.5479715335034668, + "grad_norm": 3.9854447841644287, + "learning_rate": 3.417550250155284e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.854819868505001, + "num_tokens": 212711239.0, + "step": 176770 + }, + { + "entropy": 1.7966576874256135, + "epoch": 0.5480025326285166, + "grad_norm": 4.191843032836914, + "learning_rate": 3.417453587130197e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8515931665897369, + "num_tokens": 212725043.0, + "step": 176780 + }, + { + "entropy": 1.9064485728740692, + "epoch": 0.5480335317535663, + "grad_norm": 7.229835510253906, + "learning_rate": 3.4173569323067857e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8546797648072243, + "num_tokens": 212736700.0, + "step": 176790 + }, + { + "entropy": 1.8742914631962777, + "epoch": 0.548064530878616, + "grad_norm": 8.884461402893066, + "learning_rate": 3.4172602856838886e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8549501106142998, + "num_tokens": 212748462.0, + "step": 176800 + }, + { + "entropy": 1.9238539248704911, + "epoch": 0.5480955300036656, + "grad_norm": 7.668822765350342, + "learning_rate": 3.4171636472603475e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8567259907722473, + "num_tokens": 212760223.0, + "step": 176810 + }, + { + "entropy": 1.8961792960762978, + "epoch": 0.5481265291287153, + "grad_norm": 9.429628372192383, + "learning_rate": 3.4170670170350023e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8663010224699974, + "num_tokens": 212772026.0, + "step": 176820 + }, + { + "entropy": 1.9418221697211266, + "epoch": 0.5481575282537651, + "grad_norm": 8.488309860229492, + "learning_rate": 3.4169703950066953e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8481117516756058, + "num_tokens": 212783135.0, + "step": 176830 + }, + { + "entropy": 1.8715976506471634, + "epoch": 0.5481885273788147, + "grad_norm": 3.8496992588043213, + "learning_rate": 3.4168737811742647e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8450487107038498, + "num_tokens": 212795268.0, + "step": 176840 + }, + { + "entropy": 1.9355599626898765, + "epoch": 0.5482195265038644, + "grad_norm": 8.719645500183105, + "learning_rate": 3.4167771755365547e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8616823077201843, + "num_tokens": 212807043.0, + "step": 176850 + }, + { + "entropy": 1.962566938996315, + "epoch": 0.5482505256289141, + "grad_norm": 9.193645477294922, + "learning_rate": 3.416680578092406e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8520191237330437, + "num_tokens": 212818031.0, + "step": 176860 + }, + { + "entropy": 1.985722643136978, + "epoch": 0.5482815247539639, + "grad_norm": 8.554361343383789, + "learning_rate": 3.4165839888406603e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8496146738529206, + "num_tokens": 212828755.0, + "step": 176870 + }, + { + "entropy": 1.964412897825241, + "epoch": 0.5483125238790135, + "grad_norm": 7.86843729019165, + "learning_rate": 3.41648740778016e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8463870123028755, + "num_tokens": 212839422.0, + "step": 176880 + }, + { + "entropy": 1.8350771561264991, + "epoch": 0.5483435230040632, + "grad_norm": 8.836234092712402, + "learning_rate": 3.416390834909747e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8595826268196106, + "num_tokens": 212852284.0, + "step": 176890 + }, + { + "entropy": 1.8943647101521492, + "epoch": 0.5483745221291129, + "grad_norm": 3.6767899990081787, + "learning_rate": 3.4162942702282643e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8682879105210304, + "num_tokens": 212864828.0, + "step": 176900 + }, + { + "entropy": 1.8380630269646645, + "epoch": 0.5484055212541626, + "grad_norm": 4.24230432510376, + "learning_rate": 3.416197713734554e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8496894925832749, + "num_tokens": 212878003.0, + "step": 176910 + }, + { + "entropy": 1.8945675104856492, + "epoch": 0.5484365203792123, + "grad_norm": 4.138571739196777, + "learning_rate": 3.4161011654274595e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8576994270086289, + "num_tokens": 212889803.0, + "step": 176920 + }, + { + "entropy": 1.9622309237718583, + "epoch": 0.548467519504262, + "grad_norm": 8.334486961364746, + "learning_rate": 3.416004625305824e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8503334224224091, + "num_tokens": 212900420.0, + "step": 176930 + }, + { + "entropy": 1.9361471354961395, + "epoch": 0.5484985186293116, + "grad_norm": 8.700508117675781, + "learning_rate": 3.41590809336849e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.865361250936985, + "num_tokens": 212911692.0, + "step": 176940 + }, + { + "entropy": 1.9572209745645524, + "epoch": 0.5485295177543614, + "grad_norm": 7.193476676940918, + "learning_rate": 3.4158115696143034e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8585177928209304, + "num_tokens": 212922654.0, + "step": 176950 + }, + { + "entropy": 1.8604034408926964, + "epoch": 0.5485605168794111, + "grad_norm": 4.305553436279297, + "learning_rate": 3.4157150540421064e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8506237909197807, + "num_tokens": 212935189.0, + "step": 176960 + }, + { + "entropy": 1.849170657992363, + "epoch": 0.5485915160044608, + "grad_norm": 8.496451377868652, + "learning_rate": 3.4156185466507438e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8654500484466553, + "num_tokens": 212948374.0, + "step": 176970 + }, + { + "entropy": 1.8641334384679795, + "epoch": 0.5486225151295104, + "grad_norm": 8.56639289855957, + "learning_rate": 3.4155220474390593e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8593708544969558, + "num_tokens": 212960091.0, + "step": 176980 + }, + { + "entropy": 1.8458046644926072, + "epoch": 0.5486535142545602, + "grad_norm": 9.526514053344727, + "learning_rate": 3.415425556405898e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8565845727920532, + "num_tokens": 212973786.0, + "step": 176990 + }, + { + "entropy": 1.902486439049244, + "epoch": 0.5486845133796099, + "grad_norm": 6.776129722595215, + "learning_rate": 3.4153290735501043e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8635582402348518, + "num_tokens": 212985730.0, + "step": 177000 + }, + { + "entropy": 1.7577807061374187, + "epoch": 0.5487155125046596, + "grad_norm": 3.811516284942627, + "learning_rate": 3.4152325988705235e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8757859200239182, + "num_tokens": 212999329.0, + "step": 177010 + }, + { + "entropy": 1.9133754044771194, + "epoch": 0.5487465116297092, + "grad_norm": 9.288987159729004, + "learning_rate": 3.415136132366002e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8521696537733078, + "num_tokens": 213011377.0, + "step": 177020 + }, + { + "entropy": 1.911231203377247, + "epoch": 0.548777510754759, + "grad_norm": 9.659815788269043, + "learning_rate": 3.4150396740353836e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8533487483859062, + "num_tokens": 213023298.0, + "step": 177030 + }, + { + "entropy": 1.8330079704523086, + "epoch": 0.5488085098798087, + "grad_norm": 3.9295654296875, + "learning_rate": 3.414943223877514e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8561473324894905, + "num_tokens": 213036685.0, + "step": 177040 + }, + { + "entropy": 1.9210347287356853, + "epoch": 0.5488395090048583, + "grad_norm": 9.26740550994873, + "learning_rate": 3.4148467818912405e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8491527557373046, + "num_tokens": 213048305.0, + "step": 177050 + }, + { + "entropy": 1.762385478615761, + "epoch": 0.548870508129908, + "grad_norm": 3.844062328338623, + "learning_rate": 3.414750348075409e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8719919562339783, + "num_tokens": 213063074.0, + "step": 177060 + }, + { + "entropy": 1.8222975119948388, + "epoch": 0.5489015072549577, + "grad_norm": 6.260597229003906, + "learning_rate": 3.4146539224288642e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.850552336871624, + "num_tokens": 213076571.0, + "step": 177070 + }, + { + "entropy": 1.8614796712994575, + "epoch": 0.5489325063800075, + "grad_norm": 8.594372749328613, + "learning_rate": 3.414557504950455e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8523694068193436, + "num_tokens": 213088764.0, + "step": 177080 + }, + { + "entropy": 1.7709208399057388, + "epoch": 0.5489635055050571, + "grad_norm": 3.2780303955078125, + "learning_rate": 3.414461095639028e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8679910570383071, + "num_tokens": 213103075.0, + "step": 177090 + }, + { + "entropy": 1.775453121960163, + "epoch": 0.5489945046301068, + "grad_norm": 4.352440357208252, + "learning_rate": 3.4143646944934284e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8666783288121224, + "num_tokens": 213115647.0, + "step": 177100 + }, + { + "entropy": 1.9121138677001, + "epoch": 0.5490255037551565, + "grad_norm": 9.411645889282227, + "learning_rate": 3.414268301512505e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8493054270744324, + "num_tokens": 213127680.0, + "step": 177110 + }, + { + "entropy": 1.8475042030215263, + "epoch": 0.5490565028802062, + "grad_norm": 7.3609232902526855, + "learning_rate": 3.4141719166951058e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8392117530107498, + "num_tokens": 213140708.0, + "step": 177120 + }, + { + "entropy": 1.9383308678865432, + "epoch": 0.5490875020052559, + "grad_norm": 9.486416816711426, + "learning_rate": 3.414075540040077e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8431765273213386, + "num_tokens": 213151379.0, + "step": 177130 + }, + { + "entropy": 1.9540158912539483, + "epoch": 0.5491185011303056, + "grad_norm": 11.021252632141113, + "learning_rate": 3.4139791715462683e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8570782467722893, + "num_tokens": 213162500.0, + "step": 177140 + }, + { + "entropy": 1.8902826353907585, + "epoch": 0.5491495002553552, + "grad_norm": 9.033095359802246, + "learning_rate": 3.413882811212527e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8544860497117043, + "num_tokens": 213174567.0, + "step": 177150 + }, + { + "entropy": 1.874396102130413, + "epoch": 0.549180499380405, + "grad_norm": 8.876323699951172, + "learning_rate": 3.413786459037702e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8585318312048912, + "num_tokens": 213186648.0, + "step": 177160 + }, + { + "entropy": 1.86837887018919, + "epoch": 0.5492114985054547, + "grad_norm": 8.945581436157227, + "learning_rate": 3.4136901150206408e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8691040202975273, + "num_tokens": 213198967.0, + "step": 177170 + }, + { + "entropy": 1.917728215456009, + "epoch": 0.5492424976305044, + "grad_norm": 8.172450065612793, + "learning_rate": 3.4135937791601936e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8643875792622566, + "num_tokens": 213210339.0, + "step": 177180 + }, + { + "entropy": 1.897295144200325, + "epoch": 0.549273496755554, + "grad_norm": 8.179666519165039, + "learning_rate": 3.4134974514552092e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.847081832587719, + "num_tokens": 213222457.0, + "step": 177190 + }, + { + "entropy": 1.8492035642266273, + "epoch": 0.5493044958806038, + "grad_norm": 9.86941909790039, + "learning_rate": 3.4134011319045374e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8557023197412491, + "num_tokens": 213235500.0, + "step": 177200 + }, + { + "entropy": 1.9028280600905418, + "epoch": 0.5493354950056535, + "grad_norm": 7.475229263305664, + "learning_rate": 3.413304820507027e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.856609919667244, + "num_tokens": 213246948.0, + "step": 177210 + }, + { + "entropy": 1.821316310763359, + "epoch": 0.5493664941307032, + "grad_norm": 8.389219284057617, + "learning_rate": 3.4132085172615283e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8570780649781227, + "num_tokens": 213259384.0, + "step": 177220 + }, + { + "entropy": 1.8156044453382492, + "epoch": 0.5493974932557528, + "grad_norm": 8.002120018005371, + "learning_rate": 3.413112222166891e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8619963735342026, + "num_tokens": 213272870.0, + "step": 177230 + }, + { + "entropy": 1.8711471810936928, + "epoch": 0.5494284923808026, + "grad_norm": 6.479476451873779, + "learning_rate": 3.413015935221966e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8612604930996894, + "num_tokens": 213284570.0, + "step": 177240 + }, + { + "entropy": 1.8935720384120942, + "epoch": 0.5494594915058523, + "grad_norm": 3.4970638751983643, + "learning_rate": 3.412919656425603e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8482785031199456, + "num_tokens": 213297004.0, + "step": 177250 + }, + { + "entropy": 1.8641334936022758, + "epoch": 0.5494904906309019, + "grad_norm": 7.379012584686279, + "learning_rate": 3.4128233857766536e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8560739994049072, + "num_tokens": 213309632.0, + "step": 177260 + }, + { + "entropy": 1.91132210791111, + "epoch": 0.5495214897559516, + "grad_norm": 9.017353057861328, + "learning_rate": 3.4127271232739683e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8572400063276291, + "num_tokens": 213320429.0, + "step": 177270 + }, + { + "entropy": 1.8946466326713562, + "epoch": 0.5495524888810014, + "grad_norm": 10.003750801086426, + "learning_rate": 3.4126308689163976e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8475130766630172, + "num_tokens": 213331719.0, + "step": 177280 + }, + { + "entropy": 1.9153622835874557, + "epoch": 0.5495834880060511, + "grad_norm": 7.587196350097656, + "learning_rate": 3.4125346227027955e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8539967402815819, + "num_tokens": 213343303.0, + "step": 177290 + }, + { + "entropy": 1.9106049820780755, + "epoch": 0.5496144871311007, + "grad_norm": 6.409841537475586, + "learning_rate": 3.4124383846320103e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8548314660787583, + "num_tokens": 213355306.0, + "step": 177300 + }, + { + "entropy": 1.869752712547779, + "epoch": 0.5496454862561504, + "grad_norm": 7.410600185394287, + "learning_rate": 3.412342154702895e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8623670980334281, + "num_tokens": 213367450.0, + "step": 177310 + }, + { + "entropy": 1.9162461429834365, + "epoch": 0.5496764853812001, + "grad_norm": 7.369466781616211, + "learning_rate": 3.4122459329143033e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8539816990494729, + "num_tokens": 213378872.0, + "step": 177320 + }, + { + "entropy": 1.9060623481869698, + "epoch": 0.5497074845062498, + "grad_norm": 8.427303314208984, + "learning_rate": 3.4121497192650853e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8590623676776886, + "num_tokens": 213390091.0, + "step": 177330 + }, + { + "entropy": 1.8645298302173614, + "epoch": 0.5497384836312995, + "grad_norm": 2.700556993484497, + "learning_rate": 3.4120535137540954e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8587430387735366, + "num_tokens": 213402523.0, + "step": 177340 + }, + { + "entropy": 1.841796737909317, + "epoch": 0.5497694827563492, + "grad_norm": 3.1640028953552246, + "learning_rate": 3.4119573163801855e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8694748774170875, + "num_tokens": 213414146.0, + "step": 177350 + }, + { + "entropy": 1.8999506399035453, + "epoch": 0.5498004818813989, + "grad_norm": 7.3486552238464355, + "learning_rate": 3.411861127142209e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8545324966311455, + "num_tokens": 213425589.0, + "step": 177360 + }, + { + "entropy": 1.9142435252666474, + "epoch": 0.5498314810064486, + "grad_norm": 7.839383125305176, + "learning_rate": 3.411764946039018e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8502327635884285, + "num_tokens": 213436889.0, + "step": 177370 + }, + { + "entropy": 1.767600019276142, + "epoch": 0.5498624801314983, + "grad_norm": 5.790437698364258, + "learning_rate": 3.4116687730694664e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.862785404920578, + "num_tokens": 213450524.0, + "step": 177380 + }, + { + "entropy": 1.9111123844981193, + "epoch": 0.549893479256548, + "grad_norm": 8.019339561462402, + "learning_rate": 3.411572608232409e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8633904352784156, + "num_tokens": 213461443.0, + "step": 177390 + }, + { + "entropy": 1.9108331441879272, + "epoch": 0.5499244783815976, + "grad_norm": 8.2950439453125, + "learning_rate": 3.4114764515266983e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8633098036050797, + "num_tokens": 213472425.0, + "step": 177400 + }, + { + "entropy": 1.923930747807026, + "epoch": 0.5499554775066474, + "grad_norm": 8.409180641174316, + "learning_rate": 3.411380302951189e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8442714124917984, + "num_tokens": 213483267.0, + "step": 177410 + }, + { + "entropy": 1.6820106402039527, + "epoch": 0.5499864766316971, + "grad_norm": 3.9981110095977783, + "learning_rate": 3.4112841625047366e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8790968149900437, + "num_tokens": 213497545.0, + "step": 177420 + }, + { + "entropy": 1.9298490598797797, + "epoch": 0.5500174757567468, + "grad_norm": 9.856298446655273, + "learning_rate": 3.411188030186193e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8398898839950562, + "num_tokens": 213508743.0, + "step": 177430 + }, + { + "entropy": 1.8625278130173684, + "epoch": 0.5500484748817964, + "grad_norm": 6.03195858001709, + "learning_rate": 3.411091905994416e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8528363704681396, + "num_tokens": 213520882.0, + "step": 177440 + }, + { + "entropy": 1.9055146545171737, + "epoch": 0.5500794740068462, + "grad_norm": 8.34013557434082, + "learning_rate": 3.410995789928259e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8541034102439881, + "num_tokens": 213533088.0, + "step": 177450 + }, + { + "entropy": 1.9039269611239433, + "epoch": 0.5501104731318959, + "grad_norm": 7.9535346031188965, + "learning_rate": 3.4108996819865776e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.8517986431717872, + "num_tokens": 213544684.0, + "step": 177460 + }, + { + "entropy": 1.852563814818859, + "epoch": 0.5501414722569455, + "grad_norm": 4.031369686126709, + "learning_rate": 3.4108035821682268e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8509661376476287, + "num_tokens": 213557896.0, + "step": 177470 + }, + { + "entropy": 1.9033602237701417, + "epoch": 0.5501724713819952, + "grad_norm": 8.224656105041504, + "learning_rate": 3.4107074904720634e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8506034925580025, + "num_tokens": 213569530.0, + "step": 177480 + }, + { + "entropy": 1.9195539087057114, + "epoch": 0.550203470507045, + "grad_norm": 9.235480308532715, + "learning_rate": 3.4106114068969415e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8559677705168725, + "num_tokens": 213581205.0, + "step": 177490 + }, + { + "entropy": 1.8721505463123322, + "epoch": 0.5502344696320947, + "grad_norm": 7.580912113189697, + "learning_rate": 3.4105153314417192e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8641788467764855, + "num_tokens": 213592977.0, + "step": 177500 + }, + { + "entropy": 1.8690673634409904, + "epoch": 0.5502654687571443, + "grad_norm": 9.193328857421875, + "learning_rate": 3.4104192641052523e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.85512455701828, + "num_tokens": 213604540.0, + "step": 177510 + }, + { + "entropy": 1.89061731249094, + "epoch": 0.550296467882194, + "grad_norm": 8.917570114135742, + "learning_rate": 3.410323204886397e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8444926172494889, + "num_tokens": 213616613.0, + "step": 177520 + }, + { + "entropy": 1.857405199110508, + "epoch": 0.5503274670072438, + "grad_norm": 7.011704444885254, + "learning_rate": 3.4102271537840104e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.855528536438942, + "num_tokens": 213629454.0, + "step": 177530 + }, + { + "entropy": 1.9259560242295266, + "epoch": 0.5503584661322934, + "grad_norm": 5.598184108734131, + "learning_rate": 3.4101311107969497e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8530476734042167, + "num_tokens": 213641852.0, + "step": 177540 + }, + { + "entropy": 1.9008552610874176, + "epoch": 0.5503894652573431, + "grad_norm": 8.267748832702637, + "learning_rate": 3.410035075924073e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8589813798666001, + "num_tokens": 213653420.0, + "step": 177550 + }, + { + "entropy": 1.8692269191145896, + "epoch": 0.5504204643823928, + "grad_norm": 7.912377834320068, + "learning_rate": 3.4099390491642353e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8613191261887551, + "num_tokens": 213665530.0, + "step": 177560 + }, + { + "entropy": 1.8758762568235396, + "epoch": 0.5504514635074425, + "grad_norm": 7.625454902648926, + "learning_rate": 3.4098430305162966e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.848248441517353, + "num_tokens": 213677266.0, + "step": 177570 + }, + { + "entropy": 1.923265139758587, + "epoch": 0.5504824626324922, + "grad_norm": 9.186751365661621, + "learning_rate": 3.4097470199791143e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8520221054553986, + "num_tokens": 213689292.0, + "step": 177580 + }, + { + "entropy": 1.8164562806487083, + "epoch": 0.5505134617575419, + "grad_norm": 7.9483113288879395, + "learning_rate": 3.409651017551546e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8560757398605346, + "num_tokens": 213702018.0, + "step": 177590 + }, + { + "entropy": 1.9234494641423225, + "epoch": 0.5505444608825916, + "grad_norm": 7.260690689086914, + "learning_rate": 3.409555023232452e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8594354689121246, + "num_tokens": 213713623.0, + "step": 177600 + }, + { + "entropy": 1.8692736342549323, + "epoch": 0.5505754600076412, + "grad_norm": 7.934293270111084, + "learning_rate": 3.409459037020688e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8551886230707169, + "num_tokens": 213726229.0, + "step": 177610 + }, + { + "entropy": 1.8698809787631034, + "epoch": 0.550606459132691, + "grad_norm": 8.365594863891602, + "learning_rate": 3.4093630589151157e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8643792599439621, + "num_tokens": 213738593.0, + "step": 177620 + }, + { + "entropy": 1.8786334797739983, + "epoch": 0.5506374582577407, + "grad_norm": 3.4549944400787354, + "learning_rate": 3.4092670889145925e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8562889456748962, + "num_tokens": 213750663.0, + "step": 177630 + }, + { + "entropy": 1.8907557740807532, + "epoch": 0.5506684573827904, + "grad_norm": 8.042924880981445, + "learning_rate": 3.4091711270179773e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8546307638287545, + "num_tokens": 213762199.0, + "step": 177640 + }, + { + "entropy": 1.8805308431386947, + "epoch": 0.55069945650784, + "grad_norm": 9.960572242736816, + "learning_rate": 3.409075173224132e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8486173838376999, + "num_tokens": 213774792.0, + "step": 177650 + }, + { + "entropy": 1.9167560517787934, + "epoch": 0.5507304556328898, + "grad_norm": 7.627819538116455, + "learning_rate": 3.4089792275319143e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8527622357010841, + "num_tokens": 213786490.0, + "step": 177660 + }, + { + "entropy": 1.8611066058278083, + "epoch": 0.5507614547579395, + "grad_norm": 7.6696295738220215, + "learning_rate": 3.408883289940184e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8635546818375588, + "num_tokens": 213798884.0, + "step": 177670 + }, + { + "entropy": 1.9489878684282302, + "epoch": 0.5507924538829891, + "grad_norm": 3.252110004425049, + "learning_rate": 3.408787360447803e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8568515613675117, + "num_tokens": 213810509.0, + "step": 177680 + }, + { + "entropy": 1.925804816186428, + "epoch": 0.5508234530080388, + "grad_norm": 8.378000259399414, + "learning_rate": 3.4086914390536304e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8556630969047546, + "num_tokens": 213822155.0, + "step": 177690 + }, + { + "entropy": 1.8882811307907104, + "epoch": 0.5508544521330886, + "grad_norm": 7.851574897766113, + "learning_rate": 3.408595525756528e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.854667441546917, + "num_tokens": 213834194.0, + "step": 177700 + }, + { + "entropy": 1.9184136673808099, + "epoch": 0.5508854512581383, + "grad_norm": 3.8098981380462646, + "learning_rate": 3.4084996205553554e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.85739406645298, + "num_tokens": 213845395.0, + "step": 177710 + }, + { + "entropy": 1.955088695883751, + "epoch": 0.5509164503831879, + "grad_norm": 7.2159953117370605, + "learning_rate": 3.4084037234489744e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8552487745881081, + "num_tokens": 213855942.0, + "step": 177720 + }, + { + "entropy": 1.876141707599163, + "epoch": 0.5509474495082376, + "grad_norm": 7.4983229637146, + "learning_rate": 3.4083078344362464e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8518855974078179, + "num_tokens": 213867853.0, + "step": 177730 + }, + { + "entropy": 1.835733339190483, + "epoch": 0.5509784486332874, + "grad_norm": 4.303890228271484, + "learning_rate": 3.4082119535160323e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8588640108704567, + "num_tokens": 213880352.0, + "step": 177740 + }, + { + "entropy": 1.9237274020910262, + "epoch": 0.551009447758337, + "grad_norm": 6.245169162750244, + "learning_rate": 3.4081160806871948e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8537571519613266, + "num_tokens": 213891709.0, + "step": 177750 + }, + { + "entropy": 1.9308589279651642, + "epoch": 0.5510404468833867, + "grad_norm": 11.33820629119873, + "learning_rate": 3.4080202159485964e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8526531413197518, + "num_tokens": 213903261.0, + "step": 177760 + }, + { + "entropy": 1.8311443090438844, + "epoch": 0.5510714460084364, + "grad_norm": 7.7392802238464355, + "learning_rate": 3.4079243592990975e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8646254643797875, + "num_tokens": 213916092.0, + "step": 177770 + }, + { + "entropy": 1.8906705155968666, + "epoch": 0.551102445133486, + "grad_norm": 7.602465629577637, + "learning_rate": 3.4078285107375612e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.85352271348238, + "num_tokens": 213926945.0, + "step": 177780 + }, + { + "entropy": 1.7623763605952263, + "epoch": 0.5511334442585358, + "grad_norm": 2.4720168113708496, + "learning_rate": 3.4077326702628514e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8656698524951935, + "num_tokens": 213940183.0, + "step": 177790 + }, + { + "entropy": 1.8430035665631295, + "epoch": 0.5511644433835855, + "grad_norm": 8.119751930236816, + "learning_rate": 3.4076368378738296e-06, + "loss": 0.403, + "mean_token_accuracy": 0.854887755215168, + "num_tokens": 213952829.0, + "step": 177800 + }, + { + "entropy": 1.788646037876606, + "epoch": 0.5511954425086352, + "grad_norm": 7.915525913238525, + "learning_rate": 3.40754101356936e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8624294593930244, + "num_tokens": 213965992.0, + "step": 177810 + }, + { + "entropy": 1.8792960986495018, + "epoch": 0.5512264416336848, + "grad_norm": 3.7082908153533936, + "learning_rate": 3.407445197348305e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8567387446761131, + "num_tokens": 213978690.0, + "step": 177820 + }, + { + "entropy": 1.9804147839546205, + "epoch": 0.5512574407587346, + "grad_norm": 7.826719284057617, + "learning_rate": 3.4073493892095287e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.8443496122956275, + "num_tokens": 213989611.0, + "step": 177830 + }, + { + "entropy": 1.9625308007001876, + "epoch": 0.5512884398837843, + "grad_norm": 7.848978519439697, + "learning_rate": 3.4072535891518947e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8623924240469932, + "num_tokens": 214000412.0, + "step": 177840 + }, + { + "entropy": 1.8899758756160736, + "epoch": 0.551319439008834, + "grad_norm": 10.778916358947754, + "learning_rate": 3.407157797174267e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8471853539347649, + "num_tokens": 214011641.0, + "step": 177850 + }, + { + "entropy": 1.8247670635581017, + "epoch": 0.5513504381338836, + "grad_norm": 3.6938233375549316, + "learning_rate": 3.4070620132755107e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8624543026089668, + "num_tokens": 214024699.0, + "step": 177860 + }, + { + "entropy": 1.8587495237588882, + "epoch": 0.5513814372589334, + "grad_norm": 5.922456741333008, + "learning_rate": 3.4069662374544886e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8634365990757942, + "num_tokens": 214037493.0, + "step": 177870 + }, + { + "entropy": 1.902451252937317, + "epoch": 0.5514124363839831, + "grad_norm": 7.852017879486084, + "learning_rate": 3.4068704697100667e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8582121372222901, + "num_tokens": 214048733.0, + "step": 177880 + }, + { + "entropy": 1.9077907755970955, + "epoch": 0.5514434355090327, + "grad_norm": 7.281789779663086, + "learning_rate": 3.4067747100411104e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8456922695040703, + "num_tokens": 214059824.0, + "step": 177890 + }, + { + "entropy": 1.94886015355587, + "epoch": 0.5514744346340824, + "grad_norm": 7.4586968421936035, + "learning_rate": 3.4066789584464825e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8491571620106697, + "num_tokens": 214070576.0, + "step": 177900 + }, + { + "entropy": 1.8953621149063111, + "epoch": 0.5515054337591322, + "grad_norm": 3.4469287395477295, + "learning_rate": 3.406583214925051e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8564003005623817, + "num_tokens": 214082785.0, + "step": 177910 + }, + { + "entropy": 1.8691608056426048, + "epoch": 0.5515364328841819, + "grad_norm": 9.114605903625488, + "learning_rate": 3.406487479475681e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8627192363142967, + "num_tokens": 214094914.0, + "step": 177920 + }, + { + "entropy": 1.9036862418055533, + "epoch": 0.5515674320092315, + "grad_norm": 7.558939456939697, + "learning_rate": 3.4063917520972363e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8584882363677024, + "num_tokens": 214107151.0, + "step": 177930 + }, + { + "entropy": 1.818896722793579, + "epoch": 0.5515984311342812, + "grad_norm": 7.9749956130981445, + "learning_rate": 3.4062960327885846e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8586197167634964, + "num_tokens": 214120094.0, + "step": 177940 + }, + { + "entropy": 1.8816650286316872, + "epoch": 0.551629430259331, + "grad_norm": 6.810136795043945, + "learning_rate": 3.406200321548592e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8652727901935577, + "num_tokens": 214131838.0, + "step": 177950 + }, + { + "entropy": 1.8971996203064918, + "epoch": 0.5516604293843806, + "grad_norm": 9.636186599731445, + "learning_rate": 3.4061046183761253e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8593866720795631, + "num_tokens": 214143806.0, + "step": 177960 + }, + { + "entropy": 1.8690408036112784, + "epoch": 0.5516914285094303, + "grad_norm": 8.90285587310791, + "learning_rate": 3.406008923270051e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8644983410835266, + "num_tokens": 214155957.0, + "step": 177970 + }, + { + "entropy": 1.931781667470932, + "epoch": 0.55172242763448, + "grad_norm": 6.843881130218506, + "learning_rate": 3.405913236229235e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8537558436393737, + "num_tokens": 214167498.0, + "step": 177980 + }, + { + "entropy": 1.9443962126970291, + "epoch": 0.5517534267595298, + "grad_norm": 8.771767616271973, + "learning_rate": 3.405817557252546e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8550885379314422, + "num_tokens": 214178482.0, + "step": 177990 + }, + { + "entropy": 1.8965649604797363, + "epoch": 0.5517844258845794, + "grad_norm": 9.75014591217041, + "learning_rate": 3.4057218863388503e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8570706993341446, + "num_tokens": 214189426.0, + "step": 178000 + }, + { + "entropy": 1.89953922778368, + "epoch": 0.5518154250096291, + "grad_norm": 8.906521797180176, + "learning_rate": 3.4056262234870164e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8522025436162949, + "num_tokens": 214201647.0, + "step": 178010 + }, + { + "entropy": 1.9386347502470016, + "epoch": 0.5518464241346788, + "grad_norm": 8.296730995178223, + "learning_rate": 3.4055305686959106e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8579264089465142, + "num_tokens": 214212554.0, + "step": 178020 + }, + { + "entropy": 1.9103853285312653, + "epoch": 0.5518774232597284, + "grad_norm": 8.537365913391113, + "learning_rate": 3.405434921964403e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8576163098216056, + "num_tokens": 214223866.0, + "step": 178030 + }, + { + "entropy": 1.9078239232301712, + "epoch": 0.5519084223847782, + "grad_norm": 8.394222259521484, + "learning_rate": 3.4053392832913596e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8546977087855339, + "num_tokens": 214235513.0, + "step": 178040 + }, + { + "entropy": 1.8636566340923308, + "epoch": 0.5519394215098279, + "grad_norm": 9.244743347167969, + "learning_rate": 3.405243652675651e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8607104942202568, + "num_tokens": 214247762.0, + "step": 178050 + }, + { + "entropy": 1.9099171817302705, + "epoch": 0.5519704206348776, + "grad_norm": 7.364317417144775, + "learning_rate": 3.4051480301161445e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8574780553579331, + "num_tokens": 214258815.0, + "step": 178060 + }, + { + "entropy": 1.8716615453362464, + "epoch": 0.5520014197599272, + "grad_norm": 7.7942681312561035, + "learning_rate": 3.40505241561171e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.857321010529995, + "num_tokens": 214270753.0, + "step": 178070 + }, + { + "entropy": 1.8762011349201202, + "epoch": 0.552032418884977, + "grad_norm": 3.7072243690490723, + "learning_rate": 3.4049568091612157e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8608313351869583, + "num_tokens": 214282462.0, + "step": 178080 + }, + { + "entropy": 1.897752860188484, + "epoch": 0.5520634180100267, + "grad_norm": 7.348748207092285, + "learning_rate": 3.404861210763532e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8526074305176735, + "num_tokens": 214294085.0, + "step": 178090 + }, + { + "entropy": 1.939253604412079, + "epoch": 0.5520944171350763, + "grad_norm": 8.933135032653809, + "learning_rate": 3.4047656204175273e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8538300707936287, + "num_tokens": 214305365.0, + "step": 178100 + }, + { + "entropy": 1.8425916746258735, + "epoch": 0.552125416260126, + "grad_norm": 9.111211776733398, + "learning_rate": 3.404670038122073e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8560868754982949, + "num_tokens": 214318160.0, + "step": 178110 + }, + { + "entropy": 1.9103170096874238, + "epoch": 0.5521564153851758, + "grad_norm": 9.979475021362305, + "learning_rate": 3.404574463876037e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8544515520334244, + "num_tokens": 214329036.0, + "step": 178120 + }, + { + "entropy": 1.8822571218013764, + "epoch": 0.5521874145102255, + "grad_norm": 9.598880767822266, + "learning_rate": 3.404478897678291e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8551209643483162, + "num_tokens": 214341063.0, + "step": 178130 + }, + { + "entropy": 1.8942831814289094, + "epoch": 0.5522184136352751, + "grad_norm": 9.16766357421875, + "learning_rate": 3.404383339527706e-06, + "loss": 0.5151, + "mean_token_accuracy": 0.8461239233613014, + "num_tokens": 214353635.0, + "step": 178140 + }, + { + "entropy": 1.8503873273730278, + "epoch": 0.5522494127603248, + "grad_norm": 7.313621520996094, + "learning_rate": 3.404287789423152e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8708026796579361, + "num_tokens": 214366216.0, + "step": 178150 + }, + { + "entropy": 1.9046386152505874, + "epoch": 0.5522804118853746, + "grad_norm": 7.900165557861328, + "learning_rate": 3.4041922473634986e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8486365541815758, + "num_tokens": 214377957.0, + "step": 178160 + }, + { + "entropy": 1.9191514551639557, + "epoch": 0.5523114110104242, + "grad_norm": 10.395622253417969, + "learning_rate": 3.4040967133476198e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8610471084713935, + "num_tokens": 214388687.0, + "step": 178170 + }, + { + "entropy": 1.92356576025486, + "epoch": 0.5523424101354739, + "grad_norm": 7.511850833892822, + "learning_rate": 3.404001187374385e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8621419206261635, + "num_tokens": 214400793.0, + "step": 178180 + }, + { + "entropy": 1.8569615572690963, + "epoch": 0.5523734092605236, + "grad_norm": 8.08397102355957, + "learning_rate": 3.4039056694426665e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8697955429553985, + "num_tokens": 214413097.0, + "step": 178190 + }, + { + "entropy": 1.912328739464283, + "epoch": 0.5524044083855734, + "grad_norm": 7.70759391784668, + "learning_rate": 3.403810159551335e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8634463772177696, + "num_tokens": 214424684.0, + "step": 178200 + }, + { + "entropy": 1.874166764318943, + "epoch": 0.552435407510623, + "grad_norm": 4.644807815551758, + "learning_rate": 3.4037146576992636e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8502660781145096, + "num_tokens": 214437087.0, + "step": 178210 + }, + { + "entropy": 1.8312366858124733, + "epoch": 0.5524664066356727, + "grad_norm": 7.805587291717529, + "learning_rate": 3.4036191638853257e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8689468517899513, + "num_tokens": 214450085.0, + "step": 178220 + }, + { + "entropy": 1.8324200913310051, + "epoch": 0.5524974057607224, + "grad_norm": 8.242805480957031, + "learning_rate": 3.403523678108391e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8615979239344597, + "num_tokens": 214462485.0, + "step": 178230 + }, + { + "entropy": 1.7994080841541291, + "epoch": 0.5525284048857722, + "grad_norm": 4.501520156860352, + "learning_rate": 3.403428200367334e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8683733329176903, + "num_tokens": 214475717.0, + "step": 178240 + }, + { + "entropy": 1.8464299738407135, + "epoch": 0.5525594040108218, + "grad_norm": 6.4583353996276855, + "learning_rate": 3.403332730661027e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.868985840678215, + "num_tokens": 214488274.0, + "step": 178250 + }, + { + "entropy": 1.8997196108102798, + "epoch": 0.5525904031358715, + "grad_norm": 8.174156188964844, + "learning_rate": 3.4032372689883443e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8552378371357918, + "num_tokens": 214500434.0, + "step": 178260 + }, + { + "entropy": 1.8080697432160378, + "epoch": 0.5526214022609212, + "grad_norm": 7.958045959472656, + "learning_rate": 3.4031418153481583e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8627307131886482, + "num_tokens": 214513847.0, + "step": 178270 + }, + { + "entropy": 1.8869553804397583, + "epoch": 0.5526524013859708, + "grad_norm": 4.627087116241455, + "learning_rate": 3.403046369739342e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8562980949878692, + "num_tokens": 214526034.0, + "step": 178280 + }, + { + "entropy": 1.8838186770677567, + "epoch": 0.5526834005110206, + "grad_norm": 6.58920431137085, + "learning_rate": 3.40295093216077e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8497677370905876, + "num_tokens": 214537809.0, + "step": 178290 + }, + { + "entropy": 1.9977750718593597, + "epoch": 0.5527143996360703, + "grad_norm": 8.55444622039795, + "learning_rate": 3.402855502611316e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8497657686471939, + "num_tokens": 214548718.0, + "step": 178300 + }, + { + "entropy": 1.961137953400612, + "epoch": 0.5527453987611199, + "grad_norm": 7.637760639190674, + "learning_rate": 3.402760081089855e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8611582666635513, + "num_tokens": 214559723.0, + "step": 178310 + }, + { + "entropy": 1.8815833404660225, + "epoch": 0.5527763978861696, + "grad_norm": 8.812824249267578, + "learning_rate": 3.402664667595261e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8595074489712715, + "num_tokens": 214572097.0, + "step": 178320 + }, + { + "entropy": 1.8904899656772614, + "epoch": 0.5528073970112194, + "grad_norm": 8.477423667907715, + "learning_rate": 3.402569262126409e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.860831169784069, + "num_tokens": 214584506.0, + "step": 178330 + }, + { + "entropy": 1.9823995113372803, + "epoch": 0.5528383961362691, + "grad_norm": 14.011083602905273, + "learning_rate": 3.4024738646821725e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8458440572023391, + "num_tokens": 214595212.0, + "step": 178340 + }, + { + "entropy": 1.9233446091413497, + "epoch": 0.5528693952613187, + "grad_norm": 4.202877044677734, + "learning_rate": 3.402378475261428e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8481686040759087, + "num_tokens": 214607213.0, + "step": 178350 + }, + { + "entropy": 1.8569358214735985, + "epoch": 0.5529003943863684, + "grad_norm": 7.546200275421143, + "learning_rate": 3.402283093863051e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8649446710944175, + "num_tokens": 214619852.0, + "step": 178360 + }, + { + "entropy": 1.8237456619739532, + "epoch": 0.5529313935114182, + "grad_norm": 4.462338447570801, + "learning_rate": 3.4021877204859167e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8608748510479927, + "num_tokens": 214632981.0, + "step": 178370 + }, + { + "entropy": 1.7977847412228585, + "epoch": 0.5529623926364678, + "grad_norm": 7.902055740356445, + "learning_rate": 3.402092355128901e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8611117780208588, + "num_tokens": 214645980.0, + "step": 178380 + }, + { + "entropy": 1.9663831263780593, + "epoch": 0.5529933917615175, + "grad_norm": 8.112964630126953, + "learning_rate": 3.401996997790879e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8413886666297913, + "num_tokens": 214656890.0, + "step": 178390 + }, + { + "entropy": 1.9313152968883514, + "epoch": 0.5530243908865672, + "grad_norm": 8.793562889099121, + "learning_rate": 3.4019016484707284e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8541143208742141, + "num_tokens": 214668760.0, + "step": 178400 + }, + { + "entropy": 1.8870180428028107, + "epoch": 0.553055390011617, + "grad_norm": 8.036027908325195, + "learning_rate": 3.4018063071673245e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8699349537491798, + "num_tokens": 214680662.0, + "step": 178410 + }, + { + "entropy": 1.8556498274207116, + "epoch": 0.5530863891366666, + "grad_norm": 3.778268337249756, + "learning_rate": 3.4017109738795445e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8546939373016358, + "num_tokens": 214693500.0, + "step": 178420 + }, + { + "entropy": 1.9078700512647628, + "epoch": 0.5531173882617163, + "grad_norm": 3.581650733947754, + "learning_rate": 3.4016156486062657e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8544475376605988, + "num_tokens": 214705682.0, + "step": 178430 + }, + { + "entropy": 1.9096450537443161, + "epoch": 0.553148387386766, + "grad_norm": 8.489810943603516, + "learning_rate": 3.4015203313463646e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8597532555460929, + "num_tokens": 214717144.0, + "step": 178440 + }, + { + "entropy": 1.7644469410181045, + "epoch": 0.5531793865118158, + "grad_norm": 3.961745500564575, + "learning_rate": 3.4014250220987184e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8705752789974213, + "num_tokens": 214731161.0, + "step": 178450 + }, + { + "entropy": 1.949856935441494, + "epoch": 0.5532103856368654, + "grad_norm": 7.908204555511475, + "learning_rate": 3.4013297208622048e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8459553733468056, + "num_tokens": 214742166.0, + "step": 178460 + }, + { + "entropy": 1.9470009535551072, + "epoch": 0.5532413847619151, + "grad_norm": 8.177698135375977, + "learning_rate": 3.4012344276357022e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8542072013020515, + "num_tokens": 214752463.0, + "step": 178470 + }, + { + "entropy": 1.889072911441326, + "epoch": 0.5532723838869648, + "grad_norm": 7.375345230102539, + "learning_rate": 3.4011391424180885e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8569256499409675, + "num_tokens": 214764692.0, + "step": 178480 + }, + { + "entropy": 1.9212247535586358, + "epoch": 0.5533033830120145, + "grad_norm": 7.591136455535889, + "learning_rate": 3.4010438652082413e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8538636654615402, + "num_tokens": 214776359.0, + "step": 178490 + }, + { + "entropy": 1.943355268239975, + "epoch": 0.5533343821370642, + "grad_norm": 8.059382438659668, + "learning_rate": 3.4009485960050386e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8473861545324326, + "num_tokens": 214788078.0, + "step": 178500 + }, + { + "entropy": 1.9362692579627037, + "epoch": 0.5533653812621139, + "grad_norm": 9.145943641662598, + "learning_rate": 3.4008533348073603e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8416964054107666, + "num_tokens": 214800127.0, + "step": 178510 + }, + { + "entropy": 1.8370727390050887, + "epoch": 0.5533963803871635, + "grad_norm": 9.235921859741211, + "learning_rate": 3.4007580816140845e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.860569167137146, + "num_tokens": 214812714.0, + "step": 178520 + }, + { + "entropy": 1.874185086786747, + "epoch": 0.5534273795122132, + "grad_norm": 8.020057678222656, + "learning_rate": 3.4006628364240914e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.862524189054966, + "num_tokens": 214824970.0, + "step": 178530 + }, + { + "entropy": 1.887164406478405, + "epoch": 0.553458378637263, + "grad_norm": 8.896452903747559, + "learning_rate": 3.400567599236259e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8595001310110092, + "num_tokens": 214836577.0, + "step": 178540 + }, + { + "entropy": 1.9219500213861465, + "epoch": 0.5534893777623127, + "grad_norm": 9.564613342285156, + "learning_rate": 3.4004723700494674e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8518328025937081, + "num_tokens": 214847973.0, + "step": 178550 + }, + { + "entropy": 1.9421744406223298, + "epoch": 0.5535203768873623, + "grad_norm": 9.683865547180176, + "learning_rate": 3.4003771488625957e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.849106827378273, + "num_tokens": 214859099.0, + "step": 178560 + }, + { + "entropy": 1.9177052691578864, + "epoch": 0.553551376012412, + "grad_norm": 7.975922107696533, + "learning_rate": 3.4002819356745246e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.856335574388504, + "num_tokens": 214870059.0, + "step": 178570 + }, + { + "entropy": 1.8940430551767349, + "epoch": 0.5535823751374618, + "grad_norm": 8.784914016723633, + "learning_rate": 3.400186730484135e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8558658957481384, + "num_tokens": 214882455.0, + "step": 178580 + }, + { + "entropy": 1.8574804171919823, + "epoch": 0.5536133742625114, + "grad_norm": 10.208087921142578, + "learning_rate": 3.4000915332903057e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8440673872828484, + "num_tokens": 214894606.0, + "step": 178590 + }, + { + "entropy": 1.745209413766861, + "epoch": 0.5536443733875611, + "grad_norm": 7.461440086364746, + "learning_rate": 3.3999963440919182e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8850232422351837, + "num_tokens": 214908808.0, + "step": 178600 + }, + { + "entropy": 1.996240535378456, + "epoch": 0.5536753725126108, + "grad_norm": 8.930257797241211, + "learning_rate": 3.3999011628878536e-06, + "loss": 0.5221, + "mean_token_accuracy": 0.8388968229293823, + "num_tokens": 214919755.0, + "step": 178610 + }, + { + "entropy": 1.935780856013298, + "epoch": 0.5537063716376606, + "grad_norm": 8.519956588745117, + "learning_rate": 3.3998059896769924e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8615932404994965, + "num_tokens": 214931240.0, + "step": 178620 + }, + { + "entropy": 1.9266652196645737, + "epoch": 0.5537373707627102, + "grad_norm": 8.8302001953125, + "learning_rate": 3.3997108244582166e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8644925430417061, + "num_tokens": 214942808.0, + "step": 178630 + }, + { + "entropy": 1.9325824990868568, + "epoch": 0.5537683698877599, + "grad_norm": 6.90614652633667, + "learning_rate": 3.399615667230407e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8497261658310891, + "num_tokens": 214953920.0, + "step": 178640 + }, + { + "entropy": 1.8539125517010688, + "epoch": 0.5537993690128096, + "grad_norm": 3.6557509899139404, + "learning_rate": 3.3995205179924456e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8620917737483978, + "num_tokens": 214966871.0, + "step": 178650 + }, + { + "entropy": 1.8373021736741066, + "epoch": 0.5538303681378594, + "grad_norm": 4.263208866119385, + "learning_rate": 3.399425376743214e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8628732219338417, + "num_tokens": 214979239.0, + "step": 178660 + }, + { + "entropy": 1.8937377348542213, + "epoch": 0.553861367262909, + "grad_norm": 8.342369079589844, + "learning_rate": 3.399330243481595e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8600796848535538, + "num_tokens": 214990681.0, + "step": 178670 + }, + { + "entropy": 1.9099412277340888, + "epoch": 0.5538923663879587, + "grad_norm": 9.195755958557129, + "learning_rate": 3.3992351182064708e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8553076282143592, + "num_tokens": 215003112.0, + "step": 178680 + }, + { + "entropy": 1.8709931746125221, + "epoch": 0.5539233655130084, + "grad_norm": 7.637581825256348, + "learning_rate": 3.3991400009167243e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8645595505833625, + "num_tokens": 215014645.0, + "step": 178690 + }, + { + "entropy": 1.8919344574213028, + "epoch": 0.5539543646380581, + "grad_norm": 9.006782531738281, + "learning_rate": 3.3990448916112375e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8571132674813271, + "num_tokens": 215025975.0, + "step": 178700 + }, + { + "entropy": 1.8450028151273727, + "epoch": 0.5539853637631078, + "grad_norm": 3.9442358016967773, + "learning_rate": 3.398949790288894e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8657394587993622, + "num_tokens": 215038005.0, + "step": 178710 + }, + { + "entropy": 1.8893791273236276, + "epoch": 0.5540163628881575, + "grad_norm": 9.21173095703125, + "learning_rate": 3.398854696948577e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8602850198745727, + "num_tokens": 215050378.0, + "step": 178720 + }, + { + "entropy": 1.856932234764099, + "epoch": 0.5540473620132071, + "grad_norm": 9.254220962524414, + "learning_rate": 3.3987596115891695e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8598960101604461, + "num_tokens": 215063082.0, + "step": 178730 + }, + { + "entropy": 1.9601772159337998, + "epoch": 0.5540783611382569, + "grad_norm": 8.915283203125, + "learning_rate": 3.3986645342095564e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.8410410761833191, + "num_tokens": 215074576.0, + "step": 178740 + }, + { + "entropy": 1.8779058650135994, + "epoch": 0.5541093602633066, + "grad_norm": 7.433322429656982, + "learning_rate": 3.3985694648086206e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8639277085661888, + "num_tokens": 215086708.0, + "step": 178750 + }, + { + "entropy": 1.8679771453142167, + "epoch": 0.5541403593883563, + "grad_norm": 7.492850303649902, + "learning_rate": 3.3984744033852464e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8661182969808578, + "num_tokens": 215098777.0, + "step": 178760 + }, + { + "entropy": 1.8983294278383256, + "epoch": 0.5541713585134059, + "grad_norm": 3.626197338104248, + "learning_rate": 3.3983793499383184e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8521765947341919, + "num_tokens": 215109847.0, + "step": 178770 + }, + { + "entropy": 1.808673584461212, + "epoch": 0.5542023576384556, + "grad_norm": 2.6603171825408936, + "learning_rate": 3.3982843044667215e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8597457781434059, + "num_tokens": 215122763.0, + "step": 178780 + }, + { + "entropy": 1.8716325148940087, + "epoch": 0.5542333567635054, + "grad_norm": 6.67529821395874, + "learning_rate": 3.3981892669693396e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8609462291002273, + "num_tokens": 215134946.0, + "step": 178790 + }, + { + "entropy": 1.8793714240193367, + "epoch": 0.554264355888555, + "grad_norm": 8.348356246948242, + "learning_rate": 3.398094237445058e-06, + "loss": 0.415, + "mean_token_accuracy": 0.855781489610672, + "num_tokens": 215147406.0, + "step": 178800 + }, + { + "entropy": 2.0147987425327303, + "epoch": 0.5542953550136047, + "grad_norm": 8.565105438232422, + "learning_rate": 3.3979992158927623e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8423831447958946, + "num_tokens": 215157966.0, + "step": 178810 + }, + { + "entropy": 1.9154839143157005, + "epoch": 0.5543263541386544, + "grad_norm": 5.788300037384033, + "learning_rate": 3.397904202311338e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8576369374990463, + "num_tokens": 215169739.0, + "step": 178820 + }, + { + "entropy": 1.8988683700561524, + "epoch": 0.5543573532637042, + "grad_norm": 4.657011032104492, + "learning_rate": 3.39780919669967e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8491096451878548, + "num_tokens": 215182189.0, + "step": 178830 + }, + { + "entropy": 1.9139982163906097, + "epoch": 0.5543883523887538, + "grad_norm": 8.855412483215332, + "learning_rate": 3.3977141990566454e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8528054267168045, + "num_tokens": 215193816.0, + "step": 178840 + }, + { + "entropy": 1.8482922896742822, + "epoch": 0.5544193515138035, + "grad_norm": 4.143898963928223, + "learning_rate": 3.3976192093811496e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8553830787539483, + "num_tokens": 215206994.0, + "step": 178850 + }, + { + "entropy": 1.862758006155491, + "epoch": 0.5544503506388532, + "grad_norm": 7.396191120147705, + "learning_rate": 3.397524227672068e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8633818462491035, + "num_tokens": 215219502.0, + "step": 178860 + }, + { + "entropy": 1.8616106539964676, + "epoch": 0.554481349763903, + "grad_norm": 8.938167572021484, + "learning_rate": 3.3974292539282892e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8673127844929696, + "num_tokens": 215231589.0, + "step": 178870 + }, + { + "entropy": 1.8870980799198152, + "epoch": 0.5545123488889526, + "grad_norm": 4.069732666015625, + "learning_rate": 3.3973342881486983e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8577647238969803, + "num_tokens": 215244003.0, + "step": 178880 + }, + { + "entropy": 1.8829326719045638, + "epoch": 0.5545433480140023, + "grad_norm": 7.217538356781006, + "learning_rate": 3.397239330332184e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8619607254862786, + "num_tokens": 215256216.0, + "step": 178890 + }, + { + "entropy": 1.883328229188919, + "epoch": 0.554574347139052, + "grad_norm": 8.35718059539795, + "learning_rate": 3.3971443804776305e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8626414731144905, + "num_tokens": 215268764.0, + "step": 178900 + }, + { + "entropy": 1.814480559527874, + "epoch": 0.5546053462641017, + "grad_norm": 6.604519844055176, + "learning_rate": 3.3970494385839274e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8664494514465332, + "num_tokens": 215281709.0, + "step": 178910 + }, + { + "entropy": 1.9098477900028228, + "epoch": 0.5546363453891514, + "grad_norm": 8.933908462524414, + "learning_rate": 3.396954504649963e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8497991293668747, + "num_tokens": 215294413.0, + "step": 178920 + }, + { + "entropy": 1.907794676721096, + "epoch": 0.5546673445142011, + "grad_norm": 5.517307758331299, + "learning_rate": 3.3968595786746234e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8596857756376266, + "num_tokens": 215305421.0, + "step": 178930 + }, + { + "entropy": 1.9482158571481705, + "epoch": 0.5546983436392507, + "grad_norm": 3.700831890106201, + "learning_rate": 3.396764660656798e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8476640805602074, + "num_tokens": 215317173.0, + "step": 178940 + }, + { + "entropy": 1.8691062763333322, + "epoch": 0.5547293427643005, + "grad_norm": 5.681051254272461, + "learning_rate": 3.3966697505953737e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8514941841363907, + "num_tokens": 215330505.0, + "step": 178950 + }, + { + "entropy": 1.9252370223402977, + "epoch": 0.5547603418893502, + "grad_norm": 7.053623199462891, + "learning_rate": 3.3965748484892403e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8586797535419464, + "num_tokens": 215342454.0, + "step": 178960 + }, + { + "entropy": 1.8300693720579146, + "epoch": 0.5547913410143999, + "grad_norm": 6.6290998458862305, + "learning_rate": 3.3964799543372855e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8510792776942253, + "num_tokens": 215356481.0, + "step": 178970 + }, + { + "entropy": 1.9910599797964097, + "epoch": 0.5548223401394495, + "grad_norm": 8.265558242797852, + "learning_rate": 3.396385068138399e-06, + "loss": 0.5201, + "mean_token_accuracy": 0.8351536065340042, + "num_tokens": 215367564.0, + "step": 178980 + }, + { + "entropy": 1.843056969344616, + "epoch": 0.5548533392644993, + "grad_norm": 8.04113483428955, + "learning_rate": 3.3962901898914703e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8625307679176331, + "num_tokens": 215380757.0, + "step": 178990 + }, + { + "entropy": 1.9277337312698364, + "epoch": 0.554884338389549, + "grad_norm": 8.615413665771484, + "learning_rate": 3.3961953195953873e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8572276189923287, + "num_tokens": 215392044.0, + "step": 179000 + }, + { + "entropy": 1.9238358929753303, + "epoch": 0.5549153375145986, + "grad_norm": 8.563727378845215, + "learning_rate": 3.3961004572490406e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8628179371356964, + "num_tokens": 215404277.0, + "step": 179010 + }, + { + "entropy": 1.9175738573074341, + "epoch": 0.5549463366396483, + "grad_norm": 8.990974426269531, + "learning_rate": 3.3960056028513206e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8585062682628631, + "num_tokens": 215416501.0, + "step": 179020 + }, + { + "entropy": 1.9726496756076812, + "epoch": 0.554977335764698, + "grad_norm": 8.099010467529297, + "learning_rate": 3.395910756401116e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8548742011189461, + "num_tokens": 215427859.0, + "step": 179030 + }, + { + "entropy": 1.9836474925279617, + "epoch": 0.5550083348897478, + "grad_norm": 10.268874168395996, + "learning_rate": 3.3958159178973173e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8632662326097489, + "num_tokens": 215438192.0, + "step": 179040 + }, + { + "entropy": 1.8734207972884178, + "epoch": 0.5550393340147974, + "grad_norm": 8.809417724609375, + "learning_rate": 3.3957210873388156e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8660463884472847, + "num_tokens": 215450883.0, + "step": 179050 + }, + { + "entropy": 1.8947151079773903, + "epoch": 0.5550703331398471, + "grad_norm": 8.365950584411621, + "learning_rate": 3.395626264724501e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8539259016513825, + "num_tokens": 215462732.0, + "step": 179060 + }, + { + "entropy": 1.9040836334228515, + "epoch": 0.5551013322648968, + "grad_norm": 8.468751907348633, + "learning_rate": 3.3955314500532647e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.863848377764225, + "num_tokens": 215475111.0, + "step": 179070 + }, + { + "entropy": 1.9592783853411675, + "epoch": 0.5551323313899466, + "grad_norm": 10.57170581817627, + "learning_rate": 3.395436643323997e-06, + "loss": 0.472, + "mean_token_accuracy": 0.848185470700264, + "num_tokens": 215486025.0, + "step": 179080 + }, + { + "entropy": 1.7874959081411361, + "epoch": 0.5551633305149962, + "grad_norm": 9.114726066589355, + "learning_rate": 3.395341844535591e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8718330755829811, + "num_tokens": 215500439.0, + "step": 179090 + }, + { + "entropy": 1.9357839733362199, + "epoch": 0.5551943296400459, + "grad_norm": 9.298722267150879, + "learning_rate": 3.3952470536869374e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8474046647548675, + "num_tokens": 215511982.0, + "step": 179100 + }, + { + "entropy": 1.9739779576659202, + "epoch": 0.5552253287650956, + "grad_norm": 3.7368788719177246, + "learning_rate": 3.395152270776927e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8470907166600228, + "num_tokens": 215523514.0, + "step": 179110 + }, + { + "entropy": 1.9102428033947945, + "epoch": 0.5552563278901453, + "grad_norm": 4.0481109619140625, + "learning_rate": 3.3950574958044526e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.863105945289135, + "num_tokens": 215535555.0, + "step": 179120 + }, + { + "entropy": 1.9373560398817062, + "epoch": 0.555287327015195, + "grad_norm": 6.839097023010254, + "learning_rate": 3.3949627287684063e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.862397214770317, + "num_tokens": 215546809.0, + "step": 179130 + }, + { + "entropy": 1.9235127955675124, + "epoch": 0.5553183261402447, + "grad_norm": 8.559110641479492, + "learning_rate": 3.39486796966768e-06, + "loss": 0.5766, + "mean_token_accuracy": 0.8470494523644447, + "num_tokens": 215558903.0, + "step": 179140 + }, + { + "entropy": 1.9319372728466988, + "epoch": 0.5553493252652943, + "grad_norm": 8.896299362182617, + "learning_rate": 3.3947732185011684e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8440060004591942, + "num_tokens": 215570155.0, + "step": 179150 + }, + { + "entropy": 1.8521119982004166, + "epoch": 0.5553803243903441, + "grad_norm": 8.03239631652832, + "learning_rate": 3.3946784752677613e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8644193202257157, + "num_tokens": 215581883.0, + "step": 179160 + }, + { + "entropy": 1.849208802729845, + "epoch": 0.5554113235153938, + "grad_norm": 5.667628765106201, + "learning_rate": 3.394583739966354e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8646664693951607, + "num_tokens": 215595076.0, + "step": 179170 + }, + { + "entropy": 1.8232768312096597, + "epoch": 0.5554423226404435, + "grad_norm": 3.93361234664917, + "learning_rate": 3.3944890125958384e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8674084261059761, + "num_tokens": 215607710.0, + "step": 179180 + }, + { + "entropy": 1.901517029106617, + "epoch": 0.5554733217654931, + "grad_norm": 8.467094421386719, + "learning_rate": 3.3943942931551087e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8507841497659683, + "num_tokens": 215619794.0, + "step": 179190 + }, + { + "entropy": 1.8633908212184906, + "epoch": 0.5555043208905429, + "grad_norm": 8.088972091674805, + "learning_rate": 3.394299581643059e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8526325955986976, + "num_tokens": 215632836.0, + "step": 179200 + }, + { + "entropy": 1.8850948512554169, + "epoch": 0.5555353200155926, + "grad_norm": 7.233824729919434, + "learning_rate": 3.3942048780585822e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8667260557413101, + "num_tokens": 215644918.0, + "step": 179210 + }, + { + "entropy": 1.9587379336357116, + "epoch": 0.5555663191406423, + "grad_norm": 10.04115104675293, + "learning_rate": 3.394110182400573e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8412050887942314, + "num_tokens": 215656229.0, + "step": 179220 + }, + { + "entropy": 1.899441859126091, + "epoch": 0.5555973182656919, + "grad_norm": 9.780128479003906, + "learning_rate": 3.394015494667926e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8703594207763672, + "num_tokens": 215667872.0, + "step": 179230 + }, + { + "entropy": 1.8831466823816299, + "epoch": 0.5556283173907417, + "grad_norm": 7.8261260986328125, + "learning_rate": 3.393920814859535e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8532602831721305, + "num_tokens": 215680339.0, + "step": 179240 + }, + { + "entropy": 1.9097533985972404, + "epoch": 0.5556593165157914, + "grad_norm": 9.267577171325684, + "learning_rate": 3.393826142974295e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.862908025085926, + "num_tokens": 215691416.0, + "step": 179250 + }, + { + "entropy": 1.9572410345077516, + "epoch": 0.555690315640841, + "grad_norm": 8.92303466796875, + "learning_rate": 3.3937314790111027e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8508604303002357, + "num_tokens": 215702129.0, + "step": 179260 + }, + { + "entropy": 1.8611516401171684, + "epoch": 0.5557213147658907, + "grad_norm": 5.001405715942383, + "learning_rate": 3.3936368229688506e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8587961971759797, + "num_tokens": 215715265.0, + "step": 179270 + }, + { + "entropy": 1.8070557743310929, + "epoch": 0.5557523138909404, + "grad_norm": 2.886479616165161, + "learning_rate": 3.3935421748464355e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8679655134677887, + "num_tokens": 215728529.0, + "step": 179280 + }, + { + "entropy": 1.9148321211338044, + "epoch": 0.5557833130159902, + "grad_norm": 3.6641204357147217, + "learning_rate": 3.393447534642752e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8596915438771248, + "num_tokens": 215740571.0, + "step": 179290 + }, + { + "entropy": 1.9517053723335267, + "epoch": 0.5558143121410398, + "grad_norm": 8.067102432250977, + "learning_rate": 3.393352902356698e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8586618512868881, + "num_tokens": 215752177.0, + "step": 179300 + }, + { + "entropy": 1.9378379747271537, + "epoch": 0.5558453112660895, + "grad_norm": 4.564868927001953, + "learning_rate": 3.3932582779871685e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.855698038637638, + "num_tokens": 215763713.0, + "step": 179310 + }, + { + "entropy": 1.9738548263907432, + "epoch": 0.5558763103911392, + "grad_norm": 8.826933860778809, + "learning_rate": 3.393163661533059e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8455957636237145, + "num_tokens": 215774828.0, + "step": 179320 + }, + { + "entropy": 1.8536319456994534, + "epoch": 0.5559073095161889, + "grad_norm": 10.126803398132324, + "learning_rate": 3.3930690529932682e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.857026931643486, + "num_tokens": 215788064.0, + "step": 179330 + }, + { + "entropy": 1.8911743879318237, + "epoch": 0.5559383086412386, + "grad_norm": 3.8450915813446045, + "learning_rate": 3.3929744523666895e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8661640584468842, + "num_tokens": 215800499.0, + "step": 179340 + }, + { + "entropy": 1.8529613867402077, + "epoch": 0.5559693077662883, + "grad_norm": 7.823770523071289, + "learning_rate": 3.3928798596522235e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8706428721547127, + "num_tokens": 215812209.0, + "step": 179350 + }, + { + "entropy": 1.972227230668068, + "epoch": 0.556000306891338, + "grad_norm": 8.537871360778809, + "learning_rate": 3.3927852748487644e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8534824714064598, + "num_tokens": 215823150.0, + "step": 179360 + }, + { + "entropy": 1.8717477947473526, + "epoch": 0.5560313060163877, + "grad_norm": 9.057830810546875, + "learning_rate": 3.392690697955211e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8547875434160233, + "num_tokens": 215835082.0, + "step": 179370 + }, + { + "entropy": 1.9645709812641143, + "epoch": 0.5560623051414374, + "grad_norm": 8.062889099121094, + "learning_rate": 3.3925961289704608e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.847657410800457, + "num_tokens": 215846350.0, + "step": 179380 + }, + { + "entropy": 1.8887288227677346, + "epoch": 0.5560933042664871, + "grad_norm": 9.724749565124512, + "learning_rate": 3.3925015678934114e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8625921666622162, + "num_tokens": 215857447.0, + "step": 179390 + }, + { + "entropy": 1.8312772125005723, + "epoch": 0.5561243033915367, + "grad_norm": 8.317710876464844, + "learning_rate": 3.3924070147229606e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8607356131076813, + "num_tokens": 215869731.0, + "step": 179400 + }, + { + "entropy": 1.8348938211798669, + "epoch": 0.5561553025165865, + "grad_norm": 3.625267505645752, + "learning_rate": 3.392312469458007e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8623997822403908, + "num_tokens": 215882539.0, + "step": 179410 + }, + { + "entropy": 1.9049311459064484, + "epoch": 0.5561863016416362, + "grad_norm": 8.238363265991211, + "learning_rate": 3.3922179320974487e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8547548159956933, + "num_tokens": 215893749.0, + "step": 179420 + }, + { + "entropy": 1.9604707762598992, + "epoch": 0.5562173007666859, + "grad_norm": 4.5083842277526855, + "learning_rate": 3.392123402640185e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.842636513710022, + "num_tokens": 215905283.0, + "step": 179430 + }, + { + "entropy": 1.9151503935456275, + "epoch": 0.5562482998917355, + "grad_norm": 8.561266899108887, + "learning_rate": 3.3920288810851136e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8592620491981506, + "num_tokens": 215917371.0, + "step": 179440 + }, + { + "entropy": 1.9202652871608734, + "epoch": 0.5562792990167853, + "grad_norm": 8.415385246276855, + "learning_rate": 3.3919343674311346e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8525185972452164, + "num_tokens": 215928244.0, + "step": 179450 + }, + { + "entropy": 1.9669117599725723, + "epoch": 0.556310298141835, + "grad_norm": 9.637960433959961, + "learning_rate": 3.3918398616771477e-06, + "loss": 0.536, + "mean_token_accuracy": 0.8396474197506905, + "num_tokens": 215939994.0, + "step": 179460 + }, + { + "entropy": 1.8051816076040268, + "epoch": 0.5563412972668846, + "grad_norm": 3.206972599029541, + "learning_rate": 3.391745363822051e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8714934229850769, + "num_tokens": 215953601.0, + "step": 179470 + }, + { + "entropy": 1.7923723876476287, + "epoch": 0.5563722963919343, + "grad_norm": 9.600335121154785, + "learning_rate": 3.3916508738647453e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8598181501030921, + "num_tokens": 215966356.0, + "step": 179480 + }, + { + "entropy": 1.7627654626965523, + "epoch": 0.5564032955169841, + "grad_norm": 9.009991645812988, + "learning_rate": 3.3915563918041296e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8625359818339348, + "num_tokens": 215979553.0, + "step": 179490 + }, + { + "entropy": 1.9511140018701554, + "epoch": 0.5564342946420338, + "grad_norm": 7.965838432312012, + "learning_rate": 3.3914619176391057e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8408928826451302, + "num_tokens": 215990229.0, + "step": 179500 + }, + { + "entropy": 1.8817020952701569, + "epoch": 0.5564652937670834, + "grad_norm": 6.978695869445801, + "learning_rate": 3.3913674513685724e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8560591101646423, + "num_tokens": 216001836.0, + "step": 179510 + }, + { + "entropy": 1.8947572827339172, + "epoch": 0.5564962928921331, + "grad_norm": 8.471761703491211, + "learning_rate": 3.391272992991431e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8558598890900612, + "num_tokens": 216012863.0, + "step": 179520 + }, + { + "entropy": 1.8559965267777443, + "epoch": 0.5565272920171828, + "grad_norm": 8.303871154785156, + "learning_rate": 3.3911785425065817e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8528834328055381, + "num_tokens": 216025581.0, + "step": 179530 + }, + { + "entropy": 1.9025905281305313, + "epoch": 0.5565582911422325, + "grad_norm": 4.735015392303467, + "learning_rate": 3.391084099912926e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.846812778711319, + "num_tokens": 216037600.0, + "step": 179540 + }, + { + "entropy": 1.9127229869365692, + "epoch": 0.5565892902672822, + "grad_norm": 6.8129777908325195, + "learning_rate": 3.3909896652093653e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8685696572065353, + "num_tokens": 216050178.0, + "step": 179550 + }, + { + "entropy": 1.8396325334906578, + "epoch": 0.5566202893923319, + "grad_norm": 4.305300712585449, + "learning_rate": 3.3908952383948007e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8576811730861664, + "num_tokens": 216063083.0, + "step": 179560 + }, + { + "entropy": 1.8920865491032601, + "epoch": 0.5566512885173815, + "grad_norm": 8.195730209350586, + "learning_rate": 3.3908008194681347e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8563900366425514, + "num_tokens": 216074865.0, + "step": 179570 + }, + { + "entropy": 1.8664014101028443, + "epoch": 0.5566822876424313, + "grad_norm": 6.416919231414795, + "learning_rate": 3.3907064084282674e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8568268820643425, + "num_tokens": 216086684.0, + "step": 179580 + }, + { + "entropy": 1.8229648873209954, + "epoch": 0.556713286767481, + "grad_norm": 4.24645471572876, + "learning_rate": 3.3906120052741033e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8657773956656456, + "num_tokens": 216099331.0, + "step": 179590 + }, + { + "entropy": 1.9399886280298233, + "epoch": 0.5567442858925307, + "grad_norm": 8.29112720489502, + "learning_rate": 3.3905176100045423e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8519351586699486, + "num_tokens": 216111189.0, + "step": 179600 + }, + { + "entropy": 1.936243087053299, + "epoch": 0.5567752850175803, + "grad_norm": 6.696967124938965, + "learning_rate": 3.3904232226184886e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8442001849412918, + "num_tokens": 216122264.0, + "step": 179610 + }, + { + "entropy": 1.9167515233159065, + "epoch": 0.5568062841426301, + "grad_norm": 8.134096145629883, + "learning_rate": 3.390328843114844e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8545194461941719, + "num_tokens": 216133960.0, + "step": 179620 + }, + { + "entropy": 1.9942961156368255, + "epoch": 0.5568372832676798, + "grad_norm": 9.308036804199219, + "learning_rate": 3.390234471492512e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8635125383734703, + "num_tokens": 216144747.0, + "step": 179630 + }, + { + "entropy": 1.862863690406084, + "epoch": 0.5568682823927295, + "grad_norm": 9.135284423828125, + "learning_rate": 3.390140107750395e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8429133668541908, + "num_tokens": 216157517.0, + "step": 179640 + }, + { + "entropy": 1.9794843196868896, + "epoch": 0.5568992815177791, + "grad_norm": 6.671160697937012, + "learning_rate": 3.390045751887397e-06, + "loss": 0.533, + "mean_token_accuracy": 0.8423982381820678, + "num_tokens": 216167900.0, + "step": 179650 + }, + { + "entropy": 1.9758970573544503, + "epoch": 0.5569302806428289, + "grad_norm": 8.529526710510254, + "learning_rate": 3.389951403902422e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8486952930688858, + "num_tokens": 216179421.0, + "step": 179660 + }, + { + "entropy": 1.8976741746068, + "epoch": 0.5569612797678786, + "grad_norm": 8.507506370544434, + "learning_rate": 3.389857063794373e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8545589908957482, + "num_tokens": 216191649.0, + "step": 179670 + }, + { + "entropy": 1.9042115330696106, + "epoch": 0.5569922788929282, + "grad_norm": 3.675581455230713, + "learning_rate": 3.389762731562154e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8550908967852593, + "num_tokens": 216203571.0, + "step": 179680 + }, + { + "entropy": 1.9547617256641387, + "epoch": 0.5570232780179779, + "grad_norm": 7.903559684753418, + "learning_rate": 3.38966840720467e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8654710337519645, + "num_tokens": 216214089.0, + "step": 179690 + }, + { + "entropy": 1.9081181332468986, + "epoch": 0.5570542771430277, + "grad_norm": 4.779351711273193, + "learning_rate": 3.3895740907208246e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8563227370381356, + "num_tokens": 216225286.0, + "step": 179700 + }, + { + "entropy": 1.9601490914821624, + "epoch": 0.5570852762680774, + "grad_norm": 8.551769256591797, + "learning_rate": 3.3894797821095223e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8430069953203201, + "num_tokens": 216236582.0, + "step": 179710 + }, + { + "entropy": 1.936085006594658, + "epoch": 0.557116275393127, + "grad_norm": 6.998064041137695, + "learning_rate": 3.3893854813696696e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8596946880221367, + "num_tokens": 216247995.0, + "step": 179720 + }, + { + "entropy": 1.9320401161909104, + "epoch": 0.5571472745181767, + "grad_norm": 7.7311272621154785, + "learning_rate": 3.38929118850017e-06, + "loss": 0.5353, + "mean_token_accuracy": 0.8525026142597198, + "num_tokens": 216259406.0, + "step": 179730 + }, + { + "entropy": 1.9668843865394592, + "epoch": 0.5571782736432265, + "grad_norm": 7.928240776062012, + "learning_rate": 3.389196903499929e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.8433253467082977, + "num_tokens": 216270626.0, + "step": 179740 + }, + { + "entropy": 1.895675989985466, + "epoch": 0.5572092727682761, + "grad_norm": 7.530059337615967, + "learning_rate": 3.3891026263678523e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.86041489392519, + "num_tokens": 216282050.0, + "step": 179750 + }, + { + "entropy": 1.8358202219009399, + "epoch": 0.5572402718933258, + "grad_norm": 3.5207154750823975, + "learning_rate": 3.389008357102846e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8601732328534126, + "num_tokens": 216294458.0, + "step": 179760 + }, + { + "entropy": 1.9326426953077316, + "epoch": 0.5572712710183755, + "grad_norm": 8.358792304992676, + "learning_rate": 3.3889140957038156e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8487183004617691, + "num_tokens": 216305382.0, + "step": 179770 + }, + { + "entropy": 1.9060441732406617, + "epoch": 0.5573022701434251, + "grad_norm": 6.259328842163086, + "learning_rate": 3.3888198421696677e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8615023106336593, + "num_tokens": 216317172.0, + "step": 179780 + }, + { + "entropy": 1.8866166576743126, + "epoch": 0.5573332692684749, + "grad_norm": 8.117493629455566, + "learning_rate": 3.3887255964993077e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8601656764745712, + "num_tokens": 216328605.0, + "step": 179790 + }, + { + "entropy": 1.8952102825045585, + "epoch": 0.5573642683935246, + "grad_norm": 10.748100280761719, + "learning_rate": 3.388631358691643e-06, + "loss": 0.439, + "mean_token_accuracy": 0.853930875658989, + "num_tokens": 216340153.0, + "step": 179800 + }, + { + "entropy": 1.8524406239390374, + "epoch": 0.5573952675185743, + "grad_norm": 8.204578399658203, + "learning_rate": 3.3885371287455803e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8580199882388115, + "num_tokens": 216351612.0, + "step": 179810 + }, + { + "entropy": 1.910023505985737, + "epoch": 0.5574262666436239, + "grad_norm": 9.937796592712402, + "learning_rate": 3.3884429066600267e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8472720056772232, + "num_tokens": 216362609.0, + "step": 179820 + }, + { + "entropy": 1.9104634687304496, + "epoch": 0.5574572657686737, + "grad_norm": 6.716761589050293, + "learning_rate": 3.3883486924338895e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8515969440340996, + "num_tokens": 216374245.0, + "step": 179830 + }, + { + "entropy": 1.8826022177934647, + "epoch": 0.5574882648937234, + "grad_norm": 4.184479713439941, + "learning_rate": 3.388254486066074e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8498593136668205, + "num_tokens": 216385180.0, + "step": 179840 + }, + { + "entropy": 1.8581200003623963, + "epoch": 0.557519264018773, + "grad_norm": 6.967178821563721, + "learning_rate": 3.388160287555491e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8627260774374008, + "num_tokens": 216398368.0, + "step": 179850 + }, + { + "entropy": 1.8448219254612923, + "epoch": 0.5575502631438227, + "grad_norm": 7.474876403808594, + "learning_rate": 3.388066096901047e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.866121557354927, + "num_tokens": 216410681.0, + "step": 179860 + }, + { + "entropy": 1.9550578266382217, + "epoch": 0.5575812622688725, + "grad_norm": 7.641628742218018, + "learning_rate": 3.38797191410165e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8485138610005378, + "num_tokens": 216421405.0, + "step": 179870 + }, + { + "entropy": 1.873170644044876, + "epoch": 0.5576122613939222, + "grad_norm": 3.591663360595703, + "learning_rate": 3.3878777391562086e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8564320370554924, + "num_tokens": 216433518.0, + "step": 179880 + }, + { + "entropy": 1.9816926151514054, + "epoch": 0.5576432605189718, + "grad_norm": 8.375580787658691, + "learning_rate": 3.3877835720636297e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8439663261175155, + "num_tokens": 216444283.0, + "step": 179890 + }, + { + "entropy": 1.9584947019815444, + "epoch": 0.5576742596440215, + "grad_norm": 7.122523784637451, + "learning_rate": 3.387689412822824e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8492183342576027, + "num_tokens": 216455129.0, + "step": 179900 + }, + { + "entropy": 1.8550873950123787, + "epoch": 0.5577052587690713, + "grad_norm": 4.019259929656982, + "learning_rate": 3.3875952614327e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8580047935247421, + "num_tokens": 216467330.0, + "step": 179910 + }, + { + "entropy": 1.985216537117958, + "epoch": 0.557736257894121, + "grad_norm": 9.813446998596191, + "learning_rate": 3.387501117892166e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8531203463673591, + "num_tokens": 216477979.0, + "step": 179920 + }, + { + "entropy": 1.923241350054741, + "epoch": 0.5577672570191706, + "grad_norm": 8.68863582611084, + "learning_rate": 3.387406982200133e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8578556135296822, + "num_tokens": 216489302.0, + "step": 179930 + }, + { + "entropy": 1.8665193885564804, + "epoch": 0.5577982561442203, + "grad_norm": 7.6800618171691895, + "learning_rate": 3.3873128543555078e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8479210451245308, + "num_tokens": 216502141.0, + "step": 179940 + }, + { + "entropy": 1.848955325782299, + "epoch": 0.5578292552692701, + "grad_norm": 6.941798686981201, + "learning_rate": 3.3872187343572026e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8580316364765167, + "num_tokens": 216514646.0, + "step": 179950 + }, + { + "entropy": 1.868771779537201, + "epoch": 0.5578602543943197, + "grad_norm": 3.62846302986145, + "learning_rate": 3.3871246222041264e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8587727159261703, + "num_tokens": 216526552.0, + "step": 179960 + }, + { + "entropy": 1.9015920877456665, + "epoch": 0.5578912535193694, + "grad_norm": 7.458217620849609, + "learning_rate": 3.3870305178951897e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8526492983102798, + "num_tokens": 216537686.0, + "step": 179970 + }, + { + "entropy": 1.890255093574524, + "epoch": 0.5579222526444191, + "grad_norm": 9.054282188415527, + "learning_rate": 3.386936421429302e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8584634244441987, + "num_tokens": 216550036.0, + "step": 179980 + }, + { + "entropy": 1.8730426907539368, + "epoch": 0.5579532517694689, + "grad_norm": 8.789347648620605, + "learning_rate": 3.3868423328053756e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8546410590410233, + "num_tokens": 216561857.0, + "step": 179990 + }, + { + "entropy": 1.8982899636030197, + "epoch": 0.5579842508945185, + "grad_norm": 4.101022243499756, + "learning_rate": 3.38674825202232e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8562424078583717, + "num_tokens": 216573765.0, + "step": 180000 + }, + { + "entropy": 1.9094096958637237, + "epoch": 0.5580152500195682, + "grad_norm": 8.289060592651367, + "learning_rate": 3.3866541790790464e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8647675678133965, + "num_tokens": 216585551.0, + "step": 180010 + }, + { + "entropy": 1.8832878440618515, + "epoch": 0.5580462491446179, + "grad_norm": 9.198235511779785, + "learning_rate": 3.386560113974466e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8545297279953956, + "num_tokens": 216597949.0, + "step": 180020 + }, + { + "entropy": 1.8627006128430366, + "epoch": 0.5580772482696675, + "grad_norm": 7.299589157104492, + "learning_rate": 3.386466056707491e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8517317429184914, + "num_tokens": 216610160.0, + "step": 180030 + }, + { + "entropy": 1.8306142285466194, + "epoch": 0.5581082473947173, + "grad_norm": 8.921402931213379, + "learning_rate": 3.386372007277032e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8578998699784279, + "num_tokens": 216623520.0, + "step": 180040 + }, + { + "entropy": 1.8765130326151849, + "epoch": 0.558139246519767, + "grad_norm": 2.5298173427581787, + "learning_rate": 3.386277965682002e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8659792527556419, + "num_tokens": 216635942.0, + "step": 180050 + }, + { + "entropy": 1.8612042382359504, + "epoch": 0.5581702456448167, + "grad_norm": 7.690088748931885, + "learning_rate": 3.3861839319213115e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8645087346434593, + "num_tokens": 216647774.0, + "step": 180060 + }, + { + "entropy": 1.8710038140416145, + "epoch": 0.5582012447698663, + "grad_norm": 7.982130527496338, + "learning_rate": 3.386089905993874e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8475197136402131, + "num_tokens": 216659937.0, + "step": 180070 + }, + { + "entropy": 1.8144805401563644, + "epoch": 0.5582322438949161, + "grad_norm": 7.199626445770264, + "learning_rate": 3.3859958878986027e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8654684230685235, + "num_tokens": 216672789.0, + "step": 180080 + }, + { + "entropy": 1.9647779405117034, + "epoch": 0.5582632430199658, + "grad_norm": 7.467442035675049, + "learning_rate": 3.385901877634408e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8525005370378494, + "num_tokens": 216683584.0, + "step": 180090 + }, + { + "entropy": 1.9427626758813858, + "epoch": 0.5582942421450154, + "grad_norm": 8.000640869140625, + "learning_rate": 3.3858078752002057e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8522224456071854, + "num_tokens": 216694614.0, + "step": 180100 + }, + { + "entropy": 1.8860655903816224, + "epoch": 0.5583252412700651, + "grad_norm": 8.802091598510742, + "learning_rate": 3.3857138805949064e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8601176783442497, + "num_tokens": 216706596.0, + "step": 180110 + }, + { + "entropy": 1.8183810651302337, + "epoch": 0.5583562403951149, + "grad_norm": 9.341395378112793, + "learning_rate": 3.3856198938174252e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8621927976608277, + "num_tokens": 216718825.0, + "step": 180120 + }, + { + "entropy": 1.816829715669155, + "epoch": 0.5583872395201646, + "grad_norm": 8.577747344970703, + "learning_rate": 3.3855259148666746e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8645969301462173, + "num_tokens": 216731098.0, + "step": 180130 + }, + { + "entropy": 1.8784575372934342, + "epoch": 0.5584182386452142, + "grad_norm": 7.502293109893799, + "learning_rate": 3.3854319437415695e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8556536450982094, + "num_tokens": 216742776.0, + "step": 180140 + }, + { + "entropy": 1.9293906539678574, + "epoch": 0.5584492377702639, + "grad_norm": 4.2252326011657715, + "learning_rate": 3.3853379804410227e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8556514650583267, + "num_tokens": 216753738.0, + "step": 180150 + }, + { + "entropy": 1.9010004952549935, + "epoch": 0.5584802368953137, + "grad_norm": 7.127376079559326, + "learning_rate": 3.385244024963948e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8559280022978782, + "num_tokens": 216765075.0, + "step": 180160 + }, + { + "entropy": 1.877715417742729, + "epoch": 0.5585112360203633, + "grad_norm": 8.160017013549805, + "learning_rate": 3.3851500773092618e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8626894101500511, + "num_tokens": 216776057.0, + "step": 180170 + }, + { + "entropy": 1.9002387523651123, + "epoch": 0.558542235145413, + "grad_norm": 3.840376377105713, + "learning_rate": 3.385056137475877e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8568391934037208, + "num_tokens": 216787593.0, + "step": 180180 + }, + { + "entropy": 1.956373357772827, + "epoch": 0.5585732342704627, + "grad_norm": 8.961759567260742, + "learning_rate": 3.3849622054627097e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.8425930678844452, + "num_tokens": 216798230.0, + "step": 180190 + }, + { + "entropy": 1.8474145486950875, + "epoch": 0.5586042333955125, + "grad_norm": 7.407924652099609, + "learning_rate": 3.3848682812686738e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8518683150410652, + "num_tokens": 216810802.0, + "step": 180200 + }, + { + "entropy": 1.8440819442272187, + "epoch": 0.5586352325205621, + "grad_norm": 3.6953446865081787, + "learning_rate": 3.384774364892685e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8669338792562484, + "num_tokens": 216823216.0, + "step": 180210 + }, + { + "entropy": 1.9052780613303184, + "epoch": 0.5586662316456118, + "grad_norm": 8.425951957702637, + "learning_rate": 3.3846804563336588e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8528780624270439, + "num_tokens": 216834842.0, + "step": 180220 + }, + { + "entropy": 1.8528162971138955, + "epoch": 0.5586972307706615, + "grad_norm": 7.783543586730957, + "learning_rate": 3.384586555590511e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8446239486336709, + "num_tokens": 216846620.0, + "step": 180230 + }, + { + "entropy": 1.8845822378993033, + "epoch": 0.5587282298957112, + "grad_norm": 7.422481060028076, + "learning_rate": 3.3844926626621576e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8657801419496536, + "num_tokens": 216857442.0, + "step": 180240 + }, + { + "entropy": 1.8267993465065957, + "epoch": 0.5587592290207609, + "grad_norm": 7.172003269195557, + "learning_rate": 3.384398777547514e-06, + "loss": 0.39, + "mean_token_accuracy": 0.86737762093544, + "num_tokens": 216870883.0, + "step": 180250 + }, + { + "entropy": 1.9326736852526665, + "epoch": 0.5587902281458106, + "grad_norm": 8.402005195617676, + "learning_rate": 3.3843049002454976e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.85916518419981, + "num_tokens": 216882245.0, + "step": 180260 + }, + { + "entropy": 1.9091658741235733, + "epoch": 0.5588212272708603, + "grad_norm": 4.516397476196289, + "learning_rate": 3.3842110307550237e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8538550525903702, + "num_tokens": 216893966.0, + "step": 180270 + }, + { + "entropy": 1.828370450437069, + "epoch": 0.5588522263959099, + "grad_norm": 8.806618690490723, + "learning_rate": 3.3841171690750097e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8570792287588119, + "num_tokens": 216905937.0, + "step": 180280 + }, + { + "entropy": 1.885986079275608, + "epoch": 0.5588832255209597, + "grad_norm": 8.106322288513184, + "learning_rate": 3.3840233152043726e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8598363786935806, + "num_tokens": 216918163.0, + "step": 180290 + }, + { + "entropy": 1.7997533604502678, + "epoch": 0.5589142246460094, + "grad_norm": 4.08975076675415, + "learning_rate": 3.383929469142029e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8623867243528366, + "num_tokens": 216931475.0, + "step": 180300 + }, + { + "entropy": 1.8519815653562546, + "epoch": 0.558945223771059, + "grad_norm": 8.924751281738281, + "learning_rate": 3.3838356308868977e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8536183342337609, + "num_tokens": 216944144.0, + "step": 180310 + }, + { + "entropy": 1.8736138075590134, + "epoch": 0.5589762228961087, + "grad_norm": 8.16180419921875, + "learning_rate": 3.3837418004378943e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8557430118322372, + "num_tokens": 216956827.0, + "step": 180320 + }, + { + "entropy": 1.8750511035323143, + "epoch": 0.5590072220211585, + "grad_norm": 6.086195945739746, + "learning_rate": 3.3836479777939375e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8607206657528877, + "num_tokens": 216968537.0, + "step": 180330 + }, + { + "entropy": 1.904085485637188, + "epoch": 0.5590382211462082, + "grad_norm": 7.704912185668945, + "learning_rate": 3.3835541629539455e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.856925904750824, + "num_tokens": 216980705.0, + "step": 180340 + }, + { + "entropy": 1.8759368106722831, + "epoch": 0.5590692202712578, + "grad_norm": 7.605447292327881, + "learning_rate": 3.3834603559168363e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8501611366868019, + "num_tokens": 216992340.0, + "step": 180350 + }, + { + "entropy": 1.9152187556028366, + "epoch": 0.5591002193963075, + "grad_norm": 10.049348831176758, + "learning_rate": 3.383366556681528e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8524507984519005, + "num_tokens": 217003401.0, + "step": 180360 + }, + { + "entropy": 1.8950191289186478, + "epoch": 0.5591312185213573, + "grad_norm": 8.646200180053711, + "learning_rate": 3.38327276524694e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8536058843135834, + "num_tokens": 217015175.0, + "step": 180370 + }, + { + "entropy": 1.8828943863511085, + "epoch": 0.559162217646407, + "grad_norm": 7.870319366455078, + "learning_rate": 3.38317898161199e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8609470695257186, + "num_tokens": 217027041.0, + "step": 180380 + }, + { + "entropy": 1.9423518463969232, + "epoch": 0.5591932167714566, + "grad_norm": 7.569737911224365, + "learning_rate": 3.3830852057755987e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8526441812515259, + "num_tokens": 217038149.0, + "step": 180390 + }, + { + "entropy": 1.8567870572209357, + "epoch": 0.5592242158965063, + "grad_norm": 9.234038352966309, + "learning_rate": 3.3829914377366834e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8553133830428123, + "num_tokens": 217050007.0, + "step": 180400 + }, + { + "entropy": 1.9220700711011887, + "epoch": 0.5592552150215561, + "grad_norm": 3.5993828773498535, + "learning_rate": 3.3828976774941647e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.85472621768713, + "num_tokens": 217060818.0, + "step": 180410 + }, + { + "entropy": 1.9288207918405533, + "epoch": 0.5592862141466057, + "grad_norm": 9.777146339416504, + "learning_rate": 3.3828039250469622e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8496465310454369, + "num_tokens": 217071834.0, + "step": 180420 + }, + { + "entropy": 1.9446373760700226, + "epoch": 0.5593172132716554, + "grad_norm": 9.732808113098145, + "learning_rate": 3.3827101803939956e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8456583991646767, + "num_tokens": 217082352.0, + "step": 180430 + }, + { + "entropy": 1.8855812832713128, + "epoch": 0.5593482123967051, + "grad_norm": 8.88985538482666, + "learning_rate": 3.382616443534185e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.854129233956337, + "num_tokens": 217094782.0, + "step": 180440 + }, + { + "entropy": 1.9131209135055542, + "epoch": 0.5593792115217548, + "grad_norm": 8.299091339111328, + "learning_rate": 3.3825227144664507e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8680454924702644, + "num_tokens": 217105718.0, + "step": 180450 + }, + { + "entropy": 1.9053846567869186, + "epoch": 0.5594102106468045, + "grad_norm": 7.925025939941406, + "learning_rate": 3.382428993189713e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8584159076213836, + "num_tokens": 217117301.0, + "step": 180460 + }, + { + "entropy": 2.009468224644661, + "epoch": 0.5594412097718542, + "grad_norm": 8.280466079711914, + "learning_rate": 3.3823352797028937e-06, + "loss": 0.5359, + "mean_token_accuracy": 0.8343116879463196, + "num_tokens": 217127990.0, + "step": 180470 + }, + { + "entropy": 1.9432775482535363, + "epoch": 0.5594722088969039, + "grad_norm": 11.16329288482666, + "learning_rate": 3.3822415740049113e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8549125969409943, + "num_tokens": 217139731.0, + "step": 180480 + }, + { + "entropy": 1.9695190101861955, + "epoch": 0.5595032080219536, + "grad_norm": 7.9360151290893555, + "learning_rate": 3.3821478760946896e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8430364161729813, + "num_tokens": 217151051.0, + "step": 180490 + }, + { + "entropy": 1.9071122258901596, + "epoch": 0.5595342071470033, + "grad_norm": 8.332737922668457, + "learning_rate": 3.3820541859711486e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8503222405910492, + "num_tokens": 217163091.0, + "step": 180500 + }, + { + "entropy": 1.9587863117456437, + "epoch": 0.559565206272053, + "grad_norm": 8.474498748779297, + "learning_rate": 3.3819605036332104e-06, + "loss": 0.5526, + "mean_token_accuracy": 0.8527715653181076, + "num_tokens": 217174509.0, + "step": 180510 + }, + { + "entropy": 1.9071698397397996, + "epoch": 0.5595962053971026, + "grad_norm": 7.895602226257324, + "learning_rate": 3.381866829079796e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.854709891974926, + "num_tokens": 217187334.0, + "step": 180520 + }, + { + "entropy": 1.9282568588852882, + "epoch": 0.5596272045221523, + "grad_norm": 9.097548484802246, + "learning_rate": 3.3817731623098284e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8497316464781761, + "num_tokens": 217199374.0, + "step": 180530 + }, + { + "entropy": 1.89561807513237, + "epoch": 0.5596582036472021, + "grad_norm": 7.903514862060547, + "learning_rate": 3.3816795033222283e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8572723090648651, + "num_tokens": 217210367.0, + "step": 180540 + }, + { + "entropy": 1.8535553574562074, + "epoch": 0.5596892027722518, + "grad_norm": 7.994612693786621, + "learning_rate": 3.3815858521159193e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8570759683847428, + "num_tokens": 217223005.0, + "step": 180550 + }, + { + "entropy": 1.9246620133519172, + "epoch": 0.5597202018973014, + "grad_norm": 3.801100969314575, + "learning_rate": 3.381492208689824e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8471505641937256, + "num_tokens": 217234469.0, + "step": 180560 + }, + { + "entropy": 1.9053822994232177, + "epoch": 0.5597512010223511, + "grad_norm": 7.562039375305176, + "learning_rate": 3.3813985730428643e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8488843128085136, + "num_tokens": 217245583.0, + "step": 180570 + }, + { + "entropy": 1.9393005177378655, + "epoch": 0.5597822001474009, + "grad_norm": 6.803304195404053, + "learning_rate": 3.3813049451739642e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8515973404049874, + "num_tokens": 217257128.0, + "step": 180580 + }, + { + "entropy": 1.841621221601963, + "epoch": 0.5598131992724505, + "grad_norm": 3.8993310928344727, + "learning_rate": 3.381211325082046e-06, + "loss": 0.433, + "mean_token_accuracy": 0.853488278388977, + "num_tokens": 217271173.0, + "step": 180590 + }, + { + "entropy": 1.8540022999048233, + "epoch": 0.5598441983975002, + "grad_norm": 7.256168365478516, + "learning_rate": 3.3811177127660346e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8576511323451996, + "num_tokens": 217284025.0, + "step": 180600 + }, + { + "entropy": 1.8471224382519722, + "epoch": 0.5598751975225499, + "grad_norm": 7.620468616485596, + "learning_rate": 3.381024108224852e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8556807443499566, + "num_tokens": 217296298.0, + "step": 180610 + }, + { + "entropy": 1.8334318205714226, + "epoch": 0.5599061966475997, + "grad_norm": 9.344206809997559, + "learning_rate": 3.380930511457423e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8624182164669036, + "num_tokens": 217309050.0, + "step": 180620 + }, + { + "entropy": 1.9678836941719056, + "epoch": 0.5599371957726493, + "grad_norm": 8.84647274017334, + "learning_rate": 3.3808369224626708e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.8477346986532212, + "num_tokens": 217319744.0, + "step": 180630 + }, + { + "entropy": 1.8432461515069007, + "epoch": 0.559968194897699, + "grad_norm": 8.65550708770752, + "learning_rate": 3.3807433412395207e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8575503960251808, + "num_tokens": 217332550.0, + "step": 180640 + }, + { + "entropy": 1.9450537994503976, + "epoch": 0.5599991940227487, + "grad_norm": 8.20946979522705, + "learning_rate": 3.380649767786897e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8566359907388688, + "num_tokens": 217343970.0, + "step": 180650 + }, + { + "entropy": 1.8421220853924751, + "epoch": 0.5600301931477984, + "grad_norm": 4.591266632080078, + "learning_rate": 3.3805562021037243e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8594477117061615, + "num_tokens": 217356623.0, + "step": 180660 + }, + { + "entropy": 1.8869489043951035, + "epoch": 0.5600611922728481, + "grad_norm": 9.574021339416504, + "learning_rate": 3.3804626441889266e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8554420277476311, + "num_tokens": 217368615.0, + "step": 180670 + }, + { + "entropy": 1.9096188768744469, + "epoch": 0.5600921913978978, + "grad_norm": 9.077582359313965, + "learning_rate": 3.38036909404143e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8512759923934936, + "num_tokens": 217380573.0, + "step": 180680 + }, + { + "entropy": 1.9089935168623924, + "epoch": 0.5601231905229475, + "grad_norm": 4.384575366973877, + "learning_rate": 3.3802755516601593e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8570929080247879, + "num_tokens": 217391703.0, + "step": 180690 + }, + { + "entropy": 1.8445791646838188, + "epoch": 0.5601541896479972, + "grad_norm": 3.831366777420044, + "learning_rate": 3.3801820170440408e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8612964913249016, + "num_tokens": 217404339.0, + "step": 180700 + }, + { + "entropy": 1.796808883547783, + "epoch": 0.5601851887730469, + "grad_norm": 5.263000011444092, + "learning_rate": 3.380088490191999e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8694143995642662, + "num_tokens": 217418163.0, + "step": 180710 + }, + { + "entropy": 1.848763944208622, + "epoch": 0.5602161878980966, + "grad_norm": 8.649629592895508, + "learning_rate": 3.3799949711029606e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8593548595905304, + "num_tokens": 217430946.0, + "step": 180720 + }, + { + "entropy": 1.7809420630335808, + "epoch": 0.5602471870231462, + "grad_norm": 8.454540252685547, + "learning_rate": 3.3799014597758516e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8662583023309708, + "num_tokens": 217444104.0, + "step": 180730 + }, + { + "entropy": 1.962505580484867, + "epoch": 0.560278186148196, + "grad_norm": 9.19823932647705, + "learning_rate": 3.379807956209598e-06, + "loss": 0.519, + "mean_token_accuracy": 0.8412714377045631, + "num_tokens": 217455742.0, + "step": 180740 + }, + { + "entropy": 1.9442083448171616, + "epoch": 0.5603091852732457, + "grad_norm": 8.571081161499023, + "learning_rate": 3.3797144604031275e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8483389243483543, + "num_tokens": 217467004.0, + "step": 180750 + }, + { + "entropy": 1.926881869137287, + "epoch": 0.5603401843982954, + "grad_norm": 8.731983184814453, + "learning_rate": 3.379620972355366e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8496791407465935, + "num_tokens": 217478640.0, + "step": 180760 + }, + { + "entropy": 1.9326330974698067, + "epoch": 0.560371183523345, + "grad_norm": 8.762312889099121, + "learning_rate": 3.37952749206524e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8533821493387223, + "num_tokens": 217490602.0, + "step": 180770 + }, + { + "entropy": 1.9303222298622131, + "epoch": 0.5604021826483947, + "grad_norm": 3.5224175453186035, + "learning_rate": 3.3794340195316775e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8604114964604378, + "num_tokens": 217501995.0, + "step": 180780 + }, + { + "entropy": 1.9192448794841765, + "epoch": 0.5604331817734445, + "grad_norm": 7.257331371307373, + "learning_rate": 3.3793405547536056e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8658577457070351, + "num_tokens": 217512568.0, + "step": 180790 + }, + { + "entropy": 1.8068484604358672, + "epoch": 0.5604641808984941, + "grad_norm": 4.195193290710449, + "learning_rate": 3.3792470977299516e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.863228565454483, + "num_tokens": 217525201.0, + "step": 180800 + }, + { + "entropy": 1.9096578344702722, + "epoch": 0.5604951800235438, + "grad_norm": 5.258492469787598, + "learning_rate": 3.3791536484596447e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8544479146599769, + "num_tokens": 217537342.0, + "step": 180810 + }, + { + "entropy": 1.843989658355713, + "epoch": 0.5605261791485935, + "grad_norm": 7.95306921005249, + "learning_rate": 3.3790602069416108e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8533659532666207, + "num_tokens": 217549457.0, + "step": 180820 + }, + { + "entropy": 1.8372595831751823, + "epoch": 0.5605571782736433, + "grad_norm": 9.44124698638916, + "learning_rate": 3.3789667731747796e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8592608660459519, + "num_tokens": 217561678.0, + "step": 180830 + }, + { + "entropy": 1.9358297988772393, + "epoch": 0.5605881773986929, + "grad_norm": 8.502898216247559, + "learning_rate": 3.3788733471580787e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8386871755123139, + "num_tokens": 217573016.0, + "step": 180840 + }, + { + "entropy": 1.8033106908202172, + "epoch": 0.5606191765237426, + "grad_norm": 4.1827778816223145, + "learning_rate": 3.3787799288904372e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8475642517209053, + "num_tokens": 217585990.0, + "step": 180850 + }, + { + "entropy": 1.8953222632408142, + "epoch": 0.5606501756487923, + "grad_norm": 9.73503303527832, + "learning_rate": 3.378686518370784e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8606355428695679, + "num_tokens": 217598445.0, + "step": 180860 + }, + { + "entropy": 1.845881848037243, + "epoch": 0.560681174773842, + "grad_norm": 4.5580596923828125, + "learning_rate": 3.378593115598048e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8616230577230454, + "num_tokens": 217611053.0, + "step": 180870 + }, + { + "entropy": 1.9507944285869598, + "epoch": 0.5607121738988917, + "grad_norm": 7.940414905548096, + "learning_rate": 3.3784997205711583e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8515224680304527, + "num_tokens": 217621744.0, + "step": 180880 + }, + { + "entropy": 1.885131499171257, + "epoch": 0.5607431730239414, + "grad_norm": 6.840700149536133, + "learning_rate": 3.378406333289044e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8618391931056977, + "num_tokens": 217633886.0, + "step": 180890 + }, + { + "entropy": 1.8611220195889473, + "epoch": 0.5607741721489911, + "grad_norm": 9.760030746459961, + "learning_rate": 3.378312953750636e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.855574083328247, + "num_tokens": 217646166.0, + "step": 180900 + }, + { + "entropy": 1.889116460084915, + "epoch": 0.5608051712740408, + "grad_norm": 6.201332092285156, + "learning_rate": 3.378219581954863e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.862904566526413, + "num_tokens": 217659044.0, + "step": 180910 + }, + { + "entropy": 1.866611397266388, + "epoch": 0.5608361703990905, + "grad_norm": 7.196857452392578, + "learning_rate": 3.3781262179006557e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8469146594405175, + "num_tokens": 217671556.0, + "step": 180920 + }, + { + "entropy": 1.9216289982199668, + "epoch": 0.5608671695241402, + "grad_norm": 8.19442367553711, + "learning_rate": 3.3780328615869445e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8460784748196601, + "num_tokens": 217683298.0, + "step": 180930 + }, + { + "entropy": 1.8504672214388846, + "epoch": 0.5608981686491898, + "grad_norm": 8.605911254882812, + "learning_rate": 3.3779395130126593e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8548540785908699, + "num_tokens": 217695959.0, + "step": 180940 + }, + { + "entropy": 1.8894923403859138, + "epoch": 0.5609291677742396, + "grad_norm": 6.890311241149902, + "learning_rate": 3.3778461721767307e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8696539297699928, + "num_tokens": 217708357.0, + "step": 180950 + }, + { + "entropy": 1.98569797873497, + "epoch": 0.5609601668992893, + "grad_norm": 8.540229797363281, + "learning_rate": 3.3777528390780907e-06, + "loss": 0.5213, + "mean_token_accuracy": 0.8437126785516739, + "num_tokens": 217718928.0, + "step": 180960 + }, + { + "entropy": 1.8385651513934136, + "epoch": 0.560991166024339, + "grad_norm": 7.91072416305542, + "learning_rate": 3.3776595137156694e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8596760660409928, + "num_tokens": 217731229.0, + "step": 180970 + }, + { + "entropy": 1.920003080368042, + "epoch": 0.5610221651493886, + "grad_norm": 8.425142288208008, + "learning_rate": 3.3775661960883983e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8521833911538124, + "num_tokens": 217742639.0, + "step": 180980 + }, + { + "entropy": 1.8847643241286278, + "epoch": 0.5610531642744383, + "grad_norm": 7.655343055725098, + "learning_rate": 3.3774728861952093e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8574874952435494, + "num_tokens": 217755303.0, + "step": 180990 + }, + { + "entropy": 1.8867151036858558, + "epoch": 0.5610841633994881, + "grad_norm": 7.4873552322387695, + "learning_rate": 3.377379584035034e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8581796258687973, + "num_tokens": 217767553.0, + "step": 181000 + }, + { + "entropy": 1.9579679414629936, + "epoch": 0.5611151625245377, + "grad_norm": 8.260603904724121, + "learning_rate": 3.3772862896068038e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8498960748314858, + "num_tokens": 217778968.0, + "step": 181010 + }, + { + "entropy": 1.9273895144462585, + "epoch": 0.5611461616495874, + "grad_norm": 7.966805934906006, + "learning_rate": 3.377193002909452e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8586221739649773, + "num_tokens": 217789896.0, + "step": 181020 + }, + { + "entropy": 1.907166676223278, + "epoch": 0.5611771607746371, + "grad_norm": 7.4472126960754395, + "learning_rate": 3.3770997239419097e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8602954059839248, + "num_tokens": 217801388.0, + "step": 181030 + }, + { + "entropy": 1.807604393362999, + "epoch": 0.5612081598996869, + "grad_norm": 7.787635326385498, + "learning_rate": 3.37700645270311e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.868038372695446, + "num_tokens": 217814403.0, + "step": 181040 + }, + { + "entropy": 1.854034560918808, + "epoch": 0.5612391590247365, + "grad_norm": 3.2894787788391113, + "learning_rate": 3.376913189191986e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8540055334568024, + "num_tokens": 217826856.0, + "step": 181050 + }, + { + "entropy": 1.7675895690917969, + "epoch": 0.5612701581497862, + "grad_norm": 7.049602031707764, + "learning_rate": 3.37681993340747e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.868474094569683, + "num_tokens": 217840044.0, + "step": 181060 + }, + { + "entropy": 1.8834607109427453, + "epoch": 0.5613011572748359, + "grad_norm": 4.7856645584106445, + "learning_rate": 3.376726685348496e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.850586649775505, + "num_tokens": 217852080.0, + "step": 181070 + }, + { + "entropy": 1.8780501514673233, + "epoch": 0.5613321563998857, + "grad_norm": 9.758124351501465, + "learning_rate": 3.3766334450139965e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8537691235542297, + "num_tokens": 217863129.0, + "step": 181080 + }, + { + "entropy": 1.8639142349362374, + "epoch": 0.5613631555249353, + "grad_norm": 6.5365681648254395, + "learning_rate": 3.3765402124029056e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8505441218614578, + "num_tokens": 217875833.0, + "step": 181090 + }, + { + "entropy": 1.8525004491209984, + "epoch": 0.561394154649985, + "grad_norm": 9.217967987060547, + "learning_rate": 3.376446987514157e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8587792262434959, + "num_tokens": 217887693.0, + "step": 181100 + }, + { + "entropy": 1.8868872478604317, + "epoch": 0.5614251537750347, + "grad_norm": 7.5307536125183105, + "learning_rate": 3.3763537703466853e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8458724349737168, + "num_tokens": 217900013.0, + "step": 181110 + }, + { + "entropy": 1.9061895191669465, + "epoch": 0.5614561529000844, + "grad_norm": 8.436769485473633, + "learning_rate": 3.3762605608994243e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8492231115698814, + "num_tokens": 217911913.0, + "step": 181120 + }, + { + "entropy": 1.8444679781794548, + "epoch": 0.5614871520251341, + "grad_norm": 7.85495138168335, + "learning_rate": 3.3761673591713067e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8593740567564965, + "num_tokens": 217924423.0, + "step": 181130 + }, + { + "entropy": 1.8500046521425246, + "epoch": 0.5615181511501838, + "grad_norm": 7.943841457366943, + "learning_rate": 3.37607416516127e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8572930499911309, + "num_tokens": 217937018.0, + "step": 181140 + }, + { + "entropy": 1.817262691259384, + "epoch": 0.5615491502752334, + "grad_norm": 3.3246030807495117, + "learning_rate": 3.3759809788682475e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8568054154515267, + "num_tokens": 217949942.0, + "step": 181150 + }, + { + "entropy": 1.8344750300049781, + "epoch": 0.5615801494002832, + "grad_norm": 8.242709159851074, + "learning_rate": 3.375887800291174e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8631076112389564, + "num_tokens": 217962691.0, + "step": 181160 + }, + { + "entropy": 1.7523364052176476, + "epoch": 0.5616111485253329, + "grad_norm": 3.873065233230591, + "learning_rate": 3.375794629428986e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8638006061315536, + "num_tokens": 217976123.0, + "step": 181170 + }, + { + "entropy": 1.8123833030462264, + "epoch": 0.5616421476503826, + "grad_norm": 7.738763332366943, + "learning_rate": 3.3757014662806175e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8730161666870118, + "num_tokens": 217987749.0, + "step": 181180 + }, + { + "entropy": 1.8182844251394272, + "epoch": 0.5616731467754322, + "grad_norm": 8.23554801940918, + "learning_rate": 3.375608310845005e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8598535865545273, + "num_tokens": 218000711.0, + "step": 181190 + }, + { + "entropy": 1.8626601822674274, + "epoch": 0.561704145900482, + "grad_norm": 8.313724517822266, + "learning_rate": 3.3755151631210835e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8639240145683289, + "num_tokens": 218012655.0, + "step": 181200 + }, + { + "entropy": 1.8847302973270417, + "epoch": 0.5617351450255317, + "grad_norm": 3.391298532485962, + "learning_rate": 3.3754220231077907e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8537727981805802, + "num_tokens": 218024210.0, + "step": 181210 + }, + { + "entropy": 1.8442262202501296, + "epoch": 0.5617661441505813, + "grad_norm": 8.985638618469238, + "learning_rate": 3.3753288908040614e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8595643252134323, + "num_tokens": 218035577.0, + "step": 181220 + }, + { + "entropy": 1.8500028520822525, + "epoch": 0.561797143275631, + "grad_norm": 4.673086643218994, + "learning_rate": 3.3752357662088324e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8605799928307534, + "num_tokens": 218047979.0, + "step": 181230 + }, + { + "entropy": 1.8613388493657113, + "epoch": 0.5618281424006807, + "grad_norm": 8.323710441589355, + "learning_rate": 3.375142649321041e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.863029134273529, + "num_tokens": 218059650.0, + "step": 181240 + }, + { + "entropy": 1.8575000897049905, + "epoch": 0.5618591415257305, + "grad_norm": 8.198358535766602, + "learning_rate": 3.3750495401396232e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.866016274690628, + "num_tokens": 218071502.0, + "step": 181250 + }, + { + "entropy": 1.8431531935930252, + "epoch": 0.5618901406507801, + "grad_norm": 7.122215747833252, + "learning_rate": 3.3749564386635165e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8529547974467278, + "num_tokens": 218083432.0, + "step": 181260 + }, + { + "entropy": 1.8526543870568275, + "epoch": 0.5619211397758298, + "grad_norm": 9.108004570007324, + "learning_rate": 3.374863344891659e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8487247556447983, + "num_tokens": 218095552.0, + "step": 181270 + }, + { + "entropy": 1.812180995941162, + "epoch": 0.5619521389008795, + "grad_norm": 7.413666725158691, + "learning_rate": 3.3747702588229863e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8599738642573357, + "num_tokens": 218108445.0, + "step": 181280 + }, + { + "entropy": 1.8726341724395752, + "epoch": 0.5619831380259293, + "grad_norm": 7.884952068328857, + "learning_rate": 3.3746771804564375e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8589571207761765, + "num_tokens": 218119972.0, + "step": 181290 + }, + { + "entropy": 1.9018690153956412, + "epoch": 0.5620141371509789, + "grad_norm": 7.468959331512451, + "learning_rate": 3.3745841097909506e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8478945583105088, + "num_tokens": 218131689.0, + "step": 181300 + }, + { + "entropy": 1.8484178960323334, + "epoch": 0.5620451362760286, + "grad_norm": 8.958715438842773, + "learning_rate": 3.374491046825463e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8532816350460053, + "num_tokens": 218144676.0, + "step": 181310 + }, + { + "entropy": 1.7837095826864242, + "epoch": 0.5620761354010783, + "grad_norm": 3.5719587802886963, + "learning_rate": 3.3743979915589137e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8583781942725182, + "num_tokens": 218158190.0, + "step": 181320 + }, + { + "entropy": 1.8700577557086944, + "epoch": 0.562107134526128, + "grad_norm": 7.927095890045166, + "learning_rate": 3.3743049439902402e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8564609229564667, + "num_tokens": 218170560.0, + "step": 181330 + }, + { + "entropy": 1.9219986885786056, + "epoch": 0.5621381336511777, + "grad_norm": 6.3925065994262695, + "learning_rate": 3.3742119041183824e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8569181904196739, + "num_tokens": 218181816.0, + "step": 181340 + }, + { + "entropy": 1.8808184131979941, + "epoch": 0.5621691327762274, + "grad_norm": 6.876534938812256, + "learning_rate": 3.3741188719422784e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8688705995678901, + "num_tokens": 218193715.0, + "step": 181350 + }, + { + "entropy": 1.8285144343972206, + "epoch": 0.562200131901277, + "grad_norm": 10.440187454223633, + "learning_rate": 3.3740258474608677e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8620707169175148, + "num_tokens": 218205927.0, + "step": 181360 + }, + { + "entropy": 1.7766144782304765, + "epoch": 0.5622311310263268, + "grad_norm": 7.924615859985352, + "learning_rate": 3.373932830673091e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8614334642887116, + "num_tokens": 218220052.0, + "step": 181370 + }, + { + "entropy": 1.757065513730049, + "epoch": 0.5622621301513765, + "grad_norm": 4.278183937072754, + "learning_rate": 3.3738398215778845e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.874184074997902, + "num_tokens": 218233698.0, + "step": 181380 + }, + { + "entropy": 1.723100933432579, + "epoch": 0.5622931292764262, + "grad_norm": 3.8255221843719482, + "learning_rate": 3.3737468201741915e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8724021464586258, + "num_tokens": 218248320.0, + "step": 181390 + }, + { + "entropy": 1.7919250458478928, + "epoch": 0.5623241284014758, + "grad_norm": 8.5140962600708, + "learning_rate": 3.3736538264609485e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.861699141561985, + "num_tokens": 218260949.0, + "step": 181400 + }, + { + "entropy": 1.9182514041662215, + "epoch": 0.5623551275265256, + "grad_norm": 7.7248053550720215, + "learning_rate": 3.3735608404370995e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8568162053823472, + "num_tokens": 218272641.0, + "step": 181410 + }, + { + "entropy": 1.8269777074456215, + "epoch": 0.5623861266515753, + "grad_norm": 8.181014060974121, + "learning_rate": 3.373467862101582e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8538804024457931, + "num_tokens": 218285341.0, + "step": 181420 + }, + { + "entropy": 1.8285759806632995, + "epoch": 0.562417125776625, + "grad_norm": 8.884249687194824, + "learning_rate": 3.373374891453338e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8638269484043122, + "num_tokens": 218297741.0, + "step": 181430 + }, + { + "entropy": 1.8283386200666427, + "epoch": 0.5624481249016746, + "grad_norm": 9.47203540802002, + "learning_rate": 3.373281928491307e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8587843701243401, + "num_tokens": 218309967.0, + "step": 181440 + }, + { + "entropy": 1.9383835330605508, + "epoch": 0.5624791240267244, + "grad_norm": 3.79379940032959, + "learning_rate": 3.3731889732144313e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8580824270844459, + "num_tokens": 218321382.0, + "step": 181450 + }, + { + "entropy": 1.9270856872200965, + "epoch": 0.5625101231517741, + "grad_norm": 3.337674140930176, + "learning_rate": 3.373096025621651e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8432699516415596, + "num_tokens": 218332602.0, + "step": 181460 + }, + { + "entropy": 1.7900800183415413, + "epoch": 0.5625411222768237, + "grad_norm": 10.64853572845459, + "learning_rate": 3.3730030857119085e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8590823590755463, + "num_tokens": 218345274.0, + "step": 181470 + }, + { + "entropy": 1.8173652663826942, + "epoch": 0.5625721214018734, + "grad_norm": 8.35346794128418, + "learning_rate": 3.3729101534841448e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.858729963004589, + "num_tokens": 218358082.0, + "step": 181480 + }, + { + "entropy": 1.910238166153431, + "epoch": 0.5626031205269231, + "grad_norm": 8.957452774047852, + "learning_rate": 3.3728172289373016e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8480331093072891, + "num_tokens": 218368843.0, + "step": 181490 + }, + { + "entropy": 1.9071012750267982, + "epoch": 0.5626341196519729, + "grad_norm": 9.414094924926758, + "learning_rate": 3.372724312070321e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.856868302822113, + "num_tokens": 218380607.0, + "step": 181500 + }, + { + "entropy": 1.811684738099575, + "epoch": 0.5626651187770225, + "grad_norm": 3.4954662322998047, + "learning_rate": 3.372631402882146e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.860798105597496, + "num_tokens": 218393372.0, + "step": 181510 + }, + { + "entropy": 1.7179926961660386, + "epoch": 0.5626961179020722, + "grad_norm": 3.3773441314697266, + "learning_rate": 3.3725385013717184e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8678355023264885, + "num_tokens": 218408589.0, + "step": 181520 + }, + { + "entropy": 1.8567181393504142, + "epoch": 0.5627271170271219, + "grad_norm": 9.402314186096191, + "learning_rate": 3.3724456075379795e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8574750825762749, + "num_tokens": 218420524.0, + "step": 181530 + }, + { + "entropy": 1.9640752226114273, + "epoch": 0.5627581161521716, + "grad_norm": 6.7787580490112305, + "learning_rate": 3.3723527213798744e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.850230510532856, + "num_tokens": 218431700.0, + "step": 181540 + }, + { + "entropy": 1.846253375709057, + "epoch": 0.5627891152772213, + "grad_norm": 10.096291542053223, + "learning_rate": 3.3722598428963448e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8543033003807068, + "num_tokens": 218444142.0, + "step": 181550 + }, + { + "entropy": 1.9752780228853226, + "epoch": 0.562820114402271, + "grad_norm": 6.870584964752197, + "learning_rate": 3.3721669720863344e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8470905661582947, + "num_tokens": 218455101.0, + "step": 181560 + }, + { + "entropy": 1.8309684470295906, + "epoch": 0.5628511135273206, + "grad_norm": 12.073573112487793, + "learning_rate": 3.372074108948786e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8653752103447914, + "num_tokens": 218467013.0, + "step": 181570 + }, + { + "entropy": 1.899401769042015, + "epoch": 0.5628821126523704, + "grad_norm": 7.853172302246094, + "learning_rate": 3.3719812534826446e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8672493144869804, + "num_tokens": 218478180.0, + "step": 181580 + }, + { + "entropy": 1.851992353796959, + "epoch": 0.5629131117774201, + "grad_norm": 9.056402206420898, + "learning_rate": 3.3718884056868523e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.859456042945385, + "num_tokens": 218489874.0, + "step": 181590 + }, + { + "entropy": 1.8867039635777474, + "epoch": 0.5629441109024698, + "grad_norm": 8.47264289855957, + "learning_rate": 3.371795565560354e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.851148608326912, + "num_tokens": 218501597.0, + "step": 181600 + }, + { + "entropy": 1.913269890844822, + "epoch": 0.5629751100275194, + "grad_norm": 9.350337028503418, + "learning_rate": 3.371702733102094e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8451100900769234, + "num_tokens": 218513013.0, + "step": 181610 + }, + { + "entropy": 1.8536349445581437, + "epoch": 0.5630061091525692, + "grad_norm": 8.868657112121582, + "learning_rate": 3.3716099083110165e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.856175285577774, + "num_tokens": 218525416.0, + "step": 181620 + }, + { + "entropy": 1.7822051152586937, + "epoch": 0.5630371082776189, + "grad_norm": 6.670261383056641, + "learning_rate": 3.3715170911860665e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8628834947943688, + "num_tokens": 218538704.0, + "step": 181630 + }, + { + "entropy": 1.926338541507721, + "epoch": 0.5630681074026685, + "grad_norm": 7.397019863128662, + "learning_rate": 3.3714242817261888e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8480693429708481, + "num_tokens": 218549814.0, + "step": 181640 + }, + { + "entropy": 1.9475450232625007, + "epoch": 0.5630991065277182, + "grad_norm": 7.937694549560547, + "learning_rate": 3.3713314799303282e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8462337568402291, + "num_tokens": 218561432.0, + "step": 181650 + }, + { + "entropy": 1.9426579788327216, + "epoch": 0.563130105652768, + "grad_norm": 3.288832664489746, + "learning_rate": 3.3712386857974295e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8500252008438111, + "num_tokens": 218572081.0, + "step": 181660 + }, + { + "entropy": 1.9244334518909454, + "epoch": 0.5631611047778177, + "grad_norm": 8.894190788269043, + "learning_rate": 3.3711458993264397e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8501438573002815, + "num_tokens": 218582749.0, + "step": 181670 + }, + { + "entropy": 1.7644945442676545, + "epoch": 0.5631921039028673, + "grad_norm": 8.007524490356445, + "learning_rate": 3.3710531205163026e-06, + "loss": 0.378, + "mean_token_accuracy": 0.871790862083435, + "num_tokens": 218596221.0, + "step": 181680 + }, + { + "entropy": 1.8322599783539772, + "epoch": 0.563223103027917, + "grad_norm": 8.165786743164062, + "learning_rate": 3.370960349365965e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8571014538407326, + "num_tokens": 218609139.0, + "step": 181690 + }, + { + "entropy": 1.8685690701007842, + "epoch": 0.5632541021529668, + "grad_norm": 7.681282043457031, + "learning_rate": 3.3708675858743733e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8578503727912903, + "num_tokens": 218621107.0, + "step": 181700 + }, + { + "entropy": 1.823290081322193, + "epoch": 0.5632851012780165, + "grad_norm": 8.989456176757812, + "learning_rate": 3.370774830040473e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8660375028848648, + "num_tokens": 218633534.0, + "step": 181710 + }, + { + "entropy": 1.8917041584849357, + "epoch": 0.5633161004030661, + "grad_norm": 8.177289009094238, + "learning_rate": 3.370682081863211e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8619986966252327, + "num_tokens": 218644523.0, + "step": 181720 + }, + { + "entropy": 1.8801601991057395, + "epoch": 0.5633470995281158, + "grad_norm": 3.5459909439086914, + "learning_rate": 3.370589341341534e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8533681690692901, + "num_tokens": 218656058.0, + "step": 181730 + }, + { + "entropy": 1.8284826904535294, + "epoch": 0.5633780986531655, + "grad_norm": 7.410562992095947, + "learning_rate": 3.3704966084743894e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8601794943213463, + "num_tokens": 218668031.0, + "step": 181740 + }, + { + "entropy": 1.8929321303963662, + "epoch": 0.5634090977782152, + "grad_norm": 7.5576605796813965, + "learning_rate": 3.3704038832607233e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8542150601744651, + "num_tokens": 218679679.0, + "step": 181750 + }, + { + "entropy": 1.8731725335121154, + "epoch": 0.5634400969032649, + "grad_norm": 8.853988647460938, + "learning_rate": 3.3703111656994835e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8578036665916443, + "num_tokens": 218691566.0, + "step": 181760 + }, + { + "entropy": 1.8713270604610444, + "epoch": 0.5634710960283146, + "grad_norm": 4.01953125, + "learning_rate": 3.3702184557896173e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8601006358861923, + "num_tokens": 218703686.0, + "step": 181770 + }, + { + "entropy": 1.8796763598918915, + "epoch": 0.5635020951533642, + "grad_norm": 7.775223255157471, + "learning_rate": 3.3701257535300722e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8497362047433853, + "num_tokens": 218715301.0, + "step": 181780 + }, + { + "entropy": 1.8186719685792923, + "epoch": 0.563533094278414, + "grad_norm": 3.402961015701294, + "learning_rate": 3.370033058919797e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8580001771450043, + "num_tokens": 218728361.0, + "step": 181790 + }, + { + "entropy": 1.8945069938898087, + "epoch": 0.5635640934034637, + "grad_norm": 7.426494598388672, + "learning_rate": 3.3699403719577394e-06, + "loss": 0.451, + "mean_token_accuracy": 0.850483974814415, + "num_tokens": 218739876.0, + "step": 181800 + }, + { + "entropy": 1.7792957812547683, + "epoch": 0.5635950925285134, + "grad_norm": 4.302509307861328, + "learning_rate": 3.369847692642847e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8662660971283913, + "num_tokens": 218753484.0, + "step": 181810 + }, + { + "entropy": 1.8422718912363052, + "epoch": 0.563626091653563, + "grad_norm": 2.8755502700805664, + "learning_rate": 3.369755020974068e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8658865958452224, + "num_tokens": 218765774.0, + "step": 181820 + }, + { + "entropy": 1.9335956647992134, + "epoch": 0.5636570907786128, + "grad_norm": 6.701143264770508, + "learning_rate": 3.3696623569503535e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8665258780121803, + "num_tokens": 218778083.0, + "step": 181830 + }, + { + "entropy": 1.9469329804182052, + "epoch": 0.5636880899036625, + "grad_norm": 10.032766342163086, + "learning_rate": 3.36956970057065e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8478762224316597, + "num_tokens": 218788844.0, + "step": 181840 + }, + { + "entropy": 1.8688429698348046, + "epoch": 0.5637190890287122, + "grad_norm": 6.791390895843506, + "learning_rate": 3.3694770518339077e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.861931498348713, + "num_tokens": 218801226.0, + "step": 181850 + }, + { + "entropy": 1.9176680132746697, + "epoch": 0.5637500881537618, + "grad_norm": 7.465823173522949, + "learning_rate": 3.3693844107390755e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8572080507874489, + "num_tokens": 218813183.0, + "step": 181860 + }, + { + "entropy": 1.8121962711215018, + "epoch": 0.5637810872788116, + "grad_norm": 8.418926239013672, + "learning_rate": 3.369291777285103e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8700475439429283, + "num_tokens": 218825988.0, + "step": 181870 + }, + { + "entropy": 1.8737857460975647, + "epoch": 0.5638120864038613, + "grad_norm": 8.048589706420898, + "learning_rate": 3.36919915147094e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8601662442088127, + "num_tokens": 218837922.0, + "step": 181880 + }, + { + "entropy": 1.8621310248970986, + "epoch": 0.5638430855289109, + "grad_norm": 4.604039669036865, + "learning_rate": 3.369106533295537e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8758772403001786, + "num_tokens": 218849411.0, + "step": 181890 + }, + { + "entropy": 1.8950058773159981, + "epoch": 0.5638740846539606, + "grad_norm": 5.675747871398926, + "learning_rate": 3.3690139227578422e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8569332420825958, + "num_tokens": 218861088.0, + "step": 181900 + }, + { + "entropy": 1.8820589035749435, + "epoch": 0.5639050837790104, + "grad_norm": 7.437646389007568, + "learning_rate": 3.368921319856808e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8657960072159767, + "num_tokens": 218873158.0, + "step": 181910 + }, + { + "entropy": 1.8981757640838623, + "epoch": 0.56393608290406, + "grad_norm": 8.741206169128418, + "learning_rate": 3.3688287245913843e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8553640082478523, + "num_tokens": 218884537.0, + "step": 181920 + }, + { + "entropy": 1.835992193222046, + "epoch": 0.5639670820291097, + "grad_norm": 9.34242057800293, + "learning_rate": 3.3687361369605216e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8556068152189255, + "num_tokens": 218896869.0, + "step": 181930 + }, + { + "entropy": 1.9087290972471238, + "epoch": 0.5639980811541594, + "grad_norm": 9.419221878051758, + "learning_rate": 3.3686435569631716e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8489242225885392, + "num_tokens": 218908053.0, + "step": 181940 + }, + { + "entropy": 1.9714890956878661, + "epoch": 0.5640290802792092, + "grad_norm": 8.146427154541016, + "learning_rate": 3.3685509845982834e-06, + "loss": 0.544, + "mean_token_accuracy": 0.8375766724348068, + "num_tokens": 218919129.0, + "step": 181950 + }, + { + "entropy": 1.9061841890215874, + "epoch": 0.5640600794042588, + "grad_norm": 9.288558959960938, + "learning_rate": 3.3684584198648105e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8559746578335762, + "num_tokens": 218930350.0, + "step": 181960 + }, + { + "entropy": 1.8730715066194534, + "epoch": 0.5640910785293085, + "grad_norm": 7.72963285446167, + "learning_rate": 3.368365862761704e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8536106586456299, + "num_tokens": 218942941.0, + "step": 181970 + }, + { + "entropy": 1.883928555250168, + "epoch": 0.5641220776543582, + "grad_norm": 10.418731689453125, + "learning_rate": 3.3682733132879146e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8562978118658066, + "num_tokens": 218953961.0, + "step": 181980 + }, + { + "entropy": 1.896950177848339, + "epoch": 0.5641530767794078, + "grad_norm": 7.440096378326416, + "learning_rate": 3.368180771442396e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8484185233712196, + "num_tokens": 218965790.0, + "step": 181990 + }, + { + "entropy": 1.8978441506624222, + "epoch": 0.5641840759044576, + "grad_norm": 8.00220012664795, + "learning_rate": 3.3680882372240982e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8625353053212166, + "num_tokens": 218977796.0, + "step": 182000 + }, + { + "entropy": 1.9124243408441544, + "epoch": 0.5642150750295073, + "grad_norm": 7.3425612449646, + "learning_rate": 3.3679957106319743e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8619111463427543, + "num_tokens": 218988584.0, + "step": 182010 + }, + { + "entropy": 1.8750930547714233, + "epoch": 0.564246074154557, + "grad_norm": 8.522233963012695, + "learning_rate": 3.367903191664978e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8666472569108009, + "num_tokens": 219000591.0, + "step": 182020 + }, + { + "entropy": 1.7184800058603287, + "epoch": 0.5642770732796066, + "grad_norm": 3.9416251182556152, + "learning_rate": 3.3678106803220605e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8791543394327164, + "num_tokens": 219014594.0, + "step": 182030 + }, + { + "entropy": 1.8457399889826775, + "epoch": 0.5643080724046564, + "grad_norm": 4.241663455963135, + "learning_rate": 3.367718176602176e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8577973261475563, + "num_tokens": 219026319.0, + "step": 182040 + }, + { + "entropy": 1.8218046829104424, + "epoch": 0.5643390715297061, + "grad_norm": 7.572890758514404, + "learning_rate": 3.3676256805042766e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8654015228152275, + "num_tokens": 219038716.0, + "step": 182050 + }, + { + "entropy": 1.9060118064284324, + "epoch": 0.5643700706547558, + "grad_norm": 8.616740226745605, + "learning_rate": 3.367533192027317e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8549579024314881, + "num_tokens": 219050480.0, + "step": 182060 + }, + { + "entropy": 1.8430630937218666, + "epoch": 0.5644010697798054, + "grad_norm": 10.946331977844238, + "learning_rate": 3.367440711170249e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8547526493668556, + "num_tokens": 219063246.0, + "step": 182070 + }, + { + "entropy": 1.82396737113595, + "epoch": 0.5644320689048552, + "grad_norm": 7.970212459564209, + "learning_rate": 3.367348237932027e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8534167259931564, + "num_tokens": 219075833.0, + "step": 182080 + }, + { + "entropy": 1.8939991384744643, + "epoch": 0.5644630680299049, + "grad_norm": 8.313651084899902, + "learning_rate": 3.367255772311606e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8648671418428421, + "num_tokens": 219087696.0, + "step": 182090 + }, + { + "entropy": 1.9231013625860214, + "epoch": 0.5644940671549545, + "grad_norm": 8.496840476989746, + "learning_rate": 3.3671633143079395e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8469110265374183, + "num_tokens": 219099058.0, + "step": 182100 + }, + { + "entropy": 1.8222051367163659, + "epoch": 0.5645250662800042, + "grad_norm": 4.980925559997559, + "learning_rate": 3.3670708639199813e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8577820032835006, + "num_tokens": 219110780.0, + "step": 182110 + }, + { + "entropy": 1.8365742474794389, + "epoch": 0.564556065405054, + "grad_norm": 8.180721282958984, + "learning_rate": 3.366978421146686e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8519570723176002, + "num_tokens": 219122821.0, + "step": 182120 + }, + { + "entropy": 1.914619317650795, + "epoch": 0.5645870645301037, + "grad_norm": 7.3940582275390625, + "learning_rate": 3.3668859859870096e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8543267771601677, + "num_tokens": 219134702.0, + "step": 182130 + }, + { + "entropy": 1.9344725817441941, + "epoch": 0.5646180636551533, + "grad_norm": 8.559662818908691, + "learning_rate": 3.366793558439905e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8544822856783867, + "num_tokens": 219145554.0, + "step": 182140 + }, + { + "entropy": 1.8622259184718133, + "epoch": 0.564649062780203, + "grad_norm": 8.896142959594727, + "learning_rate": 3.3667011385043298e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8583355069160461, + "num_tokens": 219157347.0, + "step": 182150 + }, + { + "entropy": 1.8627305820584297, + "epoch": 0.5646800619052528, + "grad_norm": 3.9189836978912354, + "learning_rate": 3.366608726179237e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8522779107093811, + "num_tokens": 219169536.0, + "step": 182160 + }, + { + "entropy": 1.907105678319931, + "epoch": 0.5647110610303024, + "grad_norm": 7.8243632316589355, + "learning_rate": 3.3665163214635838e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8565170675516128, + "num_tokens": 219180940.0, + "step": 182170 + }, + { + "entropy": 1.901413296163082, + "epoch": 0.5647420601553521, + "grad_norm": 6.818977355957031, + "learning_rate": 3.366423924356325e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8567003801465034, + "num_tokens": 219192893.0, + "step": 182180 + }, + { + "entropy": 1.8407423809170722, + "epoch": 0.5647730592804018, + "grad_norm": 8.929377555847168, + "learning_rate": 3.3663315348564173e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8660805970430374, + "num_tokens": 219205279.0, + "step": 182190 + }, + { + "entropy": 1.9222645550966262, + "epoch": 0.5648040584054516, + "grad_norm": 7.735604763031006, + "learning_rate": 3.366239152962816e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.860330006480217, + "num_tokens": 219216624.0, + "step": 182200 + }, + { + "entropy": 1.9066086545586587, + "epoch": 0.5648350575305012, + "grad_norm": 9.061637878417969, + "learning_rate": 3.366146778674479e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.859072470664978, + "num_tokens": 219228847.0, + "step": 182210 + }, + { + "entropy": 1.8881817117333413, + "epoch": 0.5648660566555509, + "grad_norm": 9.51163387298584, + "learning_rate": 3.366054411990361e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8498694509267807, + "num_tokens": 219240418.0, + "step": 182220 + }, + { + "entropy": 1.947641570866108, + "epoch": 0.5648970557806006, + "grad_norm": 9.015745162963867, + "learning_rate": 3.36596205290942e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8451044097542763, + "num_tokens": 219251188.0, + "step": 182230 + }, + { + "entropy": 1.8586992233991624, + "epoch": 0.5649280549056502, + "grad_norm": 9.301876068115234, + "learning_rate": 3.3658697014306124e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.86016416400671, + "num_tokens": 219263759.0, + "step": 182240 + }, + { + "entropy": 1.769415383040905, + "epoch": 0.5649590540307, + "grad_norm": 8.25972843170166, + "learning_rate": 3.3657773575528956e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.86795554459095, + "num_tokens": 219277713.0, + "step": 182250 + }, + { + "entropy": 1.909845282137394, + "epoch": 0.5649900531557497, + "grad_norm": 10.241829872131348, + "learning_rate": 3.3656850212752273e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8492648482322693, + "num_tokens": 219289088.0, + "step": 182260 + }, + { + "entropy": 1.8235872462391853, + "epoch": 0.5650210522807994, + "grad_norm": 8.383881568908691, + "learning_rate": 3.365592692596564e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8593395933508873, + "num_tokens": 219302014.0, + "step": 182270 + }, + { + "entropy": 1.8279115334153175, + "epoch": 0.565052051405849, + "grad_norm": 8.371310234069824, + "learning_rate": 3.3655003715158642e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8745242550969123, + "num_tokens": 219314472.0, + "step": 182280 + }, + { + "entropy": 1.8149750515818597, + "epoch": 0.5650830505308988, + "grad_norm": 8.341879844665527, + "learning_rate": 3.365408058032086e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8628831997513771, + "num_tokens": 219326935.0, + "step": 182290 + }, + { + "entropy": 1.97727922052145, + "epoch": 0.5651140496559485, + "grad_norm": 9.078901290893555, + "learning_rate": 3.3653157521441876e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.837429566681385, + "num_tokens": 219338368.0, + "step": 182300 + }, + { + "entropy": 1.9396789371967316, + "epoch": 0.5651450487809981, + "grad_norm": 8.132356643676758, + "learning_rate": 3.3652234538511265e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8660401061177254, + "num_tokens": 219349114.0, + "step": 182310 + }, + { + "entropy": 1.862569797039032, + "epoch": 0.5651760479060478, + "grad_norm": 4.012666702270508, + "learning_rate": 3.3651311631518623e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8647153243422508, + "num_tokens": 219361509.0, + "step": 182320 + }, + { + "entropy": 1.8481622457504272, + "epoch": 0.5652070470310976, + "grad_norm": 4.161468505859375, + "learning_rate": 3.365038880045353e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8633993789553642, + "num_tokens": 219373790.0, + "step": 182330 + }, + { + "entropy": 1.8544276610016823, + "epoch": 0.5652380461561473, + "grad_norm": 9.143585205078125, + "learning_rate": 3.3649466045305584e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8522509098052978, + "num_tokens": 219385836.0, + "step": 182340 + }, + { + "entropy": 1.8401200011372567, + "epoch": 0.5652690452811969, + "grad_norm": 7.8143510818481445, + "learning_rate": 3.3648543366064367e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8740338370203972, + "num_tokens": 219398181.0, + "step": 182350 + }, + { + "entropy": 1.8663185223937035, + "epoch": 0.5653000444062466, + "grad_norm": 7.707705020904541, + "learning_rate": 3.364762076271948e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8531212002038956, + "num_tokens": 219410931.0, + "step": 182360 + }, + { + "entropy": 1.878646893799305, + "epoch": 0.5653310435312964, + "grad_norm": 8.273179054260254, + "learning_rate": 3.3646698235260516e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8549691379070282, + "num_tokens": 219422675.0, + "step": 182370 + }, + { + "entropy": 1.842753717303276, + "epoch": 0.565362042656346, + "grad_norm": 2.2988572120666504, + "learning_rate": 3.3645775783677077e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8559475839138031, + "num_tokens": 219434999.0, + "step": 182380 + }, + { + "entropy": 1.9365380138158799, + "epoch": 0.5653930417813957, + "grad_norm": 7.147967338562012, + "learning_rate": 3.364485340795875e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8467515155673027, + "num_tokens": 219445954.0, + "step": 182390 + }, + { + "entropy": 1.8891595363616944, + "epoch": 0.5654240409064454, + "grad_norm": 7.55145788192749, + "learning_rate": 3.3643931108095146e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8647678464651107, + "num_tokens": 219458306.0, + "step": 182400 + }, + { + "entropy": 1.8514413997530936, + "epoch": 0.5654550400314952, + "grad_norm": 5.811746120452881, + "learning_rate": 3.364300888407587e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8549320101737976, + "num_tokens": 219470617.0, + "step": 182410 + }, + { + "entropy": 1.8094148866832256, + "epoch": 0.5654860391565448, + "grad_norm": 2.7894392013549805, + "learning_rate": 3.364208673589053e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8556830063462257, + "num_tokens": 219484050.0, + "step": 182420 + }, + { + "entropy": 1.859315599501133, + "epoch": 0.5655170382815945, + "grad_norm": 8.049720764160156, + "learning_rate": 3.3641164663528717e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8648128852248191, + "num_tokens": 219496016.0, + "step": 182430 + }, + { + "entropy": 1.8675218299031258, + "epoch": 0.5655480374066442, + "grad_norm": 10.070209503173828, + "learning_rate": 3.364024266698006e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8463949292898179, + "num_tokens": 219508002.0, + "step": 182440 + }, + { + "entropy": 1.871927236020565, + "epoch": 0.565579036531694, + "grad_norm": 8.271561622619629, + "learning_rate": 3.3639320746234164e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8576414436101913, + "num_tokens": 219519654.0, + "step": 182450 + }, + { + "entropy": 1.8629664570093154, + "epoch": 0.5656100356567436, + "grad_norm": 9.942543983459473, + "learning_rate": 3.3638398901280646e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8536071136593819, + "num_tokens": 219532168.0, + "step": 182460 + }, + { + "entropy": 1.9039793327450751, + "epoch": 0.5656410347817933, + "grad_norm": 9.30384635925293, + "learning_rate": 3.363747713210911e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8593076154589653, + "num_tokens": 219543244.0, + "step": 182470 + }, + { + "entropy": 1.896394467353821, + "epoch": 0.565672033906843, + "grad_norm": 10.03402328491211, + "learning_rate": 3.363655543870918e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8478322148323059, + "num_tokens": 219555030.0, + "step": 182480 + }, + { + "entropy": 1.8448296919465066, + "epoch": 0.5657030330318926, + "grad_norm": 8.039837837219238, + "learning_rate": 3.3635633821070474e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8729675441980362, + "num_tokens": 219567463.0, + "step": 182490 + }, + { + "entropy": 1.7930359467864037, + "epoch": 0.5657340321569424, + "grad_norm": 8.818025588989258, + "learning_rate": 3.363471227918261e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.870102445781231, + "num_tokens": 219580320.0, + "step": 182500 + }, + { + "entropy": 1.7787843875586986, + "epoch": 0.5657650312819921, + "grad_norm": 6.676950931549072, + "learning_rate": 3.3633790813035238e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8641812324523925, + "num_tokens": 219593450.0, + "step": 182510 + }, + { + "entropy": 1.8252146616578102, + "epoch": 0.5657960304070417, + "grad_norm": 8.978650093078613, + "learning_rate": 3.363286942261795e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8654040232300758, + "num_tokens": 219605533.0, + "step": 182520 + }, + { + "entropy": 1.914176408946514, + "epoch": 0.5658270295320914, + "grad_norm": 10.18586254119873, + "learning_rate": 3.3631948107920388e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.852830545604229, + "num_tokens": 219616787.0, + "step": 182530 + }, + { + "entropy": 1.8280531086027623, + "epoch": 0.5658580286571412, + "grad_norm": 3.613919973373413, + "learning_rate": 3.3631026868932177e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8650343582034111, + "num_tokens": 219630185.0, + "step": 182540 + }, + { + "entropy": 1.7496952429413795, + "epoch": 0.5658890277821909, + "grad_norm": 9.850088119506836, + "learning_rate": 3.3630105705642953e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8619767040014267, + "num_tokens": 219643599.0, + "step": 182550 + }, + { + "entropy": 1.8812230035662652, + "epoch": 0.5659200269072405, + "grad_norm": 7.521579265594482, + "learning_rate": 3.3629184618042353e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8584534049034118, + "num_tokens": 219655194.0, + "step": 182560 + }, + { + "entropy": 1.861245647072792, + "epoch": 0.5659510260322902, + "grad_norm": 7.2412614822387695, + "learning_rate": 3.3628263606119996e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8658636063337326, + "num_tokens": 219668106.0, + "step": 182570 + }, + { + "entropy": 1.7555628687143325, + "epoch": 0.56598202515734, + "grad_norm": 8.427504539489746, + "learning_rate": 3.3627342669865537e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8658777117729187, + "num_tokens": 219681746.0, + "step": 182580 + }, + { + "entropy": 1.925270189344883, + "epoch": 0.5660130242823896, + "grad_norm": 7.512916088104248, + "learning_rate": 3.362642180926861e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8513542354106903, + "num_tokens": 219692977.0, + "step": 182590 + }, + { + "entropy": 1.7738431617617607, + "epoch": 0.5660440234074393, + "grad_norm": 8.270086288452148, + "learning_rate": 3.3625501024318863e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8591394439339638, + "num_tokens": 219705620.0, + "step": 182600 + }, + { + "entropy": 1.8614506632089616, + "epoch": 0.566075022532489, + "grad_norm": 8.078804016113281, + "learning_rate": 3.3624580315005917e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8608483463525772, + "num_tokens": 219718256.0, + "step": 182610 + }, + { + "entropy": 1.9512107968330383, + "epoch": 0.5661060216575388, + "grad_norm": 7.991335391998291, + "learning_rate": 3.3623659681319444e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8626005321741104, + "num_tokens": 219729201.0, + "step": 182620 + }, + { + "entropy": 1.8471109196543694, + "epoch": 0.5661370207825884, + "grad_norm": 7.598386287689209, + "learning_rate": 3.3622739123249072e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.861255344748497, + "num_tokens": 219741656.0, + "step": 182630 + }, + { + "entropy": 1.888534700870514, + "epoch": 0.5661680199076381, + "grad_norm": 7.569242000579834, + "learning_rate": 3.3621818640784463e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8554926365613937, + "num_tokens": 219754235.0, + "step": 182640 + }, + { + "entropy": 1.8320038333535194, + "epoch": 0.5661990190326878, + "grad_norm": 7.632225036621094, + "learning_rate": 3.3620898233915266e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8643024682998657, + "num_tokens": 219766919.0, + "step": 182650 + }, + { + "entropy": 1.8487570151686668, + "epoch": 0.5662300181577375, + "grad_norm": 6.813345432281494, + "learning_rate": 3.3619977902631126e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8686630889773369, + "num_tokens": 219779203.0, + "step": 182660 + }, + { + "entropy": 1.935315978527069, + "epoch": 0.5662610172827872, + "grad_norm": 8.292950630187988, + "learning_rate": 3.361905764692171e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8577398896217346, + "num_tokens": 219790189.0, + "step": 182670 + }, + { + "entropy": 1.9341531276702881, + "epoch": 0.5662920164078369, + "grad_norm": 8.29041862487793, + "learning_rate": 3.3618137466776664e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8639525189995766, + "num_tokens": 219801446.0, + "step": 182680 + }, + { + "entropy": 1.8314605563879014, + "epoch": 0.5663230155328866, + "grad_norm": 3.608710765838623, + "learning_rate": 3.3617217362185654e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8570237934589386, + "num_tokens": 219813951.0, + "step": 182690 + }, + { + "entropy": 1.8585295498371124, + "epoch": 0.5663540146579363, + "grad_norm": 8.209573745727539, + "learning_rate": 3.361629733313834e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8623125806450844, + "num_tokens": 219825535.0, + "step": 182700 + }, + { + "entropy": 1.867700320482254, + "epoch": 0.566385013782986, + "grad_norm": 7.110729217529297, + "learning_rate": 3.3615377379624386e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8486958160996437, + "num_tokens": 219838008.0, + "step": 182710 + }, + { + "entropy": 1.7865567639470101, + "epoch": 0.5664160129080357, + "grad_norm": 4.511937141418457, + "learning_rate": 3.361445750163346e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8617598488926888, + "num_tokens": 219850952.0, + "step": 182720 + }, + { + "entropy": 1.9098443865776062, + "epoch": 0.5664470120330853, + "grad_norm": 6.9264984130859375, + "learning_rate": 3.3613537699155224e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8570198655128479, + "num_tokens": 219862258.0, + "step": 182730 + }, + { + "entropy": 1.8792338341474533, + "epoch": 0.566478011158135, + "grad_norm": 8.768360137939453, + "learning_rate": 3.3612617972179345e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8603082686662674, + "num_tokens": 219874000.0, + "step": 182740 + }, + { + "entropy": 1.8674265652894975, + "epoch": 0.5665090102831848, + "grad_norm": 4.6239471435546875, + "learning_rate": 3.3611698320695502e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.866169311106205, + "num_tokens": 219885856.0, + "step": 182750 + }, + { + "entropy": 1.8426985383033752, + "epoch": 0.5665400094082345, + "grad_norm": 7.317948818206787, + "learning_rate": 3.361077874469336e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8583661854267121, + "num_tokens": 219898930.0, + "step": 182760 + }, + { + "entropy": 1.906522662937641, + "epoch": 0.5665710085332841, + "grad_norm": 7.433799743652344, + "learning_rate": 3.3609859244162606e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8607842057943345, + "num_tokens": 219909785.0, + "step": 182770 + }, + { + "entropy": 1.858467762172222, + "epoch": 0.5666020076583338, + "grad_norm": 2.7514004707336426, + "learning_rate": 3.3608939819092902e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8575167447328568, + "num_tokens": 219921781.0, + "step": 182780 + }, + { + "entropy": 1.838193103671074, + "epoch": 0.5666330067833836, + "grad_norm": 9.16029167175293, + "learning_rate": 3.360802046947394e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8564060479402542, + "num_tokens": 219934047.0, + "step": 182790 + }, + { + "entropy": 1.8478391572833062, + "epoch": 0.5666640059084332, + "grad_norm": 3.9323177337646484, + "learning_rate": 3.360710119529539e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8672307848930358, + "num_tokens": 219946952.0, + "step": 182800 + }, + { + "entropy": 1.8906191244721413, + "epoch": 0.5666950050334829, + "grad_norm": 7.394486904144287, + "learning_rate": 3.3606181996546943e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8540799006819725, + "num_tokens": 219958787.0, + "step": 182810 + }, + { + "entropy": 1.8989437848329545, + "epoch": 0.5667260041585326, + "grad_norm": 8.674349784851074, + "learning_rate": 3.3605262873218285e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8600521177053452, + "num_tokens": 219970417.0, + "step": 182820 + }, + { + "entropy": 1.8586945995688438, + "epoch": 0.5667570032835824, + "grad_norm": 3.3568546772003174, + "learning_rate": 3.3604343825299098e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8593952640891075, + "num_tokens": 219982749.0, + "step": 182830 + }, + { + "entropy": 1.880942103266716, + "epoch": 0.566788002408632, + "grad_norm": 4.652389049530029, + "learning_rate": 3.360342485277907e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8370317235589028, + "num_tokens": 219995175.0, + "step": 182840 + }, + { + "entropy": 1.9068979054689408, + "epoch": 0.5668190015336817, + "grad_norm": 8.30090045928955, + "learning_rate": 3.360250595564789e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8553275540471077, + "num_tokens": 220006219.0, + "step": 182850 + }, + { + "entropy": 1.9159462600946426, + "epoch": 0.5668500006587314, + "grad_norm": 8.318293571472168, + "learning_rate": 3.3601587133895264e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8619872912764549, + "num_tokens": 220018022.0, + "step": 182860 + }, + { + "entropy": 1.9070228204131126, + "epoch": 0.5668809997837811, + "grad_norm": 8.146712303161621, + "learning_rate": 3.360066838751088e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.852077630162239, + "num_tokens": 220030045.0, + "step": 182870 + }, + { + "entropy": 1.8928291469812393, + "epoch": 0.5669119989088308, + "grad_norm": 8.228689193725586, + "learning_rate": 3.359974971648442e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8475547671318054, + "num_tokens": 220042402.0, + "step": 182880 + }, + { + "entropy": 1.9087701261043548, + "epoch": 0.5669429980338805, + "grad_norm": 4.87774658203125, + "learning_rate": 3.359883112080561e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8584561154246331, + "num_tokens": 220054088.0, + "step": 182890 + }, + { + "entropy": 1.9589183256030083, + "epoch": 0.5669739971589302, + "grad_norm": 8.75279426574707, + "learning_rate": 3.359791260046413e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8392207562923432, + "num_tokens": 220065452.0, + "step": 182900 + }, + { + "entropy": 1.8772664383053779, + "epoch": 0.5670049962839799, + "grad_norm": 7.904957294464111, + "learning_rate": 3.3596994155449686e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8565537855029106, + "num_tokens": 220078156.0, + "step": 182910 + }, + { + "entropy": 1.904349359869957, + "epoch": 0.5670359954090296, + "grad_norm": 7.1998491287231445, + "learning_rate": 3.359607578575199e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8597839877009392, + "num_tokens": 220089673.0, + "step": 182920 + }, + { + "entropy": 1.9074815943837167, + "epoch": 0.5670669945340793, + "grad_norm": 9.475333213806152, + "learning_rate": 3.3595157491360746e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.864620278775692, + "num_tokens": 220101476.0, + "step": 182930 + }, + { + "entropy": 1.909410683810711, + "epoch": 0.5670979936591289, + "grad_norm": 3.6434812545776367, + "learning_rate": 3.3594239272265657e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8497651174664498, + "num_tokens": 220112593.0, + "step": 182940 + }, + { + "entropy": 1.8805428713560104, + "epoch": 0.5671289927841787, + "grad_norm": 4.920924663543701, + "learning_rate": 3.359332112845644e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8507418289780617, + "num_tokens": 220124868.0, + "step": 182950 + }, + { + "entropy": 1.7920621424913405, + "epoch": 0.5671599919092284, + "grad_norm": 7.2576069831848145, + "learning_rate": 3.359240305992281e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8677036970853805, + "num_tokens": 220138016.0, + "step": 182960 + }, + { + "entropy": 1.9227833956480027, + "epoch": 0.5671909910342781, + "grad_norm": 6.757200241088867, + "learning_rate": 3.3591485066654474e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8580241650342941, + "num_tokens": 220149184.0, + "step": 182970 + }, + { + "entropy": 1.9060191959142685, + "epoch": 0.5672219901593277, + "grad_norm": 8.616246223449707, + "learning_rate": 3.3590567148641155e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8536302790045738, + "num_tokens": 220161347.0, + "step": 182980 + }, + { + "entropy": 1.849088190495968, + "epoch": 0.5672529892843774, + "grad_norm": 3.9549193382263184, + "learning_rate": 3.3589649305872564e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.860573235154152, + "num_tokens": 220173832.0, + "step": 182990 + }, + { + "entropy": 1.9189164862036705, + "epoch": 0.5672839884094272, + "grad_norm": 8.847037315368652, + "learning_rate": 3.3588731538338426e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8600964099168777, + "num_tokens": 220185558.0, + "step": 183000 + }, + { + "entropy": 1.908226054906845, + "epoch": 0.5673149875344768, + "grad_norm": 7.739184379577637, + "learning_rate": 3.3587813846028462e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8561240792274475, + "num_tokens": 220197401.0, + "step": 183010 + }, + { + "entropy": 1.9531918078660966, + "epoch": 0.5673459866595265, + "grad_norm": 9.055732727050781, + "learning_rate": 3.35868962289324e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8516348704695702, + "num_tokens": 220208127.0, + "step": 183020 + }, + { + "entropy": 1.931714966893196, + "epoch": 0.5673769857845762, + "grad_norm": 8.877476692199707, + "learning_rate": 3.3585978687039964e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8567020818591118, + "num_tokens": 220218719.0, + "step": 183030 + }, + { + "entropy": 1.8644310608506203, + "epoch": 0.567407984909626, + "grad_norm": 10.3399019241333, + "learning_rate": 3.358506122034088e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8704071551561355, + "num_tokens": 220230842.0, + "step": 183040 + }, + { + "entropy": 1.9125615239143372, + "epoch": 0.5674389840346756, + "grad_norm": 7.946232795715332, + "learning_rate": 3.358414382882489e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.852666375041008, + "num_tokens": 220242813.0, + "step": 183050 + }, + { + "entropy": 1.9153546869754792, + "epoch": 0.5674699831597253, + "grad_norm": 3.3038265705108643, + "learning_rate": 3.3583226512481703e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8458715483546257, + "num_tokens": 220254735.0, + "step": 183060 + }, + { + "entropy": 1.884465442597866, + "epoch": 0.567500982284775, + "grad_norm": 7.026264667510986, + "learning_rate": 3.3582309271301076e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.860778984427452, + "num_tokens": 220266193.0, + "step": 183070 + }, + { + "entropy": 1.975827944278717, + "epoch": 0.5675319814098247, + "grad_norm": 8.726240158081055, + "learning_rate": 3.3581392105272736e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8460676193237304, + "num_tokens": 220276810.0, + "step": 183080 + }, + { + "entropy": 1.8663788139820099, + "epoch": 0.5675629805348744, + "grad_norm": 9.79867172241211, + "learning_rate": 3.3580475014386416e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8591655671596528, + "num_tokens": 220289125.0, + "step": 183090 + }, + { + "entropy": 1.9003127381205558, + "epoch": 0.5675939796599241, + "grad_norm": 3.7742018699645996, + "learning_rate": 3.357955799863186e-06, + "loss": 0.466, + "mean_token_accuracy": 0.860464958846569, + "num_tokens": 220300198.0, + "step": 183100 + }, + { + "entropy": 1.891619788110256, + "epoch": 0.5676249787849738, + "grad_norm": 8.731237411499023, + "learning_rate": 3.3578641057998818e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8640395820140838, + "num_tokens": 220311070.0, + "step": 183110 + }, + { + "entropy": 1.9159280955791473, + "epoch": 0.5676559779100235, + "grad_norm": 8.504816055297852, + "learning_rate": 3.3577724192477028e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8534823596477509, + "num_tokens": 220322812.0, + "step": 183120 + }, + { + "entropy": 1.9482707679271698, + "epoch": 0.5676869770350732, + "grad_norm": 9.165209770202637, + "learning_rate": 3.3576807402056232e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8688961327075958, + "num_tokens": 220333449.0, + "step": 183130 + }, + { + "entropy": 1.874471677839756, + "epoch": 0.5677179761601229, + "grad_norm": 9.139283180236816, + "learning_rate": 3.3575890686726183e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8565481066703796, + "num_tokens": 220345253.0, + "step": 183140 + }, + { + "entropy": 1.8991287276148796, + "epoch": 0.5677489752851725, + "grad_norm": 8.980961799621582, + "learning_rate": 3.3574974046476634e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8628603085875511, + "num_tokens": 220356474.0, + "step": 183150 + }, + { + "entropy": 1.8723631590604781, + "epoch": 0.5677799744102223, + "grad_norm": 3.9973182678222656, + "learning_rate": 3.3574057481297323e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8480863317847251, + "num_tokens": 220368857.0, + "step": 183160 + }, + { + "entropy": 1.7563340038061142, + "epoch": 0.567810973535272, + "grad_norm": 8.3080472946167, + "learning_rate": 3.3573140991178016e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8686195388436317, + "num_tokens": 220382271.0, + "step": 183170 + }, + { + "entropy": 1.9225044384598733, + "epoch": 0.5678419726603217, + "grad_norm": 8.145635604858398, + "learning_rate": 3.3572224576108474e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8540999129414558, + "num_tokens": 220394223.0, + "step": 183180 + }, + { + "entropy": 1.8276883363723755, + "epoch": 0.5678729717853713, + "grad_norm": 7.337801456451416, + "learning_rate": 3.3571308236078437e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8690663799643517, + "num_tokens": 220407171.0, + "step": 183190 + }, + { + "entropy": 1.9450956612825394, + "epoch": 0.5679039709104211, + "grad_norm": 8.292513847351074, + "learning_rate": 3.3570391971077676e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8465719684958458, + "num_tokens": 220418012.0, + "step": 183200 + }, + { + "entropy": 1.8319994553923606, + "epoch": 0.5679349700354708, + "grad_norm": 8.656058311462402, + "learning_rate": 3.3569475781095955e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.861232291162014, + "num_tokens": 220431041.0, + "step": 183210 + }, + { + "entropy": 1.8853465974330903, + "epoch": 0.5679659691605204, + "grad_norm": 8.341654777526855, + "learning_rate": 3.356855966612303e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8643447190523148, + "num_tokens": 220442375.0, + "step": 183220 + }, + { + "entropy": 1.9103177651762961, + "epoch": 0.5679969682855701, + "grad_norm": 9.43069076538086, + "learning_rate": 3.3567643626148666e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8563966482877732, + "num_tokens": 220454284.0, + "step": 183230 + }, + { + "entropy": 1.9459126323461533, + "epoch": 0.5680279674106198, + "grad_norm": 9.42568302154541, + "learning_rate": 3.3566727661162647e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8557520717382431, + "num_tokens": 220466716.0, + "step": 183240 + }, + { + "entropy": 1.9446142554283141, + "epoch": 0.5680589665356696, + "grad_norm": 7.449951648712158, + "learning_rate": 3.3565811771154718e-06, + "loss": 0.5724, + "mean_token_accuracy": 0.8319184750318527, + "num_tokens": 220478467.0, + "step": 183250 + }, + { + "entropy": 1.8678555905818939, + "epoch": 0.5680899656607192, + "grad_norm": 9.808244705200195, + "learning_rate": 3.3564895956114668e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8574158445000648, + "num_tokens": 220490488.0, + "step": 183260 + }, + { + "entropy": 1.8901596933603286, + "epoch": 0.5681209647857689, + "grad_norm": 8.877074241638184, + "learning_rate": 3.3563980216032265e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8529431104660035, + "num_tokens": 220502198.0, + "step": 183270 + }, + { + "entropy": 1.8711446061730386, + "epoch": 0.5681519639108186, + "grad_norm": 6.931746482849121, + "learning_rate": 3.3563064550897285e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8635139778256417, + "num_tokens": 220513840.0, + "step": 183280 + }, + { + "entropy": 1.7962354853749276, + "epoch": 0.5681829630358683, + "grad_norm": 7.381433963775635, + "learning_rate": 3.356214896069951e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8718640059232712, + "num_tokens": 220527216.0, + "step": 183290 + }, + { + "entropy": 1.89014650657773, + "epoch": 0.568213962160918, + "grad_norm": 10.772461891174316, + "learning_rate": 3.3561233445428705e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8503888994455338, + "num_tokens": 220539151.0, + "step": 183300 + }, + { + "entropy": 1.925488579273224, + "epoch": 0.5682449612859677, + "grad_norm": 8.212474822998047, + "learning_rate": 3.3560318005074654e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8614701345562935, + "num_tokens": 220550011.0, + "step": 183310 + }, + { + "entropy": 1.9450786262750626, + "epoch": 0.5682759604110174, + "grad_norm": 8.625468254089355, + "learning_rate": 3.355940263962716e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8568323805928231, + "num_tokens": 220560733.0, + "step": 183320 + }, + { + "entropy": 1.8635306119918824, + "epoch": 0.5683069595360671, + "grad_norm": 3.6268012523651123, + "learning_rate": 3.3558487349075984e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8642154484987259, + "num_tokens": 220573023.0, + "step": 183330 + }, + { + "entropy": 1.8082802429795266, + "epoch": 0.5683379586611168, + "grad_norm": 2.5699939727783203, + "learning_rate": 3.355757213341093e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8663254842162132, + "num_tokens": 220586830.0, + "step": 183340 + }, + { + "entropy": 1.9095750093460082, + "epoch": 0.5683689577861665, + "grad_norm": 8.66707706451416, + "learning_rate": 3.355665699262178e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8577072054147721, + "num_tokens": 220599167.0, + "step": 183350 + }, + { + "entropy": 1.8929280027747155, + "epoch": 0.5683999569112161, + "grad_norm": 4.26317834854126, + "learning_rate": 3.355574192669832e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.853906175494194, + "num_tokens": 220611085.0, + "step": 183360 + }, + { + "entropy": 1.968779844045639, + "epoch": 0.5684309560362659, + "grad_norm": 7.8538007736206055, + "learning_rate": 3.355482693563035e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8414223611354827, + "num_tokens": 220621870.0, + "step": 183370 + }, + { + "entropy": 1.8689953058958053, + "epoch": 0.5684619551613156, + "grad_norm": 7.304516315460205, + "learning_rate": 3.3553912019407663e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8525772273540497, + "num_tokens": 220634736.0, + "step": 183380 + }, + { + "entropy": 1.9347944945096969, + "epoch": 0.5684929542863653, + "grad_norm": 9.088506698608398, + "learning_rate": 3.3552997178020064e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8514841318130493, + "num_tokens": 220646056.0, + "step": 183390 + }, + { + "entropy": 1.9211686462163926, + "epoch": 0.5685239534114149, + "grad_norm": 8.135913848876953, + "learning_rate": 3.355208241145733e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8514508843421936, + "num_tokens": 220658047.0, + "step": 183400 + }, + { + "entropy": 1.908077448606491, + "epoch": 0.5685549525364647, + "grad_norm": 3.7041707038879395, + "learning_rate": 3.3551167719709283e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8491874933242798, + "num_tokens": 220670530.0, + "step": 183410 + }, + { + "entropy": 1.8245035156607627, + "epoch": 0.5685859516615144, + "grad_norm": 8.679533958435059, + "learning_rate": 3.3550253102765717e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8612040519714356, + "num_tokens": 220682946.0, + "step": 183420 + }, + { + "entropy": 1.9670052647590637, + "epoch": 0.568616950786564, + "grad_norm": 8.43134593963623, + "learning_rate": 3.3549338560616435e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8479528352618217, + "num_tokens": 220694045.0, + "step": 183430 + }, + { + "entropy": 1.7410773530602455, + "epoch": 0.5686479499116137, + "grad_norm": 7.65750789642334, + "learning_rate": 3.354842409325125e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8736346036195755, + "num_tokens": 220707673.0, + "step": 183440 + }, + { + "entropy": 1.852739727497101, + "epoch": 0.5686789490366635, + "grad_norm": 7.290695667266846, + "learning_rate": 3.3547509700659964e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8603045761585235, + "num_tokens": 220720156.0, + "step": 183450 + }, + { + "entropy": 1.8762595668435096, + "epoch": 0.5687099481617132, + "grad_norm": 8.566794395446777, + "learning_rate": 3.3546595382832387e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8579180598258972, + "num_tokens": 220732483.0, + "step": 183460 + }, + { + "entropy": 1.9345880046486854, + "epoch": 0.5687409472867628, + "grad_norm": 9.06840991973877, + "learning_rate": 3.354568113975834e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8575801998376846, + "num_tokens": 220743293.0, + "step": 183470 + }, + { + "entropy": 1.9276087015867234, + "epoch": 0.5687719464118125, + "grad_norm": 7.404775619506836, + "learning_rate": 3.354476697142762e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8662512883543968, + "num_tokens": 220754912.0, + "step": 183480 + }, + { + "entropy": 1.9424118250608444, + "epoch": 0.5688029455368622, + "grad_norm": 6.11152982711792, + "learning_rate": 3.3543852877830067e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8560282364487648, + "num_tokens": 220766123.0, + "step": 183490 + }, + { + "entropy": 1.8890651270747185, + "epoch": 0.568833944661912, + "grad_norm": 8.704337120056152, + "learning_rate": 3.354293885895549e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.855229164659977, + "num_tokens": 220778032.0, + "step": 183500 + }, + { + "entropy": 1.7918873026967048, + "epoch": 0.5688649437869616, + "grad_norm": 5.357420444488525, + "learning_rate": 3.3542024914793692e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8629763871431351, + "num_tokens": 220791558.0, + "step": 183510 + }, + { + "entropy": 1.8693130612373352, + "epoch": 0.5688959429120113, + "grad_norm": 3.724578380584717, + "learning_rate": 3.3541111045334514e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8646483674645424, + "num_tokens": 220803255.0, + "step": 183520 + }, + { + "entropy": 1.8977489918470383, + "epoch": 0.568926942037061, + "grad_norm": 6.983435153961182, + "learning_rate": 3.3540197250567773e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8517172873020172, + "num_tokens": 220814834.0, + "step": 183530 + }, + { + "entropy": 1.9508653551340103, + "epoch": 0.5689579411621107, + "grad_norm": 7.906034469604492, + "learning_rate": 3.35392835304833e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8508275121450424, + "num_tokens": 220825869.0, + "step": 183540 + }, + { + "entropy": 1.8913847014307976, + "epoch": 0.5689889402871604, + "grad_norm": 3.185633659362793, + "learning_rate": 3.3538369885070923e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8732113286852836, + "num_tokens": 220837783.0, + "step": 183550 + }, + { + "entropy": 1.914082932472229, + "epoch": 0.5690199394122101, + "grad_norm": 6.322134494781494, + "learning_rate": 3.3537456314320467e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8633823007345199, + "num_tokens": 220848662.0, + "step": 183560 + }, + { + "entropy": 1.9293600648641587, + "epoch": 0.5690509385372597, + "grad_norm": 8.116273880004883, + "learning_rate": 3.3536542818221757e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8529650032520294, + "num_tokens": 220859719.0, + "step": 183570 + }, + { + "entropy": 1.9193365216255187, + "epoch": 0.5690819376623095, + "grad_norm": 7.5625996589660645, + "learning_rate": 3.353562939676464e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8576421469449997, + "num_tokens": 220870811.0, + "step": 183580 + }, + { + "entropy": 1.842278851568699, + "epoch": 0.5691129367873592, + "grad_norm": 8.025686264038086, + "learning_rate": 3.353471604993895e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8678732305765152, + "num_tokens": 220882929.0, + "step": 183590 + }, + { + "entropy": 1.9584626644849776, + "epoch": 0.5691439359124089, + "grad_norm": 9.167868614196777, + "learning_rate": 3.3533802777734523e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8444946691393852, + "num_tokens": 220893487.0, + "step": 183600 + }, + { + "entropy": 1.8276565596461296, + "epoch": 0.5691749350374585, + "grad_norm": 3.9966540336608887, + "learning_rate": 3.3532889580141193e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8683551594614982, + "num_tokens": 220905974.0, + "step": 183610 + }, + { + "entropy": 1.893023744225502, + "epoch": 0.5692059341625083, + "grad_norm": 7.548893451690674, + "learning_rate": 3.3531976457148803e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8617466360330581, + "num_tokens": 220917499.0, + "step": 183620 + }, + { + "entropy": 1.9760723441839219, + "epoch": 0.569236933287558, + "grad_norm": 7.412717342376709, + "learning_rate": 3.35310634087472e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8563466891646385, + "num_tokens": 220928532.0, + "step": 183630 + }, + { + "entropy": 1.8784570693969727, + "epoch": 0.5692679324126076, + "grad_norm": 9.210555076599121, + "learning_rate": 3.353015043492623e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8611171096563339, + "num_tokens": 220941186.0, + "step": 183640 + }, + { + "entropy": 1.8664739981293679, + "epoch": 0.5692989315376573, + "grad_norm": 8.183813095092773, + "learning_rate": 3.352923753567574e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8593077078461647, + "num_tokens": 220952736.0, + "step": 183650 + }, + { + "entropy": 1.9031253203749656, + "epoch": 0.5693299306627071, + "grad_norm": 8.010009765625, + "learning_rate": 3.352832471098557e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8554147705435753, + "num_tokens": 220964870.0, + "step": 183660 + }, + { + "entropy": 1.9231388315558433, + "epoch": 0.5693609297877568, + "grad_norm": 7.923822402954102, + "learning_rate": 3.352741196084558e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8455759003758431, + "num_tokens": 220976650.0, + "step": 183670 + }, + { + "entropy": 1.9567074686288835, + "epoch": 0.5693919289128064, + "grad_norm": 8.853241920471191, + "learning_rate": 3.352649928524562e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8528679683804512, + "num_tokens": 220987807.0, + "step": 183680 + }, + { + "entropy": 1.812475062906742, + "epoch": 0.5694229280378561, + "grad_norm": 4.6928606033325195, + "learning_rate": 3.3525586684175554e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8690725296735764, + "num_tokens": 221001512.0, + "step": 183690 + }, + { + "entropy": 1.9027960404753685, + "epoch": 0.5694539271629059, + "grad_norm": 8.04566764831543, + "learning_rate": 3.3524674157625238e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8378259748220444, + "num_tokens": 221013722.0, + "step": 183700 + }, + { + "entropy": 1.877773503959179, + "epoch": 0.5694849262879556, + "grad_norm": 7.116755485534668, + "learning_rate": 3.3523761705584506e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8673833280801773, + "num_tokens": 221025952.0, + "step": 183710 + }, + { + "entropy": 1.880093328654766, + "epoch": 0.5695159254130052, + "grad_norm": 7.747344017028809, + "learning_rate": 3.3522849328043243e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8541530668735504, + "num_tokens": 221037662.0, + "step": 183720 + }, + { + "entropy": 1.8805700927972793, + "epoch": 0.5695469245380549, + "grad_norm": 7.554903507232666, + "learning_rate": 3.352193702499131e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8540754541754723, + "num_tokens": 221049646.0, + "step": 183730 + }, + { + "entropy": 1.8181962668895721, + "epoch": 0.5695779236631046, + "grad_norm": 8.619624137878418, + "learning_rate": 3.352102479641857e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8621390506625175, + "num_tokens": 221062839.0, + "step": 183740 + }, + { + "entropy": 1.9224794439971447, + "epoch": 0.5696089227881543, + "grad_norm": 8.067683219909668, + "learning_rate": 3.3520112642314882e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8606799691915512, + "num_tokens": 221074434.0, + "step": 183750 + }, + { + "entropy": 1.9693562895059586, + "epoch": 0.569639921913204, + "grad_norm": 10.010885238647461, + "learning_rate": 3.3519200562670123e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.8417602896690368, + "num_tokens": 221085251.0, + "step": 183760 + }, + { + "entropy": 1.963064904510975, + "epoch": 0.5696709210382537, + "grad_norm": 9.861781120300293, + "learning_rate": 3.3518288557474155e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8481303885579109, + "num_tokens": 221096618.0, + "step": 183770 + }, + { + "entropy": 1.9917975157499312, + "epoch": 0.5697019201633033, + "grad_norm": 8.955374717712402, + "learning_rate": 3.3517376626716858e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8551626339554786, + "num_tokens": 221107422.0, + "step": 183780 + }, + { + "entropy": 1.800957126915455, + "epoch": 0.5697329192883531, + "grad_norm": 7.669996738433838, + "learning_rate": 3.3516464770388104e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8650017514824867, + "num_tokens": 221120677.0, + "step": 183790 + }, + { + "entropy": 1.8783799439668656, + "epoch": 0.5697639184134028, + "grad_norm": 7.118563652038574, + "learning_rate": 3.351555298847777e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8592517286539078, + "num_tokens": 221133188.0, + "step": 183800 + }, + { + "entropy": 1.9285740569233893, + "epoch": 0.5697949175384525, + "grad_norm": 8.174911499023438, + "learning_rate": 3.351464128097573e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8508363962173462, + "num_tokens": 221145365.0, + "step": 183810 + }, + { + "entropy": 1.8228991374373436, + "epoch": 0.5698259166635021, + "grad_norm": 7.459652423858643, + "learning_rate": 3.3513729647871875e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8594255059957504, + "num_tokens": 221158893.0, + "step": 183820 + }, + { + "entropy": 1.8317522302269935, + "epoch": 0.5698569157885519, + "grad_norm": 8.87353515625, + "learning_rate": 3.3512818089156067e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8580220609903335, + "num_tokens": 221172123.0, + "step": 183830 + }, + { + "entropy": 1.8688715621829033, + "epoch": 0.5698879149136016, + "grad_norm": 8.272303581237793, + "learning_rate": 3.3511906604818205e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8654984638094902, + "num_tokens": 221183785.0, + "step": 183840 + }, + { + "entropy": 1.9467407405376433, + "epoch": 0.5699189140386512, + "grad_norm": 8.615777969360352, + "learning_rate": 3.351099519484818e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8512974575161933, + "num_tokens": 221194815.0, + "step": 183850 + }, + { + "entropy": 1.7814303100109101, + "epoch": 0.5699499131637009, + "grad_norm": 8.84643268585205, + "learning_rate": 3.3510083859235865e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8647480860352517, + "num_tokens": 221208640.0, + "step": 183860 + }, + { + "entropy": 1.9105361938476562, + "epoch": 0.5699809122887507, + "grad_norm": 4.086887836456299, + "learning_rate": 3.3509172597971156e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8670039981603622, + "num_tokens": 221220350.0, + "step": 183870 + }, + { + "entropy": 1.929616743326187, + "epoch": 0.5700119114138004, + "grad_norm": 10.38846492767334, + "learning_rate": 3.350826141104395e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.861053255200386, + "num_tokens": 221231474.0, + "step": 183880 + }, + { + "entropy": 1.879753015935421, + "epoch": 0.57004291053885, + "grad_norm": 8.4418306350708, + "learning_rate": 3.3507350298444134e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8551955938339233, + "num_tokens": 221243202.0, + "step": 183890 + }, + { + "entropy": 1.872837108373642, + "epoch": 0.5700739096638997, + "grad_norm": 4.017162322998047, + "learning_rate": 3.3506439260161606e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8521816402673721, + "num_tokens": 221256087.0, + "step": 183900 + }, + { + "entropy": 1.8944094195961951, + "epoch": 0.5701049087889495, + "grad_norm": 7.516350746154785, + "learning_rate": 3.3505528296186263e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8568247303366661, + "num_tokens": 221267912.0, + "step": 183910 + }, + { + "entropy": 1.8976934224367141, + "epoch": 0.5701359079139992, + "grad_norm": 6.143962383270264, + "learning_rate": 3.3504617406508e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.8466740503907204, + "num_tokens": 221279968.0, + "step": 183920 + }, + { + "entropy": 1.9248884186148643, + "epoch": 0.5701669070390488, + "grad_norm": 9.086318969726562, + "learning_rate": 3.350370659111672e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8509233519434929, + "num_tokens": 221292101.0, + "step": 183930 + }, + { + "entropy": 1.8520910263061523, + "epoch": 0.5701979061640985, + "grad_norm": 8.545215606689453, + "learning_rate": 3.3502795850002332e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8515016883611679, + "num_tokens": 221304057.0, + "step": 183940 + }, + { + "entropy": 1.9516564026474952, + "epoch": 0.5702289052891483, + "grad_norm": 7.543873310089111, + "learning_rate": 3.3501885183154742e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8392630383372307, + "num_tokens": 221315295.0, + "step": 183950 + }, + { + "entropy": 1.9031856134533882, + "epoch": 0.5702599044141979, + "grad_norm": 8.644308090209961, + "learning_rate": 3.350097459056385e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8548777773976326, + "num_tokens": 221326997.0, + "step": 183960 + }, + { + "entropy": 1.8481758192181588, + "epoch": 0.5702909035392476, + "grad_norm": 6.953856945037842, + "learning_rate": 3.3500064072219567e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8590131789445877, + "num_tokens": 221339994.0, + "step": 183970 + }, + { + "entropy": 1.8460357397794724, + "epoch": 0.5703219026642973, + "grad_norm": 3.499201536178589, + "learning_rate": 3.34991536281118e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8534980297088623, + "num_tokens": 221352932.0, + "step": 183980 + }, + { + "entropy": 1.874594159424305, + "epoch": 0.5703529017893469, + "grad_norm": 8.420859336853027, + "learning_rate": 3.349824325823047e-06, + "loss": 0.429, + "mean_token_accuracy": 0.858637835085392, + "num_tokens": 221365971.0, + "step": 183990 + }, + { + "entropy": 1.8195825964212418, + "epoch": 0.5703839009143967, + "grad_norm": 8.116730690002441, + "learning_rate": 3.349733296256548e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.879566079378128, + "num_tokens": 221379522.0, + "step": 184000 + }, + { + "entropy": 1.9340773046016693, + "epoch": 0.5704149000394464, + "grad_norm": 7.0886030197143555, + "learning_rate": 3.349642274110677e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8608402445912361, + "num_tokens": 221390338.0, + "step": 184010 + }, + { + "entropy": 1.8692950300872326, + "epoch": 0.5704458991644961, + "grad_norm": 7.241457939147949, + "learning_rate": 3.3495512593844233e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8651369750499726, + "num_tokens": 221402614.0, + "step": 184020 + }, + { + "entropy": 1.9052888661623002, + "epoch": 0.5704768982895457, + "grad_norm": 9.200166702270508, + "learning_rate": 3.34946025207678e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8558596163988114, + "num_tokens": 221414238.0, + "step": 184030 + }, + { + "entropy": 1.8719256192445755, + "epoch": 0.5705078974145955, + "grad_norm": 7.344235420227051, + "learning_rate": 3.349369252186739e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.858357360959053, + "num_tokens": 221426450.0, + "step": 184040 + }, + { + "entropy": 1.8825799271464347, + "epoch": 0.5705388965396452, + "grad_norm": 8.594145774841309, + "learning_rate": 3.3492782597132935e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8614272058010102, + "num_tokens": 221438819.0, + "step": 184050 + }, + { + "entropy": 1.9068335115909576, + "epoch": 0.5705698956646948, + "grad_norm": 3.7201220989227295, + "learning_rate": 3.349187274655436e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8533118292689323, + "num_tokens": 221450339.0, + "step": 184060 + }, + { + "entropy": 1.7704555153846742, + "epoch": 0.5706008947897445, + "grad_norm": 5.395776748657227, + "learning_rate": 3.349096297012158e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8651518598198891, + "num_tokens": 221464112.0, + "step": 184070 + }, + { + "entropy": 1.8785992763936519, + "epoch": 0.5706318939147943, + "grad_norm": 7.935133934020996, + "learning_rate": 3.3490053267824535e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8562982544302941, + "num_tokens": 221476400.0, + "step": 184080 + }, + { + "entropy": 1.9128589004278183, + "epoch": 0.570662893039844, + "grad_norm": 8.40445613861084, + "learning_rate": 3.348914363965316e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8426511645317077, + "num_tokens": 221488509.0, + "step": 184090 + }, + { + "entropy": 1.8774797767400742, + "epoch": 0.5706938921648936, + "grad_norm": 6.458841800689697, + "learning_rate": 3.3488234085597382e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8713184729218483, + "num_tokens": 221500300.0, + "step": 184100 + }, + { + "entropy": 1.862789809703827, + "epoch": 0.5707248912899433, + "grad_norm": 8.445926666259766, + "learning_rate": 3.348732460564714e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8612472444772721, + "num_tokens": 221512682.0, + "step": 184110 + }, + { + "entropy": 1.9176673144102097, + "epoch": 0.5707558904149931, + "grad_norm": 8.715554237365723, + "learning_rate": 3.348641519979238e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8517135217785835, + "num_tokens": 221524087.0, + "step": 184120 + }, + { + "entropy": 1.8521241903305055, + "epoch": 0.5707868895400428, + "grad_norm": 3.694570541381836, + "learning_rate": 3.3485505868023025e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8623161077499389, + "num_tokens": 221536141.0, + "step": 184130 + }, + { + "entropy": 1.94044778496027, + "epoch": 0.5708178886650924, + "grad_norm": 7.394150733947754, + "learning_rate": 3.3484596610329025e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8515927508473397, + "num_tokens": 221547849.0, + "step": 184140 + }, + { + "entropy": 1.9291246131062507, + "epoch": 0.5708488877901421, + "grad_norm": 4.240436553955078, + "learning_rate": 3.348368742670032e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8608126997947693, + "num_tokens": 221558692.0, + "step": 184150 + }, + { + "entropy": 1.9156172186136247, + "epoch": 0.5708798869151919, + "grad_norm": 6.2478742599487305, + "learning_rate": 3.3482778317126867e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8602596551179886, + "num_tokens": 221570280.0, + "step": 184160 + }, + { + "entropy": 1.8546249985694885, + "epoch": 0.5709108860402415, + "grad_norm": 4.608593463897705, + "learning_rate": 3.3481869281598605e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.862372313439846, + "num_tokens": 221583436.0, + "step": 184170 + }, + { + "entropy": 1.8053992852568626, + "epoch": 0.5709418851652912, + "grad_norm": 4.155288219451904, + "learning_rate": 3.348096032010548e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8623798578977585, + "num_tokens": 221596995.0, + "step": 184180 + }, + { + "entropy": 1.9127266079187393, + "epoch": 0.5709728842903409, + "grad_norm": 8.820666313171387, + "learning_rate": 3.348005143263744e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8540406107902527, + "num_tokens": 221608475.0, + "step": 184190 + }, + { + "entropy": 1.9034708708524704, + "epoch": 0.5710038834153905, + "grad_norm": 8.682132720947266, + "learning_rate": 3.3479142619184447e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8417846232652664, + "num_tokens": 221620246.0, + "step": 184200 + }, + { + "entropy": 1.8972658574581147, + "epoch": 0.5710348825404403, + "grad_norm": 8.441329002380371, + "learning_rate": 3.3478233879736455e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8526045650243759, + "num_tokens": 221631407.0, + "step": 184210 + }, + { + "entropy": 1.9070860490202903, + "epoch": 0.57106588166549, + "grad_norm": 7.873006343841553, + "learning_rate": 3.347732521428342e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8503958001732826, + "num_tokens": 221643299.0, + "step": 184220 + }, + { + "entropy": 1.9741833984851838, + "epoch": 0.5710968807905397, + "grad_norm": 8.034712791442871, + "learning_rate": 3.3476416622815293e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8505676060914993, + "num_tokens": 221654730.0, + "step": 184230 + }, + { + "entropy": 1.8234672516584396, + "epoch": 0.5711278799155893, + "grad_norm": 9.788541793823242, + "learning_rate": 3.3475508105322048e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8619481399655342, + "num_tokens": 221668242.0, + "step": 184240 + }, + { + "entropy": 1.9055007576942444, + "epoch": 0.5711588790406391, + "grad_norm": 9.112692832946777, + "learning_rate": 3.3474599661793634e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8560148790478707, + "num_tokens": 221679959.0, + "step": 184250 + }, + { + "entropy": 1.8885867521166801, + "epoch": 0.5711898781656888, + "grad_norm": 8.689698219299316, + "learning_rate": 3.347369129222002e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8601107582449913, + "num_tokens": 221691999.0, + "step": 184260 + }, + { + "entropy": 1.9156020715832711, + "epoch": 0.5712208772907384, + "grad_norm": 8.803537368774414, + "learning_rate": 3.3472782996591185e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8553341671824455, + "num_tokens": 221703723.0, + "step": 184270 + }, + { + "entropy": 1.9182790204882623, + "epoch": 0.5712518764157881, + "grad_norm": 8.862181663513184, + "learning_rate": 3.347187477489707e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8531744465231895, + "num_tokens": 221714671.0, + "step": 184280 + }, + { + "entropy": 1.9861332833766938, + "epoch": 0.5712828755408379, + "grad_norm": 7.757084369659424, + "learning_rate": 3.3470966627127677e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8418697059154511, + "num_tokens": 221725356.0, + "step": 184290 + }, + { + "entropy": 1.858552846312523, + "epoch": 0.5713138746658876, + "grad_norm": 6.145276069641113, + "learning_rate": 3.347005855327296e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8689824387431144, + "num_tokens": 221737404.0, + "step": 184300 + }, + { + "entropy": 1.7062400430440903, + "epoch": 0.5713448737909372, + "grad_norm": 9.089742660522461, + "learning_rate": 3.3469150553322895e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8730680406093597, + "num_tokens": 221751518.0, + "step": 184310 + }, + { + "entropy": 1.8535742238163948, + "epoch": 0.5713758729159869, + "grad_norm": 7.534306049346924, + "learning_rate": 3.3468242627267454e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8658009812235832, + "num_tokens": 221763099.0, + "step": 184320 + }, + { + "entropy": 1.8937023341655732, + "epoch": 0.5714068720410367, + "grad_norm": 4.602698802947998, + "learning_rate": 3.346733477509662e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8501476779580116, + "num_tokens": 221774434.0, + "step": 184330 + }, + { + "entropy": 1.8975597500801087, + "epoch": 0.5714378711660864, + "grad_norm": 3.3787753582000732, + "learning_rate": 3.3466426996800364e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8754017919301986, + "num_tokens": 221786154.0, + "step": 184340 + }, + { + "entropy": 1.9251778170466423, + "epoch": 0.571468870291136, + "grad_norm": 7.641809463500977, + "learning_rate": 3.3465519292368692e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8496328294277191, + "num_tokens": 221797589.0, + "step": 184350 + }, + { + "entropy": 1.9057638555765153, + "epoch": 0.5714998694161857, + "grad_norm": 7.094670295715332, + "learning_rate": 3.3464611661791564e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8637410655617714, + "num_tokens": 221809125.0, + "step": 184360 + }, + { + "entropy": 1.9552569746971131, + "epoch": 0.5715308685412355, + "grad_norm": 8.105437278747559, + "learning_rate": 3.346370410505897e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.850199231505394, + "num_tokens": 221819459.0, + "step": 184370 + }, + { + "entropy": 1.9675395026803018, + "epoch": 0.5715618676662851, + "grad_norm": 8.247036933898926, + "learning_rate": 3.3462796622160893e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8617465287446976, + "num_tokens": 221830856.0, + "step": 184380 + }, + { + "entropy": 1.857323682308197, + "epoch": 0.5715928667913348, + "grad_norm": 7.871233940124512, + "learning_rate": 3.346188921308734e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8622802942991257, + "num_tokens": 221843358.0, + "step": 184390 + }, + { + "entropy": 1.8538252532482147, + "epoch": 0.5716238659163845, + "grad_norm": 7.213409900665283, + "learning_rate": 3.346098187782828e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.857527782022953, + "num_tokens": 221856160.0, + "step": 184400 + }, + { + "entropy": 1.850920520722866, + "epoch": 0.5716548650414343, + "grad_norm": 8.479315757751465, + "learning_rate": 3.346007461637373e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8530377745628357, + "num_tokens": 221868467.0, + "step": 184410 + }, + { + "entropy": 1.9420105203986169, + "epoch": 0.5716858641664839, + "grad_norm": 9.029580116271973, + "learning_rate": 3.345916742871366e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8558577001094818, + "num_tokens": 221879886.0, + "step": 184420 + }, + { + "entropy": 1.9024136036634445, + "epoch": 0.5717168632915336, + "grad_norm": 7.557387351989746, + "learning_rate": 3.345826031483808e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8580532029271126, + "num_tokens": 221892180.0, + "step": 184430 + }, + { + "entropy": 1.8506018549203873, + "epoch": 0.5717478624165833, + "grad_norm": 7.604180812835693, + "learning_rate": 3.3457353274736993e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8685560122132301, + "num_tokens": 221904480.0, + "step": 184440 + }, + { + "entropy": 1.9249087005853653, + "epoch": 0.5717788615416329, + "grad_norm": 6.4561238288879395, + "learning_rate": 3.3456446308400387e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8623852536082268, + "num_tokens": 221916316.0, + "step": 184450 + }, + { + "entropy": 1.9324004918336868, + "epoch": 0.5718098606666827, + "grad_norm": 8.638805389404297, + "learning_rate": 3.3455539415818272e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8507967680692673, + "num_tokens": 221926873.0, + "step": 184460 + }, + { + "entropy": 1.840507847070694, + "epoch": 0.5718408597917324, + "grad_norm": 3.3923275470733643, + "learning_rate": 3.345463259698065e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8589583113789558, + "num_tokens": 221938994.0, + "step": 184470 + }, + { + "entropy": 1.8590350210666657, + "epoch": 0.571871858916782, + "grad_norm": 7.706557273864746, + "learning_rate": 3.3453725851877535e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8596820175647736, + "num_tokens": 221951096.0, + "step": 184480 + }, + { + "entropy": 1.8945080533623695, + "epoch": 0.5719028580418317, + "grad_norm": 8.865853309631348, + "learning_rate": 3.3452819180498917e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8579330548644066, + "num_tokens": 221963513.0, + "step": 184490 + }, + { + "entropy": 1.7538821056485177, + "epoch": 0.5719338571668815, + "grad_norm": 4.122339248657227, + "learning_rate": 3.3451912582834833e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8662702322006226, + "num_tokens": 221977767.0, + "step": 184500 + }, + { + "entropy": 1.9670095562934875, + "epoch": 0.5719648562919312, + "grad_norm": 8.489312171936035, + "learning_rate": 3.345100605887527e-06, + "loss": 0.5325, + "mean_token_accuracy": 0.8418423220515251, + "num_tokens": 221989148.0, + "step": 184510 + }, + { + "entropy": 1.9049221113324166, + "epoch": 0.5719958554169808, + "grad_norm": 3.830526113510132, + "learning_rate": 3.345009960861025e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8524208962917328, + "num_tokens": 222001134.0, + "step": 184520 + }, + { + "entropy": 1.8578404620289803, + "epoch": 0.5720268545420305, + "grad_norm": 7.5913872718811035, + "learning_rate": 3.344919323202979e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8586911797523499, + "num_tokens": 222014081.0, + "step": 184530 + }, + { + "entropy": 1.8676321879029274, + "epoch": 0.5720578536670803, + "grad_norm": 4.292795658111572, + "learning_rate": 3.3448286929123913e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8599231988191605, + "num_tokens": 222026648.0, + "step": 184540 + }, + { + "entropy": 1.9369481548666954, + "epoch": 0.57208885279213, + "grad_norm": 8.17246150970459, + "learning_rate": 3.3447380699882633e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8523173183202744, + "num_tokens": 222037839.0, + "step": 184550 + }, + { + "entropy": 1.976218593120575, + "epoch": 0.5721198519171796, + "grad_norm": 7.205543041229248, + "learning_rate": 3.3446474544295966e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.8457430496811866, + "num_tokens": 222048700.0, + "step": 184560 + }, + { + "entropy": 1.9195255041122437, + "epoch": 0.5721508510422293, + "grad_norm": 8.929695129394531, + "learning_rate": 3.3445568462353943e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8410187095403672, + "num_tokens": 222059842.0, + "step": 184570 + }, + { + "entropy": 1.8937813118100166, + "epoch": 0.5721818501672791, + "grad_norm": 2.680063247680664, + "learning_rate": 3.3444662454046596e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.861142173409462, + "num_tokens": 222072051.0, + "step": 184580 + }, + { + "entropy": 1.8604128420352937, + "epoch": 0.5722128492923287, + "grad_norm": 8.521194458007812, + "learning_rate": 3.344375651936393e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8724682793021202, + "num_tokens": 222083791.0, + "step": 184590 + }, + { + "entropy": 1.85889263600111, + "epoch": 0.5722438484173784, + "grad_norm": 8.879921913146973, + "learning_rate": 3.3442850658295996e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8506242975592613, + "num_tokens": 222096815.0, + "step": 184600 + }, + { + "entropy": 1.9790507823228836, + "epoch": 0.5722748475424281, + "grad_norm": 9.367940902709961, + "learning_rate": 3.344194487083281e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8495551526546479, + "num_tokens": 222107446.0, + "step": 184610 + }, + { + "entropy": 1.8882742941379547, + "epoch": 0.5723058466674779, + "grad_norm": 7.1077494621276855, + "learning_rate": 3.3441039156964413e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8558852866291999, + "num_tokens": 222119676.0, + "step": 184620 + }, + { + "entropy": 1.8880833089351654, + "epoch": 0.5723368457925275, + "grad_norm": 9.013257026672363, + "learning_rate": 3.344013351668084e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8326297968626022, + "num_tokens": 222131253.0, + "step": 184630 + }, + { + "entropy": 1.8886737152934074, + "epoch": 0.5723678449175772, + "grad_norm": 7.31126594543457, + "learning_rate": 3.3439227949972125e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.863846381008625, + "num_tokens": 222142462.0, + "step": 184640 + }, + { + "entropy": 1.867803943157196, + "epoch": 0.5723988440426269, + "grad_norm": 7.314255714416504, + "learning_rate": 3.3438322456828306e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8646995663642884, + "num_tokens": 222155046.0, + "step": 184650 + }, + { + "entropy": 1.9728734582662582, + "epoch": 0.5724298431676766, + "grad_norm": 9.504714965820312, + "learning_rate": 3.3437417037239413e-06, + "loss": 0.5216, + "mean_token_accuracy": 0.8391286611557007, + "num_tokens": 222165883.0, + "step": 184660 + }, + { + "entropy": 1.8285054996609689, + "epoch": 0.5724608422927263, + "grad_norm": 8.178985595703125, + "learning_rate": 3.343651169119551e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8603216618299484, + "num_tokens": 222178296.0, + "step": 184670 + }, + { + "entropy": 1.8394104793667794, + "epoch": 0.572491841417776, + "grad_norm": 8.587333679199219, + "learning_rate": 3.3435606418686635e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8623070940375328, + "num_tokens": 222190553.0, + "step": 184680 + }, + { + "entropy": 1.9125571131706238, + "epoch": 0.5725228405428257, + "grad_norm": 9.115639686584473, + "learning_rate": 3.3434701219702815e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.8482768699526787, + "num_tokens": 222202320.0, + "step": 184690 + }, + { + "entropy": 1.8850971892476083, + "epoch": 0.5725538396678753, + "grad_norm": 3.7713778018951416, + "learning_rate": 3.3433796094234124e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8511196061968803, + "num_tokens": 222215003.0, + "step": 184700 + }, + { + "entropy": 1.9377149119973183, + "epoch": 0.5725848387929251, + "grad_norm": 8.725337028503418, + "learning_rate": 3.3432891042270587e-06, + "loss": 0.5203, + "mean_token_accuracy": 0.8383104220032692, + "num_tokens": 222226156.0, + "step": 184710 + }, + { + "entropy": 1.8472907304763795, + "epoch": 0.5726158379179748, + "grad_norm": 3.7417922019958496, + "learning_rate": 3.3431986063802274e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8651013299822807, + "num_tokens": 222238479.0, + "step": 184720 + }, + { + "entropy": 1.8834988474845886, + "epoch": 0.5726468370430244, + "grad_norm": 4.482487678527832, + "learning_rate": 3.343108115881923e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8457119733095169, + "num_tokens": 222250852.0, + "step": 184730 + }, + { + "entropy": 1.965399381518364, + "epoch": 0.5726778361680741, + "grad_norm": 8.264575004577637, + "learning_rate": 3.343017632731152e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8593659609556198, + "num_tokens": 222261325.0, + "step": 184740 + }, + { + "entropy": 1.8925912261009217, + "epoch": 0.5727088352931239, + "grad_norm": 8.465385437011719, + "learning_rate": 3.3429271569269196e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8567355901002884, + "num_tokens": 222273418.0, + "step": 184750 + }, + { + "entropy": 1.8055074244737626, + "epoch": 0.5727398344181736, + "grad_norm": 10.0112943649292, + "learning_rate": 3.342836688468231e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8707456439733505, + "num_tokens": 222286332.0, + "step": 184760 + }, + { + "entropy": 1.9379899948835373, + "epoch": 0.5727708335432232, + "grad_norm": 8.254408836364746, + "learning_rate": 3.3427462273540922e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.864084042608738, + "num_tokens": 222297376.0, + "step": 184770 + }, + { + "entropy": 1.8589140594005584, + "epoch": 0.5728018326682729, + "grad_norm": 4.008504390716553, + "learning_rate": 3.3426557735835114e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8606909677386284, + "num_tokens": 222309500.0, + "step": 184780 + }, + { + "entropy": 1.9276038601994514, + "epoch": 0.5728328317933227, + "grad_norm": 7.754671573638916, + "learning_rate": 3.342565327155493e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8512817919254303, + "num_tokens": 222321392.0, + "step": 184790 + }, + { + "entropy": 1.8575094789266586, + "epoch": 0.5728638309183723, + "grad_norm": 8.80823040008545, + "learning_rate": 3.3424748880690448e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8824114248156547, + "num_tokens": 222333134.0, + "step": 184800 + }, + { + "entropy": 1.9034967720508575, + "epoch": 0.572894830043422, + "grad_norm": 7.53140115737915, + "learning_rate": 3.3423844563231735e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8583787888288498, + "num_tokens": 222344834.0, + "step": 184810 + }, + { + "entropy": 1.9382282495498657, + "epoch": 0.5729258291684717, + "grad_norm": 7.528757572174072, + "learning_rate": 3.3422940319168854e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8557207301259041, + "num_tokens": 222355554.0, + "step": 184820 + }, + { + "entropy": 1.8476083979010582, + "epoch": 0.5729568282935215, + "grad_norm": 8.761383056640625, + "learning_rate": 3.342203614849189e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8453965499997139, + "num_tokens": 222368658.0, + "step": 184830 + }, + { + "entropy": 1.9420979261398315, + "epoch": 0.5729878274185711, + "grad_norm": 8.630688667297363, + "learning_rate": 3.342113205119091e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8516060650348664, + "num_tokens": 222379694.0, + "step": 184840 + }, + { + "entropy": 1.8539077758789062, + "epoch": 0.5730188265436208, + "grad_norm": 8.41552734375, + "learning_rate": 3.3420228027255983e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.85543172955513, + "num_tokens": 222391970.0, + "step": 184850 + }, + { + "entropy": 1.915343302488327, + "epoch": 0.5730498256686705, + "grad_norm": 4.425499439239502, + "learning_rate": 3.34193240766772e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8428995698690415, + "num_tokens": 222404607.0, + "step": 184860 + }, + { + "entropy": 1.873552420735359, + "epoch": 0.5730808247937202, + "grad_norm": 7.669476509094238, + "learning_rate": 3.341842019944464e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8621705710887909, + "num_tokens": 222416145.0, + "step": 184870 + }, + { + "entropy": 1.9667245000600815, + "epoch": 0.5731118239187699, + "grad_norm": 7.355974197387695, + "learning_rate": 3.341751639554837e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.8417428076267243, + "num_tokens": 222427915.0, + "step": 184880 + }, + { + "entropy": 1.8021482951939105, + "epoch": 0.5731428230438196, + "grad_norm": 10.828743934631348, + "learning_rate": 3.341661266497849e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8738041296601295, + "num_tokens": 222441840.0, + "step": 184890 + }, + { + "entropy": 1.8824231550097466, + "epoch": 0.5731738221688693, + "grad_norm": 8.7166109085083, + "learning_rate": 3.341570900772508e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8486659601330757, + "num_tokens": 222453572.0, + "step": 184900 + }, + { + "entropy": 1.8422776013612747, + "epoch": 0.573204821293919, + "grad_norm": 3.8054652214050293, + "learning_rate": 3.3414805423778225e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8694258004426956, + "num_tokens": 222466149.0, + "step": 184910 + }, + { + "entropy": 1.914320407807827, + "epoch": 0.5732358204189687, + "grad_norm": 7.1034040451049805, + "learning_rate": 3.3413901913128016e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8578106880187988, + "num_tokens": 222477685.0, + "step": 184920 + }, + { + "entropy": 1.801837073266506, + "epoch": 0.5732668195440184, + "grad_norm": 7.344631671905518, + "learning_rate": 3.3412998475764543e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8626899436116219, + "num_tokens": 222490789.0, + "step": 184930 + }, + { + "entropy": 1.8856204375624657, + "epoch": 0.573297818669068, + "grad_norm": 9.500457763671875, + "learning_rate": 3.3412095111677906e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8661552131175995, + "num_tokens": 222503018.0, + "step": 184940 + }, + { + "entropy": 1.9374446853995324, + "epoch": 0.5733288177941177, + "grad_norm": 7.306221961975098, + "learning_rate": 3.341119182085818e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8488444343209267, + "num_tokens": 222514264.0, + "step": 184950 + }, + { + "entropy": 1.964336033165455, + "epoch": 0.5733598169191675, + "grad_norm": 7.5833635330200195, + "learning_rate": 3.3410288603295487e-06, + "loss": 0.5151, + "mean_token_accuracy": 0.838313241302967, + "num_tokens": 222524996.0, + "step": 184960 + }, + { + "entropy": 1.879715594649315, + "epoch": 0.5733908160442172, + "grad_norm": 4.586198329925537, + "learning_rate": 3.340938545897991e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8588324815034867, + "num_tokens": 222537104.0, + "step": 184970 + }, + { + "entropy": 1.8826902776956558, + "epoch": 0.5734218151692668, + "grad_norm": 2.1931099891662598, + "learning_rate": 3.3408482387901552e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8416186437010765, + "num_tokens": 222550488.0, + "step": 184980 + }, + { + "entropy": 1.7597140736877919, + "epoch": 0.5734528142943165, + "grad_norm": 7.864374160766602, + "learning_rate": 3.340757939005052e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8579392388463021, + "num_tokens": 222564524.0, + "step": 184990 + }, + { + "entropy": 1.8594155356287956, + "epoch": 0.5734838134193663, + "grad_norm": 8.28993034362793, + "learning_rate": 3.340667646541692e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.855444373190403, + "num_tokens": 222576994.0, + "step": 185000 + }, + { + "entropy": 1.9743560194969176, + "epoch": 0.5735148125444159, + "grad_norm": 8.733586311340332, + "learning_rate": 3.3405773613990844e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8568108826875687, + "num_tokens": 222588182.0, + "step": 185010 + }, + { + "entropy": 1.9266073897480964, + "epoch": 0.5735458116694656, + "grad_norm": 8.534564018249512, + "learning_rate": 3.3404870835762415e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8611538261175156, + "num_tokens": 222599656.0, + "step": 185020 + }, + { + "entropy": 1.919064535200596, + "epoch": 0.5735768107945153, + "grad_norm": 8.062556266784668, + "learning_rate": 3.340396813072173e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8658887654542923, + "num_tokens": 222611088.0, + "step": 185030 + }, + { + "entropy": 1.8669011473655701, + "epoch": 0.5736078099195651, + "grad_norm": 3.368703603744507, + "learning_rate": 3.3403065498858922e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8619739264249802, + "num_tokens": 222623519.0, + "step": 185040 + }, + { + "entropy": 1.9168578773736953, + "epoch": 0.5736388090446147, + "grad_norm": 8.530938148498535, + "learning_rate": 3.3402162940164077e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8555745720863343, + "num_tokens": 222635620.0, + "step": 185050 + }, + { + "entropy": 1.8974201664328576, + "epoch": 0.5736698081696644, + "grad_norm": 8.14111328125, + "learning_rate": 3.3401260454627328e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8681931644678116, + "num_tokens": 222647379.0, + "step": 185060 + }, + { + "entropy": 1.8431912809610367, + "epoch": 0.5737008072947141, + "grad_norm": 5.964134693145752, + "learning_rate": 3.340035804223879e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8663034707307815, + "num_tokens": 222660763.0, + "step": 185070 + }, + { + "entropy": 1.9197598233819009, + "epoch": 0.5737318064197638, + "grad_norm": 9.90364933013916, + "learning_rate": 3.339945570298858e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8376195728778839, + "num_tokens": 222673262.0, + "step": 185080 + }, + { + "entropy": 1.8817140839993953, + "epoch": 0.5737628055448135, + "grad_norm": 3.678171396255493, + "learning_rate": 3.3398553436866814e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8645113825798034, + "num_tokens": 222685683.0, + "step": 185090 + }, + { + "entropy": 1.97952641248703, + "epoch": 0.5737938046698632, + "grad_norm": 8.106505393981934, + "learning_rate": 3.3397651243863634e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8467271432280541, + "num_tokens": 222696840.0, + "step": 185100 + }, + { + "entropy": 1.8992469534277916, + "epoch": 0.5738248037949129, + "grad_norm": 6.165231227874756, + "learning_rate": 3.339674912396914e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8624764665961265, + "num_tokens": 222709302.0, + "step": 185110 + }, + { + "entropy": 2.011495552957058, + "epoch": 0.5738558029199626, + "grad_norm": 8.762554168701172, + "learning_rate": 3.3395847077173466e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8507163986563683, + "num_tokens": 222720793.0, + "step": 185120 + }, + { + "entropy": 1.9778069868683814, + "epoch": 0.5738868020450123, + "grad_norm": 9.670307159423828, + "learning_rate": 3.339494510346675e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.8424128621816636, + "num_tokens": 222732158.0, + "step": 185130 + }, + { + "entropy": 1.9170079410076142, + "epoch": 0.573917801170062, + "grad_norm": 9.013583183288574, + "learning_rate": 3.339404320283912e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8663454532623291, + "num_tokens": 222743467.0, + "step": 185140 + }, + { + "entropy": 1.8615730196237563, + "epoch": 0.5739488002951116, + "grad_norm": 4.1062517166137695, + "learning_rate": 3.3393141375280703e-06, + "loss": 0.413, + "mean_token_accuracy": 0.856800027191639, + "num_tokens": 222756022.0, + "step": 185150 + }, + { + "entropy": 1.7988073825836182, + "epoch": 0.5739797994201614, + "grad_norm": 7.607936382293701, + "learning_rate": 3.3392239620781634e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8725836887955666, + "num_tokens": 222769768.0, + "step": 185160 + }, + { + "entropy": 1.8891526952385902, + "epoch": 0.5740107985452111, + "grad_norm": 4.363334655761719, + "learning_rate": 3.3391337939332046e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8500419408082962, + "num_tokens": 222782578.0, + "step": 185170 + }, + { + "entropy": 1.8543832316994666, + "epoch": 0.5740417976702608, + "grad_norm": 7.629032135009766, + "learning_rate": 3.3390436330922088e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8651989296078682, + "num_tokens": 222795069.0, + "step": 185180 + }, + { + "entropy": 1.951645615696907, + "epoch": 0.5740727967953104, + "grad_norm": 9.708003997802734, + "learning_rate": 3.3389534795541887e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8511202931404114, + "num_tokens": 222807285.0, + "step": 185190 + }, + { + "entropy": 1.9666521787643432, + "epoch": 0.5741037959203601, + "grad_norm": 8.467106819152832, + "learning_rate": 3.3388633333181598e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8552713766694069, + "num_tokens": 222818841.0, + "step": 185200 + }, + { + "entropy": 1.889816901087761, + "epoch": 0.5741347950454099, + "grad_norm": 6.984008312225342, + "learning_rate": 3.338773194383135e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8665608659386634, + "num_tokens": 222830644.0, + "step": 185210 + }, + { + "entropy": 1.81104983240366, + "epoch": 0.5741657941704595, + "grad_norm": 2.781053066253662, + "learning_rate": 3.3386830627481296e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8583825513720512, + "num_tokens": 222844372.0, + "step": 185220 + }, + { + "entropy": 1.7910630270838737, + "epoch": 0.5741967932955092, + "grad_norm": 8.483392715454102, + "learning_rate": 3.3385929384121583e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8679147735238075, + "num_tokens": 222858299.0, + "step": 185230 + }, + { + "entropy": 1.8780374467372893, + "epoch": 0.5742277924205589, + "grad_norm": 8.997187614440918, + "learning_rate": 3.338502821374236e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8558851927518845, + "num_tokens": 222870451.0, + "step": 185240 + }, + { + "entropy": 1.9205268666148185, + "epoch": 0.5742587915456087, + "grad_norm": 8.913628578186035, + "learning_rate": 3.3384127116333775e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8517489165067673, + "num_tokens": 222881620.0, + "step": 185250 + }, + { + "entropy": 1.8900404646992683, + "epoch": 0.5742897906706583, + "grad_norm": 7.601342678070068, + "learning_rate": 3.338322609188599e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8635605916380882, + "num_tokens": 222893543.0, + "step": 185260 + }, + { + "entropy": 1.8780425779521466, + "epoch": 0.574320789795708, + "grad_norm": 7.883829116821289, + "learning_rate": 3.3382325140389145e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.866215144097805, + "num_tokens": 222905289.0, + "step": 185270 + }, + { + "entropy": 1.8748355612158776, + "epoch": 0.5743517889207577, + "grad_norm": 7.552894115447998, + "learning_rate": 3.3381424261833405e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.860708336532116, + "num_tokens": 222917674.0, + "step": 185280 + }, + { + "entropy": 1.968424329161644, + "epoch": 0.5743827880458074, + "grad_norm": 8.948657035827637, + "learning_rate": 3.338052345620893e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.848981560766697, + "num_tokens": 222928114.0, + "step": 185290 + }, + { + "entropy": 1.8713356271386146, + "epoch": 0.5744137871708571, + "grad_norm": 3.9206175804138184, + "learning_rate": 3.3379622723505878e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8630928680300712, + "num_tokens": 222940066.0, + "step": 185300 + }, + { + "entropy": 1.83049533367157, + "epoch": 0.5744447862959068, + "grad_norm": 11.425100326538086, + "learning_rate": 3.337872206371441e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8608759924769401, + "num_tokens": 222953334.0, + "step": 185310 + }, + { + "entropy": 1.9473757684230804, + "epoch": 0.5744757854209565, + "grad_norm": 8.36613655090332, + "learning_rate": 3.3377821476824686e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8547708854079247, + "num_tokens": 222964268.0, + "step": 185320 + }, + { + "entropy": 1.8634528711438179, + "epoch": 0.5745067845460062, + "grad_norm": 4.429279804229736, + "learning_rate": 3.337692096282688e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8642795279622077, + "num_tokens": 222976959.0, + "step": 185330 + }, + { + "entropy": 1.9195874109864235, + "epoch": 0.5745377836710559, + "grad_norm": 8.691102981567383, + "learning_rate": 3.3376020521711158e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.848965446650982, + "num_tokens": 222988480.0, + "step": 185340 + }, + { + "entropy": 1.8805852934718132, + "epoch": 0.5745687827961056, + "grad_norm": 8.096450805664062, + "learning_rate": 3.3375120153467684e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8595820307731629, + "num_tokens": 223000727.0, + "step": 185350 + }, + { + "entropy": 1.9091674134135246, + "epoch": 0.5745997819211552, + "grad_norm": 9.245760917663574, + "learning_rate": 3.3374219858086632e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.85263032913208, + "num_tokens": 223012381.0, + "step": 185360 + }, + { + "entropy": 1.9092094480991364, + "epoch": 0.574630781046205, + "grad_norm": 7.5631585121154785, + "learning_rate": 3.3373319635558177e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8590279519557953, + "num_tokens": 223023553.0, + "step": 185370 + }, + { + "entropy": 1.9298975244164467, + "epoch": 0.5746617801712547, + "grad_norm": 8.844869613647461, + "learning_rate": 3.3372419485872493e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8552899554371833, + "num_tokens": 223034518.0, + "step": 185380 + }, + { + "entropy": 1.9004199922084808, + "epoch": 0.5746927792963044, + "grad_norm": 8.642338752746582, + "learning_rate": 3.337151940901976e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8602864474058152, + "num_tokens": 223046017.0, + "step": 185390 + }, + { + "entropy": 1.9718008667230607, + "epoch": 0.574723778421354, + "grad_norm": 8.08690071105957, + "learning_rate": 3.3370619404990144e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8506859511137008, + "num_tokens": 223056921.0, + "step": 185400 + }, + { + "entropy": 1.862248282134533, + "epoch": 0.5747547775464038, + "grad_norm": 7.678162574768066, + "learning_rate": 3.336971947377385e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8505977019667625, + "num_tokens": 223070656.0, + "step": 185410 + }, + { + "entropy": 1.9419626086950301, + "epoch": 0.5747857766714535, + "grad_norm": 8.565497398376465, + "learning_rate": 3.336881961536103e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.84831403195858, + "num_tokens": 223082072.0, + "step": 185420 + }, + { + "entropy": 1.94987653195858, + "epoch": 0.5748167757965031, + "grad_norm": 8.058905601501465, + "learning_rate": 3.3367919829741894e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8444725587964058, + "num_tokens": 223094099.0, + "step": 185430 + }, + { + "entropy": 1.952441319823265, + "epoch": 0.5748477749215528, + "grad_norm": 9.241267204284668, + "learning_rate": 3.3367020116906613e-06, + "loss": 0.5223, + "mean_token_accuracy": 0.8438718289136886, + "num_tokens": 223105424.0, + "step": 185440 + }, + { + "entropy": 1.908140140771866, + "epoch": 0.5748787740466025, + "grad_norm": 3.946951150894165, + "learning_rate": 3.3366120476845383e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8631471082568168, + "num_tokens": 223117241.0, + "step": 185450 + }, + { + "entropy": 1.880707675218582, + "epoch": 0.5749097731716523, + "grad_norm": 3.73358154296875, + "learning_rate": 3.33652209095484e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.848235011100769, + "num_tokens": 223130225.0, + "step": 185460 + }, + { + "entropy": 1.9595543146133423, + "epoch": 0.5749407722967019, + "grad_norm": 8.203792572021484, + "learning_rate": 3.3364321415005833e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8585345134139061, + "num_tokens": 223140686.0, + "step": 185470 + }, + { + "entropy": 1.938676691055298, + "epoch": 0.5749717714217516, + "grad_norm": 8.655381202697754, + "learning_rate": 3.3363421993207896e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8510200262069703, + "num_tokens": 223152032.0, + "step": 185480 + }, + { + "entropy": 1.879559238255024, + "epoch": 0.5750027705468013, + "grad_norm": 4.304847240447998, + "learning_rate": 3.336252264414478e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8723119482398033, + "num_tokens": 223163453.0, + "step": 185490 + }, + { + "entropy": 1.8480073928833007, + "epoch": 0.575033769671851, + "grad_norm": 7.835402011871338, + "learning_rate": 3.336162336780667e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8501526653766632, + "num_tokens": 223176639.0, + "step": 185500 + }, + { + "entropy": 1.928117537498474, + "epoch": 0.5750647687969007, + "grad_norm": 9.117416381835938, + "learning_rate": 3.3360724164183784e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8559558629989624, + "num_tokens": 223188517.0, + "step": 185510 + }, + { + "entropy": 1.8703112483024598, + "epoch": 0.5750957679219504, + "grad_norm": 8.261551856994629, + "learning_rate": 3.3359825033266312e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8563395589590073, + "num_tokens": 223200740.0, + "step": 185520 + }, + { + "entropy": 1.9589730098843574, + "epoch": 0.575126767047, + "grad_norm": 8.426461219787598, + "learning_rate": 3.3358925975044452e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8574065804481507, + "num_tokens": 223211860.0, + "step": 185530 + }, + { + "entropy": 1.8471322655677795, + "epoch": 0.5751577661720498, + "grad_norm": 7.796889305114746, + "learning_rate": 3.335802698950843e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8555324643850326, + "num_tokens": 223224385.0, + "step": 185540 + }, + { + "entropy": 1.862960086762905, + "epoch": 0.5751887652970995, + "grad_norm": 8.271576881408691, + "learning_rate": 3.335712807664843e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8653026908636093, + "num_tokens": 223236896.0, + "step": 185550 + }, + { + "entropy": 1.9048970609903335, + "epoch": 0.5752197644221492, + "grad_norm": 3.4314792156219482, + "learning_rate": 3.335622923645467e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8632827535271644, + "num_tokens": 223248127.0, + "step": 185560 + }, + { + "entropy": 1.868567879498005, + "epoch": 0.5752507635471988, + "grad_norm": 3.734616279602051, + "learning_rate": 3.335533046891736e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8556340306997299, + "num_tokens": 223260369.0, + "step": 185570 + }, + { + "entropy": 1.8998666673898696, + "epoch": 0.5752817626722486, + "grad_norm": 7.988044261932373, + "learning_rate": 3.3354431774026707e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8611581310629844, + "num_tokens": 223272457.0, + "step": 185580 + }, + { + "entropy": 1.898111554980278, + "epoch": 0.5753127617972983, + "grad_norm": 7.35044527053833, + "learning_rate": 3.3353533151772933e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8583281725645066, + "num_tokens": 223284354.0, + "step": 185590 + }, + { + "entropy": 1.9633333265781403, + "epoch": 0.575343760922348, + "grad_norm": 9.622851371765137, + "learning_rate": 3.3352634602146243e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8559970110654831, + "num_tokens": 223295453.0, + "step": 185600 + }, + { + "entropy": 1.9035291761159896, + "epoch": 0.5753747600473976, + "grad_norm": 7.453666687011719, + "learning_rate": 3.3351736125136876e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8548563599586487, + "num_tokens": 223307635.0, + "step": 185610 + }, + { + "entropy": 1.9105853006243705, + "epoch": 0.5754057591724474, + "grad_norm": 7.252157688140869, + "learning_rate": 3.3350837720735026e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8604837030172348, + "num_tokens": 223319421.0, + "step": 185620 + }, + { + "entropy": 1.9059565871953965, + "epoch": 0.5754367582974971, + "grad_norm": 9.886700630187988, + "learning_rate": 3.3349939388930917e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8626177817583084, + "num_tokens": 223331495.0, + "step": 185630 + }, + { + "entropy": 1.8059578329324721, + "epoch": 0.5754677574225467, + "grad_norm": 7.218722343444824, + "learning_rate": 3.3349041129714787e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8683707162737846, + "num_tokens": 223344530.0, + "step": 185640 + }, + { + "entropy": 1.9087028831243515, + "epoch": 0.5754987565475964, + "grad_norm": 8.074930191040039, + "learning_rate": 3.334814294307686e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8641566768288612, + "num_tokens": 223355905.0, + "step": 185650 + }, + { + "entropy": 1.9034427180886269, + "epoch": 0.5755297556726462, + "grad_norm": 4.149694919586182, + "learning_rate": 3.3347244829007354e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8565534576773643, + "num_tokens": 223367855.0, + "step": 185660 + }, + { + "entropy": 1.843117931485176, + "epoch": 0.5755607547976959, + "grad_norm": 8.624125480651855, + "learning_rate": 3.3346346787496496e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8669708013534546, + "num_tokens": 223379487.0, + "step": 185670 + }, + { + "entropy": 1.9281734496355056, + "epoch": 0.5755917539227455, + "grad_norm": 7.270875453948975, + "learning_rate": 3.334544881853452e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8511006891727447, + "num_tokens": 223390086.0, + "step": 185680 + }, + { + "entropy": 1.8094001136720181, + "epoch": 0.5756227530477952, + "grad_norm": 10.744769096374512, + "learning_rate": 3.3344550922111663e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8723492413759232, + "num_tokens": 223403380.0, + "step": 185690 + }, + { + "entropy": 1.7403853356838226, + "epoch": 0.5756537521728449, + "grad_norm": 3.8312265872955322, + "learning_rate": 3.3343653098218156e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8727297469973564, + "num_tokens": 223416987.0, + "step": 185700 + }, + { + "entropy": 1.9369304656982422, + "epoch": 0.5756847512978946, + "grad_norm": 8.717244148254395, + "learning_rate": 3.334275534684423e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.85114157050848, + "num_tokens": 223428740.0, + "step": 185710 + }, + { + "entropy": 1.8861325442790986, + "epoch": 0.5757157504229443, + "grad_norm": 3.7332777976989746, + "learning_rate": 3.334185766798014e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8636431515216827, + "num_tokens": 223441017.0, + "step": 185720 + }, + { + "entropy": 1.9709544464945794, + "epoch": 0.575746749547994, + "grad_norm": 9.138751983642578, + "learning_rate": 3.33409600616161e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8416403874754905, + "num_tokens": 223452264.0, + "step": 185730 + }, + { + "entropy": 1.884700782597065, + "epoch": 0.5757777486730437, + "grad_norm": 7.390668869018555, + "learning_rate": 3.334006252774237e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.855260145664215, + "num_tokens": 223464737.0, + "step": 185740 + }, + { + "entropy": 1.8476317539811133, + "epoch": 0.5758087477980934, + "grad_norm": 7.85823917388916, + "learning_rate": 3.3339165066349184e-06, + "loss": 0.441, + "mean_token_accuracy": 0.858846141397953, + "num_tokens": 223477825.0, + "step": 185750 + }, + { + "entropy": 1.84466263204813, + "epoch": 0.5758397469231431, + "grad_norm": 3.960548162460327, + "learning_rate": 3.333826767742679e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8526906743645668, + "num_tokens": 223490606.0, + "step": 185760 + }, + { + "entropy": 1.8592259749770164, + "epoch": 0.5758707460481928, + "grad_norm": 4.318323612213135, + "learning_rate": 3.3337370360965444e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8533689677715302, + "num_tokens": 223503065.0, + "step": 185770 + }, + { + "entropy": 1.8432864606380464, + "epoch": 0.5759017451732424, + "grad_norm": 8.023193359375, + "learning_rate": 3.333647311695538e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8605846777558327, + "num_tokens": 223515498.0, + "step": 185780 + }, + { + "entropy": 1.8691019058227538, + "epoch": 0.5759327442982922, + "grad_norm": 4.231311321258545, + "learning_rate": 3.333557594538686e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.85765770226717, + "num_tokens": 223528474.0, + "step": 185790 + }, + { + "entropy": 1.9564666375517845, + "epoch": 0.5759637434233419, + "grad_norm": 8.478957176208496, + "learning_rate": 3.3334678846250137e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8489108413457871, + "num_tokens": 223539675.0, + "step": 185800 + }, + { + "entropy": 1.8829040467739104, + "epoch": 0.5759947425483916, + "grad_norm": 7.5195841789245605, + "learning_rate": 3.333378181953545e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8718097507953644, + "num_tokens": 223551556.0, + "step": 185810 + }, + { + "entropy": 1.816511270403862, + "epoch": 0.5760257416734412, + "grad_norm": 9.738638877868652, + "learning_rate": 3.333288486523308e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8638149693608284, + "num_tokens": 223564923.0, + "step": 185820 + }, + { + "entropy": 1.911141762137413, + "epoch": 0.576056740798491, + "grad_norm": 8.631954193115234, + "learning_rate": 3.333198798333326e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8467279374599457, + "num_tokens": 223576640.0, + "step": 185830 + }, + { + "entropy": 1.8985722064971924, + "epoch": 0.5760877399235407, + "grad_norm": 14.50521469116211, + "learning_rate": 3.3331091173826262e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8531594723463058, + "num_tokens": 223588249.0, + "step": 185840 + }, + { + "entropy": 1.9046927765011787, + "epoch": 0.5761187390485903, + "grad_norm": 7.594962120056152, + "learning_rate": 3.333019443670235e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8494858413934707, + "num_tokens": 223599798.0, + "step": 185850 + }, + { + "entropy": 1.8822232261300087, + "epoch": 0.57614973817364, + "grad_norm": 7.868729114532471, + "learning_rate": 3.332929777195178e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8659264981746674, + "num_tokens": 223611745.0, + "step": 185860 + }, + { + "entropy": 1.902225561439991, + "epoch": 0.5761807372986898, + "grad_norm": 9.798776626586914, + "learning_rate": 3.332840117956483e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8460187584161758, + "num_tokens": 223623156.0, + "step": 185870 + }, + { + "entropy": 1.8417491719126702, + "epoch": 0.5762117364237395, + "grad_norm": 9.396997451782227, + "learning_rate": 3.332750465953175e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8622669294476509, + "num_tokens": 223636123.0, + "step": 185880 + }, + { + "entropy": 1.881843839585781, + "epoch": 0.5762427355487891, + "grad_norm": 3.9266364574432373, + "learning_rate": 3.3326608211842826e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8624861449003219, + "num_tokens": 223647753.0, + "step": 185890 + }, + { + "entropy": 1.8574804991483689, + "epoch": 0.5762737346738388, + "grad_norm": 8.535648345947266, + "learning_rate": 3.3325711836488317e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8512572422623634, + "num_tokens": 223660418.0, + "step": 185900 + }, + { + "entropy": 1.9249091818928719, + "epoch": 0.5763047337988886, + "grad_norm": 8.348278999328613, + "learning_rate": 3.33248155334585e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8501077264547348, + "num_tokens": 223672031.0, + "step": 185910 + }, + { + "entropy": 1.9589193403720855, + "epoch": 0.5763357329239382, + "grad_norm": 6.652914524078369, + "learning_rate": 3.332391930274365e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8530449777841568, + "num_tokens": 223682697.0, + "step": 185920 + }, + { + "entropy": 1.9307951003313064, + "epoch": 0.5763667320489879, + "grad_norm": 9.178523063659668, + "learning_rate": 3.3323023144334043e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8498701199889183, + "num_tokens": 223693854.0, + "step": 185930 + }, + { + "entropy": 1.8847261801362039, + "epoch": 0.5763977311740376, + "grad_norm": 6.794214248657227, + "learning_rate": 3.3322127058219962e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8585624843835831, + "num_tokens": 223705949.0, + "step": 185940 + }, + { + "entropy": 1.8827034577727317, + "epoch": 0.5764287302990873, + "grad_norm": 8.527931213378906, + "learning_rate": 3.3321231044391673e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8589151486754417, + "num_tokens": 223717648.0, + "step": 185950 + }, + { + "entropy": 1.9328924477100373, + "epoch": 0.576459729424137, + "grad_norm": 7.280888080596924, + "learning_rate": 3.332033510283947e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8496242374181747, + "num_tokens": 223728682.0, + "step": 185960 + }, + { + "entropy": 1.899811689555645, + "epoch": 0.5764907285491867, + "grad_norm": 8.402411460876465, + "learning_rate": 3.331943923355363e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8634314224123955, + "num_tokens": 223740221.0, + "step": 185970 + }, + { + "entropy": 1.8626563012599946, + "epoch": 0.5765217276742364, + "grad_norm": 7.133705139160156, + "learning_rate": 3.3318543436524452e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8618907809257508, + "num_tokens": 223752470.0, + "step": 185980 + }, + { + "entropy": 1.8463318169116973, + "epoch": 0.576552726799286, + "grad_norm": 8.737984657287598, + "learning_rate": 3.3317647711742214e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8712903305888176, + "num_tokens": 223764746.0, + "step": 185990 + }, + { + "entropy": 1.8844687968492508, + "epoch": 0.5765837259243358, + "grad_norm": 7.062756538391113, + "learning_rate": 3.3316752059197193e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8577682986855507, + "num_tokens": 223776938.0, + "step": 186000 + }, + { + "entropy": 1.9419248923659325, + "epoch": 0.5766147250493855, + "grad_norm": 8.24322509765625, + "learning_rate": 3.3315856478879698e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8601583212614059, + "num_tokens": 223788522.0, + "step": 186010 + }, + { + "entropy": 1.9109676256775856, + "epoch": 0.5766457241744352, + "grad_norm": 8.720685005187988, + "learning_rate": 3.331496097078002e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8536943554878235, + "num_tokens": 223799940.0, + "step": 186020 + }, + { + "entropy": 1.9459070816636086, + "epoch": 0.5766767232994848, + "grad_norm": 8.089960098266602, + "learning_rate": 3.3314065534888447e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8517192706465722, + "num_tokens": 223811983.0, + "step": 186030 + }, + { + "entropy": 1.9221731379628182, + "epoch": 0.5767077224245346, + "grad_norm": 8.933829307556152, + "learning_rate": 3.3313170171195273e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8519295886158943, + "num_tokens": 223823283.0, + "step": 186040 + }, + { + "entropy": 1.9014252200722694, + "epoch": 0.5767387215495843, + "grad_norm": 8.663115501403809, + "learning_rate": 3.3312274879690805e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8544277712702751, + "num_tokens": 223834688.0, + "step": 186050 + }, + { + "entropy": 1.9084532499313354, + "epoch": 0.576769720674634, + "grad_norm": 7.686354160308838, + "learning_rate": 3.331137966036534e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8504864275455475, + "num_tokens": 223846506.0, + "step": 186060 + }, + { + "entropy": 1.9267695754766465, + "epoch": 0.5768007197996836, + "grad_norm": 8.076788902282715, + "learning_rate": 3.3310484513209175e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.844456459581852, + "num_tokens": 223858169.0, + "step": 186070 + }, + { + "entropy": 2.0005256950855257, + "epoch": 0.5768317189247334, + "grad_norm": 7.602105617523193, + "learning_rate": 3.3309589438212626e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8492702186107636, + "num_tokens": 223868879.0, + "step": 186080 + }, + { + "entropy": 1.805472445487976, + "epoch": 0.5768627180497831, + "grad_norm": 7.9939284324646, + "learning_rate": 3.3308694435365983e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.867642617225647, + "num_tokens": 223881970.0, + "step": 186090 + }, + { + "entropy": 1.8925477690994739, + "epoch": 0.5768937171748327, + "grad_norm": 4.912389755249023, + "learning_rate": 3.3307799504659565e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8653697654604912, + "num_tokens": 223894071.0, + "step": 186100 + }, + { + "entropy": 1.879456302523613, + "epoch": 0.5769247162998824, + "grad_norm": 9.695762634277344, + "learning_rate": 3.330690464608368e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8551273748278618, + "num_tokens": 223906457.0, + "step": 186110 + }, + { + "entropy": 1.8417584151029587, + "epoch": 0.5769557154249322, + "grad_norm": 3.9739835262298584, + "learning_rate": 3.3306009859628634e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8579704150557518, + "num_tokens": 223918979.0, + "step": 186120 + }, + { + "entropy": 1.846211352944374, + "epoch": 0.5769867145499818, + "grad_norm": 3.5446488857269287, + "learning_rate": 3.3305115145284746e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8604969620704651, + "num_tokens": 223932341.0, + "step": 186130 + }, + { + "entropy": 1.8327767252922058, + "epoch": 0.5770177136750315, + "grad_norm": 8.83708667755127, + "learning_rate": 3.3304220503042323e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8605792403221131, + "num_tokens": 223945064.0, + "step": 186140 + }, + { + "entropy": 1.9210361555218696, + "epoch": 0.5770487128000812, + "grad_norm": 7.3829803466796875, + "learning_rate": 3.3303325932891682e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8643152773380279, + "num_tokens": 223956532.0, + "step": 186150 + }, + { + "entropy": 1.9436809882521628, + "epoch": 0.577079711925131, + "grad_norm": 11.1614990234375, + "learning_rate": 3.330243143482315e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8519028946757317, + "num_tokens": 223967817.0, + "step": 186160 + }, + { + "entropy": 1.8733919098973275, + "epoch": 0.5771107110501806, + "grad_norm": 8.432992935180664, + "learning_rate": 3.330153700882705e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8607887044548989, + "num_tokens": 223980144.0, + "step": 186170 + }, + { + "entropy": 1.9079894140362739, + "epoch": 0.5771417101752303, + "grad_norm": 9.000441551208496, + "learning_rate": 3.3300642654893694e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8596588268876075, + "num_tokens": 223991651.0, + "step": 186180 + }, + { + "entropy": 1.804750144481659, + "epoch": 0.57717270930028, + "grad_norm": 3.7925076484680176, + "learning_rate": 3.32997483730134e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8558259546756745, + "num_tokens": 224005369.0, + "step": 186190 + }, + { + "entropy": 1.844835962355137, + "epoch": 0.5772037084253296, + "grad_norm": 4.014327049255371, + "learning_rate": 3.329885416317651e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8642392948269844, + "num_tokens": 224017679.0, + "step": 186200 + }, + { + "entropy": 1.893115258216858, + "epoch": 0.5772347075503794, + "grad_norm": 6.389863014221191, + "learning_rate": 3.329796002537334e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8519633457064628, + "num_tokens": 224030018.0, + "step": 186210 + }, + { + "entropy": 1.8942801833152771, + "epoch": 0.5772657066754291, + "grad_norm": 3.37688946723938, + "learning_rate": 3.3297065959594223e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8598490327596664, + "num_tokens": 224041265.0, + "step": 186220 + }, + { + "entropy": 1.9119876235723496, + "epoch": 0.5772967058004788, + "grad_norm": 8.3579740524292, + "learning_rate": 3.329617196582949e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8617330178618431, + "num_tokens": 224052454.0, + "step": 186230 + }, + { + "entropy": 1.8348846569657327, + "epoch": 0.5773277049255284, + "grad_norm": 9.87274169921875, + "learning_rate": 3.3295278044069474e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8611876145005226, + "num_tokens": 224064686.0, + "step": 186240 + }, + { + "entropy": 1.8601296663284301, + "epoch": 0.5773587040505782, + "grad_norm": 4.845494270324707, + "learning_rate": 3.3294384194304515e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8686206489801407, + "num_tokens": 224077062.0, + "step": 186250 + }, + { + "entropy": 1.7862082317471504, + "epoch": 0.5773897031756279, + "grad_norm": 3.842564105987549, + "learning_rate": 3.329349041652493e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8551653146743774, + "num_tokens": 224090360.0, + "step": 186260 + }, + { + "entropy": 1.9282741218805313, + "epoch": 0.5774207023006775, + "grad_norm": 9.02146053314209, + "learning_rate": 3.3292596710721083e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8523161500692368, + "num_tokens": 224101225.0, + "step": 186270 + }, + { + "entropy": 1.8909048795700074, + "epoch": 0.5774517014257272, + "grad_norm": 9.213672637939453, + "learning_rate": 3.3291703076883297e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8545626923441887, + "num_tokens": 224112311.0, + "step": 186280 + }, + { + "entropy": 1.8705749198794366, + "epoch": 0.577482700550777, + "grad_norm": 9.169853210449219, + "learning_rate": 3.3290809515001925e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8574076175689698, + "num_tokens": 224124770.0, + "step": 186290 + }, + { + "entropy": 1.8199501633644104, + "epoch": 0.5775136996758267, + "grad_norm": 2.57895565032959, + "learning_rate": 3.3289916025067307e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8515478879213333, + "num_tokens": 224137410.0, + "step": 186300 + }, + { + "entropy": 1.8437852144241333, + "epoch": 0.5775446988008763, + "grad_norm": 9.475756645202637, + "learning_rate": 3.3289022607069788e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8646301284432412, + "num_tokens": 224149672.0, + "step": 186310 + }, + { + "entropy": 1.8889194279909134, + "epoch": 0.577575697925926, + "grad_norm": 8.501579284667969, + "learning_rate": 3.3288129260999714e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8534078598022461, + "num_tokens": 224160181.0, + "step": 186320 + }, + { + "entropy": 1.8528265476226806, + "epoch": 0.5776066970509758, + "grad_norm": 9.43176555633545, + "learning_rate": 3.3287235986847425e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8597616523504257, + "num_tokens": 224172466.0, + "step": 186330 + }, + { + "entropy": 1.8917500972747803, + "epoch": 0.5776376961760255, + "grad_norm": 6.551401615142822, + "learning_rate": 3.32863427846033e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8578035309910774, + "num_tokens": 224184378.0, + "step": 186340 + }, + { + "entropy": 1.9086822256445886, + "epoch": 0.5776686953010751, + "grad_norm": 9.394867897033691, + "learning_rate": 3.3285449654257657e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8590981259942054, + "num_tokens": 224195741.0, + "step": 186350 + }, + { + "entropy": 1.813156895339489, + "epoch": 0.5776996944261248, + "grad_norm": 9.021100044250488, + "learning_rate": 3.3284556595800877e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8551269844174385, + "num_tokens": 224208130.0, + "step": 186360 + }, + { + "entropy": 1.9111459612846375, + "epoch": 0.5777306935511746, + "grad_norm": 7.168278217315674, + "learning_rate": 3.3283663609223305e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8628562480211258, + "num_tokens": 224219268.0, + "step": 186370 + }, + { + "entropy": 1.88259015083313, + "epoch": 0.5777616926762242, + "grad_norm": 7.674609661102295, + "learning_rate": 3.3282770694515305e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8598455607891082, + "num_tokens": 224231165.0, + "step": 186380 + }, + { + "entropy": 1.9493561804294586, + "epoch": 0.5777926918012739, + "grad_norm": 8.67314338684082, + "learning_rate": 3.3281877851667234e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8493429899215699, + "num_tokens": 224242109.0, + "step": 186390 + }, + { + "entropy": 1.8610494658350945, + "epoch": 0.5778236909263236, + "grad_norm": 8.769448280334473, + "learning_rate": 3.328098508066945e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8556885406374931, + "num_tokens": 224254564.0, + "step": 186400 + }, + { + "entropy": 1.8939135536551475, + "epoch": 0.5778546900513734, + "grad_norm": 8.891929626464844, + "learning_rate": 3.328009238151232e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8511065915226936, + "num_tokens": 224266367.0, + "step": 186410 + }, + { + "entropy": 1.8918179273605347, + "epoch": 0.577885689176423, + "grad_norm": 7.109013557434082, + "learning_rate": 3.3279199754186207e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8539350911974907, + "num_tokens": 224277604.0, + "step": 186420 + }, + { + "entropy": 1.8385544650256633, + "epoch": 0.5779166883014727, + "grad_norm": 3.675187110900879, + "learning_rate": 3.3278307198681493e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8628764376044273, + "num_tokens": 224290724.0, + "step": 186430 + }, + { + "entropy": 1.8970770210027694, + "epoch": 0.5779476874265224, + "grad_norm": 7.674618721008301, + "learning_rate": 3.3277414714988525e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8650184333324432, + "num_tokens": 224302151.0, + "step": 186440 + }, + { + "entropy": 1.7818903237581254, + "epoch": 0.577978686551572, + "grad_norm": 8.283598899841309, + "learning_rate": 3.3276522303097693e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8627783268690109, + "num_tokens": 224314955.0, + "step": 186450 + }, + { + "entropy": 1.7635787680745125, + "epoch": 0.5780096856766218, + "grad_norm": 3.638246774673462, + "learning_rate": 3.327562996299935e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8763737082481384, + "num_tokens": 224328414.0, + "step": 186460 + }, + { + "entropy": 1.9035615831613542, + "epoch": 0.5780406848016715, + "grad_norm": 8.13401985168457, + "learning_rate": 3.327473769468389e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8484982311725616, + "num_tokens": 224339900.0, + "step": 186470 + }, + { + "entropy": 1.8948525011539459, + "epoch": 0.5780716839267211, + "grad_norm": 9.15529727935791, + "learning_rate": 3.3273845498141684e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8480350777506829, + "num_tokens": 224351827.0, + "step": 186480 + }, + { + "entropy": 1.8445970296859742, + "epoch": 0.5781026830517708, + "grad_norm": 7.669584274291992, + "learning_rate": 3.3272953373363104e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8681982949376106, + "num_tokens": 224363747.0, + "step": 186490 + }, + { + "entropy": 1.8775669053196906, + "epoch": 0.5781336821768206, + "grad_norm": 10.55215835571289, + "learning_rate": 3.3272061320338533e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8503807350993157, + "num_tokens": 224375666.0, + "step": 186500 + }, + { + "entropy": 1.8514750853180886, + "epoch": 0.5781646813018703, + "grad_norm": 9.644965171813965, + "learning_rate": 3.327116933905835e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.850967912375927, + "num_tokens": 224388240.0, + "step": 186510 + }, + { + "entropy": 1.8742257088422776, + "epoch": 0.5781956804269199, + "grad_norm": 8.107645988464355, + "learning_rate": 3.3270277429512948e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8402368798851967, + "num_tokens": 224400452.0, + "step": 186520 + }, + { + "entropy": 1.9575652033090591, + "epoch": 0.5782266795519696, + "grad_norm": 8.81623649597168, + "learning_rate": 3.3269385591692703e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8493165135383606, + "num_tokens": 224411160.0, + "step": 186530 + }, + { + "entropy": 1.9046847075223923, + "epoch": 0.5782576786770194, + "grad_norm": 7.765852451324463, + "learning_rate": 3.3268493825588014e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8459122270345688, + "num_tokens": 224422669.0, + "step": 186540 + }, + { + "entropy": 1.8750623926520347, + "epoch": 0.578288677802069, + "grad_norm": 7.867640495300293, + "learning_rate": 3.326760213118926e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.86184511333704, + "num_tokens": 224434513.0, + "step": 186550 + }, + { + "entropy": 1.9057077065110206, + "epoch": 0.5783196769271187, + "grad_norm": 7.460243225097656, + "learning_rate": 3.3266710508486826e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8422660037875176, + "num_tokens": 224446264.0, + "step": 186560 + }, + { + "entropy": 1.8610460609197617, + "epoch": 0.5783506760521684, + "grad_norm": 3.488306760787964, + "learning_rate": 3.326581895747112e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8676585868000984, + "num_tokens": 224457865.0, + "step": 186570 + }, + { + "entropy": 1.8878431126475335, + "epoch": 0.5783816751772182, + "grad_norm": 9.883951187133789, + "learning_rate": 3.3264927478132523e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8584196507930756, + "num_tokens": 224469459.0, + "step": 186580 + }, + { + "entropy": 1.8954463630914689, + "epoch": 0.5784126743022678, + "grad_norm": 8.813650131225586, + "learning_rate": 3.3264036070461443e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8573021531105042, + "num_tokens": 224481227.0, + "step": 186590 + }, + { + "entropy": 1.8967059776186943, + "epoch": 0.5784436734273175, + "grad_norm": 8.553400039672852, + "learning_rate": 3.326314473444827e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8501126199960709, + "num_tokens": 224492656.0, + "step": 186600 + }, + { + "entropy": 1.771684955060482, + "epoch": 0.5784746725523672, + "grad_norm": 8.498135566711426, + "learning_rate": 3.32622534700834e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8598705172538758, + "num_tokens": 224506650.0, + "step": 186610 + }, + { + "entropy": 1.8974428325891495, + "epoch": 0.578505671677417, + "grad_norm": 7.043177604675293, + "learning_rate": 3.3261362277357252e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.8514397636055946, + "num_tokens": 224518531.0, + "step": 186620 + }, + { + "entropy": 1.8689001992344856, + "epoch": 0.5785366708024666, + "grad_norm": 8.190739631652832, + "learning_rate": 3.326047115626021e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8575657159090042, + "num_tokens": 224530692.0, + "step": 186630 + }, + { + "entropy": 1.938750332593918, + "epoch": 0.5785676699275163, + "grad_norm": 8.82866096496582, + "learning_rate": 3.3259580106782683e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8508216217160225, + "num_tokens": 224542421.0, + "step": 186640 + }, + { + "entropy": 1.883426919579506, + "epoch": 0.578598669052566, + "grad_norm": 6.959622859954834, + "learning_rate": 3.32586891289151e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8552673414349556, + "num_tokens": 224555049.0, + "step": 186650 + }, + { + "entropy": 1.8930795446038247, + "epoch": 0.5786296681776157, + "grad_norm": 8.692614555358887, + "learning_rate": 3.325779822264784e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8654409319162368, + "num_tokens": 224566727.0, + "step": 186660 + }, + { + "entropy": 1.8495046302676201, + "epoch": 0.5786606673026654, + "grad_norm": 3.965505599975586, + "learning_rate": 3.325690738797133e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8580985084176064, + "num_tokens": 224579383.0, + "step": 186670 + }, + { + "entropy": 1.8006903648376464, + "epoch": 0.5786916664277151, + "grad_norm": 3.8050777912139893, + "learning_rate": 3.3256016624875973e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8640064790844917, + "num_tokens": 224592615.0, + "step": 186680 + }, + { + "entropy": 1.8454625502228736, + "epoch": 0.5787226655527647, + "grad_norm": 3.603572368621826, + "learning_rate": 3.3255125933352197e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8661387443542481, + "num_tokens": 224604638.0, + "step": 186690 + }, + { + "entropy": 1.8448929190635681, + "epoch": 0.5787536646778144, + "grad_norm": 9.511395454406738, + "learning_rate": 3.3254235313390405e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8726136729121208, + "num_tokens": 224617135.0, + "step": 186700 + }, + { + "entropy": 1.831068354845047, + "epoch": 0.5787846638028642, + "grad_norm": 2.786999225616455, + "learning_rate": 3.3253344764981014e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8656001672148704, + "num_tokens": 224629636.0, + "step": 186710 + }, + { + "entropy": 1.9099891155958175, + "epoch": 0.5788156629279139, + "grad_norm": 8.675889015197754, + "learning_rate": 3.325245428811446e-06, + "loss": 0.447, + "mean_token_accuracy": 0.859323938190937, + "num_tokens": 224640785.0, + "step": 186720 + }, + { + "entropy": 1.820456326007843, + "epoch": 0.5788466620529635, + "grad_norm": 8.677525520324707, + "learning_rate": 3.3251563882781154e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8548235654830932, + "num_tokens": 224653264.0, + "step": 186730 + }, + { + "entropy": 1.8897985056042672, + "epoch": 0.5788776611780132, + "grad_norm": 6.519395351409912, + "learning_rate": 3.3250673548971507e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8648929744958878, + "num_tokens": 224664511.0, + "step": 186740 + }, + { + "entropy": 1.8283945478498935, + "epoch": 0.578908660303063, + "grad_norm": 6.973257064819336, + "learning_rate": 3.3249783286675967e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8637444868683815, + "num_tokens": 224677556.0, + "step": 186750 + }, + { + "entropy": 1.8844505622982979, + "epoch": 0.5789396594281127, + "grad_norm": 8.534669876098633, + "learning_rate": 3.324889309588494e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8586746722459793, + "num_tokens": 224689269.0, + "step": 186760 + }, + { + "entropy": 1.8787866935133934, + "epoch": 0.5789706585531623, + "grad_norm": 3.6595122814178467, + "learning_rate": 3.324800297658888e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.870287548005581, + "num_tokens": 224700969.0, + "step": 186770 + }, + { + "entropy": 1.7502490743994712, + "epoch": 0.579001657678212, + "grad_norm": 8.145038604736328, + "learning_rate": 3.324711292877819e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8686568707227706, + "num_tokens": 224714989.0, + "step": 186780 + }, + { + "entropy": 1.9025743126869201, + "epoch": 0.5790326568032618, + "grad_norm": 8.816189765930176, + "learning_rate": 3.3246222952443317e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.860697540640831, + "num_tokens": 224726107.0, + "step": 186790 + }, + { + "entropy": 1.9007497653365135, + "epoch": 0.5790636559283114, + "grad_norm": 9.165550231933594, + "learning_rate": 3.3245333047574696e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8492210701107978, + "num_tokens": 224737335.0, + "step": 186800 + }, + { + "entropy": 1.853816030919552, + "epoch": 0.5790946550533611, + "grad_norm": 3.157280445098877, + "learning_rate": 3.3244443214162752e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8501716807484627, + "num_tokens": 224749158.0, + "step": 186810 + }, + { + "entropy": 1.8620413765311241, + "epoch": 0.5791256541784108, + "grad_norm": 7.5174431800842285, + "learning_rate": 3.3243553452197936e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8606794208288193, + "num_tokens": 224761123.0, + "step": 186820 + }, + { + "entropy": 1.8457257106900216, + "epoch": 0.5791566533034606, + "grad_norm": 9.554959297180176, + "learning_rate": 3.324266376167067e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8568149819970131, + "num_tokens": 224772955.0, + "step": 186830 + }, + { + "entropy": 1.9089529544115067, + "epoch": 0.5791876524285102, + "grad_norm": 8.996994972229004, + "learning_rate": 3.324177414257142e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8526898682117462, + "num_tokens": 224784360.0, + "step": 186840 + }, + { + "entropy": 1.894026516377926, + "epoch": 0.5792186515535599, + "grad_norm": 7.662275791168213, + "learning_rate": 3.324088459489061e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8504077926278114, + "num_tokens": 224795989.0, + "step": 186850 + }, + { + "entropy": 1.8165657207369805, + "epoch": 0.5792496506786096, + "grad_norm": 9.793351173400879, + "learning_rate": 3.323999511861869e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8550665840506554, + "num_tokens": 224808637.0, + "step": 186860 + }, + { + "entropy": 1.8741124719381332, + "epoch": 0.5792806498036593, + "grad_norm": 8.422378540039062, + "learning_rate": 3.3239105713746105e-06, + "loss": 0.6209, + "mean_token_accuracy": 0.827587154507637, + "num_tokens": 224821374.0, + "step": 186870 + }, + { + "entropy": 1.9334801375865935, + "epoch": 0.579311648928709, + "grad_norm": 7.4311747550964355, + "learning_rate": 3.3238216380263306e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8683941826224327, + "num_tokens": 224832001.0, + "step": 186880 + }, + { + "entropy": 1.8828293219208718, + "epoch": 0.5793426480537587, + "grad_norm": 8.103363037109375, + "learning_rate": 3.323732711816074e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8580566599965096, + "num_tokens": 224844029.0, + "step": 186890 + }, + { + "entropy": 1.927389144897461, + "epoch": 0.5793736471788083, + "grad_norm": 8.316423416137695, + "learning_rate": 3.323643792742886e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8571579784154892, + "num_tokens": 224855164.0, + "step": 186900 + }, + { + "entropy": 1.9288083717226983, + "epoch": 0.5794046463038581, + "grad_norm": 8.076248168945312, + "learning_rate": 3.323554880805812e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8487173110246659, + "num_tokens": 224866437.0, + "step": 186910 + }, + { + "entropy": 1.8607487827539444, + "epoch": 0.5794356454289078, + "grad_norm": 6.731062889099121, + "learning_rate": 3.3234659760038983e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8549078598618507, + "num_tokens": 224878689.0, + "step": 186920 + }, + { + "entropy": 1.8520071715116502, + "epoch": 0.5794666445539575, + "grad_norm": 8.76063346862793, + "learning_rate": 3.323377078336189e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8471438974142075, + "num_tokens": 224890831.0, + "step": 186930 + }, + { + "entropy": 1.750344455242157, + "epoch": 0.5794976436790071, + "grad_norm": 3.300847053527832, + "learning_rate": 3.3232881878017316e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8734717309474945, + "num_tokens": 224905046.0, + "step": 186940 + }, + { + "entropy": 1.9492957681417464, + "epoch": 0.5795286428040568, + "grad_norm": 7.751434326171875, + "learning_rate": 3.3231993043995707e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8514164552092552, + "num_tokens": 224916167.0, + "step": 186950 + }, + { + "entropy": 1.8545825704932213, + "epoch": 0.5795596419291066, + "grad_norm": 8.6134672164917, + "learning_rate": 3.3231104281287542e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8570887267589569, + "num_tokens": 224928011.0, + "step": 186960 + }, + { + "entropy": 1.801781241595745, + "epoch": 0.5795906410541563, + "grad_norm": 3.786421537399292, + "learning_rate": 3.3230215589883276e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8656805202364921, + "num_tokens": 224940817.0, + "step": 186970 + }, + { + "entropy": 1.9169122859835626, + "epoch": 0.5796216401792059, + "grad_norm": 8.907334327697754, + "learning_rate": 3.3229326969773367e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8506680697202682, + "num_tokens": 224952881.0, + "step": 186980 + }, + { + "entropy": 1.9773944050073624, + "epoch": 0.5796526393042556, + "grad_norm": 8.65813159942627, + "learning_rate": 3.3228438420948306e-06, + "loss": 0.5282, + "mean_token_accuracy": 0.8384165942668915, + "num_tokens": 224963431.0, + "step": 186990 + }, + { + "entropy": 1.8282001480460166, + "epoch": 0.5796836384293054, + "grad_norm": 7.797760963439941, + "learning_rate": 3.322754994339854e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8569248840212822, + "num_tokens": 224975554.0, + "step": 187000 + }, + { + "entropy": 1.9159265920519828, + "epoch": 0.579714637554355, + "grad_norm": 7.266661643981934, + "learning_rate": 3.322666153711455e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.848967120051384, + "num_tokens": 224987114.0, + "step": 187010 + }, + { + "entropy": 1.8885586738586426, + "epoch": 0.5797456366794047, + "grad_norm": 6.6897358894348145, + "learning_rate": 3.3225773202086815e-06, + "loss": 0.449, + "mean_token_accuracy": 0.860285858809948, + "num_tokens": 224998810.0, + "step": 187020 + }, + { + "entropy": 1.9200876533985138, + "epoch": 0.5797766358044544, + "grad_norm": 8.429367065429688, + "learning_rate": 3.32248849383058e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8620867624878883, + "num_tokens": 225010448.0, + "step": 187030 + }, + { + "entropy": 1.8974786669015884, + "epoch": 0.5798076349295042, + "grad_norm": 10.145453453063965, + "learning_rate": 3.3223996745761976e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8538267195224762, + "num_tokens": 225022286.0, + "step": 187040 + }, + { + "entropy": 1.8508014142513276, + "epoch": 0.5798386340545538, + "grad_norm": 6.897191047668457, + "learning_rate": 3.3223108624445845e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8764521285891533, + "num_tokens": 225033829.0, + "step": 187050 + }, + { + "entropy": 1.9024966299533843, + "epoch": 0.5798696331796035, + "grad_norm": 9.509614944458008, + "learning_rate": 3.3222220574347875e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8539675638079643, + "num_tokens": 225044717.0, + "step": 187060 + }, + { + "entropy": 1.9286258906126021, + "epoch": 0.5799006323046532, + "grad_norm": 9.100786209106445, + "learning_rate": 3.322133259545854e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8484894841909408, + "num_tokens": 225055870.0, + "step": 187070 + }, + { + "entropy": 1.8572608321905135, + "epoch": 0.5799316314297029, + "grad_norm": 7.915788650512695, + "learning_rate": 3.3220444687768326e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.859556196630001, + "num_tokens": 225067867.0, + "step": 187080 + }, + { + "entropy": 1.8662755504250526, + "epoch": 0.5799626305547526, + "grad_norm": 2.604182004928589, + "learning_rate": 3.321955685126773e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8469509720802307, + "num_tokens": 225080836.0, + "step": 187090 + }, + { + "entropy": 1.8107245221734047, + "epoch": 0.5799936296798023, + "grad_norm": 5.619945049285889, + "learning_rate": 3.321866908594724e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.851016329228878, + "num_tokens": 225093899.0, + "step": 187100 + }, + { + "entropy": 1.8843757688999176, + "epoch": 0.580024628804852, + "grad_norm": 8.56844425201416, + "learning_rate": 3.3217781391797337e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8601377189159394, + "num_tokens": 225105531.0, + "step": 187110 + }, + { + "entropy": 1.914840179681778, + "epoch": 0.5800556279299017, + "grad_norm": 4.212116718292236, + "learning_rate": 3.321689376880851e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8556945145130157, + "num_tokens": 225116937.0, + "step": 187120 + }, + { + "entropy": 1.8552332192659378, + "epoch": 0.5800866270549514, + "grad_norm": 8.2677001953125, + "learning_rate": 3.321600621697126e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8575369238853454, + "num_tokens": 225129995.0, + "step": 187130 + }, + { + "entropy": 1.7027757972478867, + "epoch": 0.5801176261800011, + "grad_norm": 7.780810832977295, + "learning_rate": 3.321511873627607e-06, + "loss": 0.2914, + "mean_token_accuracy": 0.8850486144423485, + "num_tokens": 225144239.0, + "step": 187140 + }, + { + "entropy": 1.8446943432092666, + "epoch": 0.5801486253050507, + "grad_norm": 8.228107452392578, + "learning_rate": 3.3214231326713453e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8636909976601601, + "num_tokens": 225156722.0, + "step": 187150 + }, + { + "entropy": 1.8933214277029038, + "epoch": 0.5801796244301005, + "grad_norm": 8.84110164642334, + "learning_rate": 3.3213343988273893e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8518497928977012, + "num_tokens": 225168460.0, + "step": 187160 + }, + { + "entropy": 1.7452817946672439, + "epoch": 0.5802106235551502, + "grad_norm": 8.720771789550781, + "learning_rate": 3.3212456720947895e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.869410839676857, + "num_tokens": 225182491.0, + "step": 187170 + }, + { + "entropy": 1.957821214199066, + "epoch": 0.5802416226801999, + "grad_norm": 8.16481876373291, + "learning_rate": 3.3211569524725963e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8408224180340766, + "num_tokens": 225193214.0, + "step": 187180 + }, + { + "entropy": 1.813154575228691, + "epoch": 0.5802726218052495, + "grad_norm": 7.962581634521484, + "learning_rate": 3.32106823995986e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8722536355257035, + "num_tokens": 225206240.0, + "step": 187190 + }, + { + "entropy": 1.970565813779831, + "epoch": 0.5803036209302992, + "grad_norm": 10.95875072479248, + "learning_rate": 3.3209795345556313e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8511197417974472, + "num_tokens": 225217327.0, + "step": 187200 + }, + { + "entropy": 1.8491960406303405, + "epoch": 0.580334620055349, + "grad_norm": 4.887589931488037, + "learning_rate": 3.3208908362589596e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.861751139163971, + "num_tokens": 225230075.0, + "step": 187210 + }, + { + "entropy": 1.8215350538492203, + "epoch": 0.5803656191803986, + "grad_norm": 7.864952564239502, + "learning_rate": 3.320802145068898e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8659237533807754, + "num_tokens": 225243314.0, + "step": 187220 + }, + { + "entropy": 1.8638467997312547, + "epoch": 0.5803966183054483, + "grad_norm": 7.326906204223633, + "learning_rate": 3.320713460984496e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8706554800271988, + "num_tokens": 225255618.0, + "step": 187230 + }, + { + "entropy": 1.9212041601538659, + "epoch": 0.580427617430498, + "grad_norm": 10.682036399841309, + "learning_rate": 3.320624784004805e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8419831424951554, + "num_tokens": 225266837.0, + "step": 187240 + }, + { + "entropy": 1.8850742995738983, + "epoch": 0.5804586165555478, + "grad_norm": 3.9985318183898926, + "learning_rate": 3.320536114128877e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8472855120897294, + "num_tokens": 225278716.0, + "step": 187250 + }, + { + "entropy": 1.9892206102609635, + "epoch": 0.5804896156805974, + "grad_norm": 9.449318885803223, + "learning_rate": 3.3204474513557626e-06, + "loss": 0.5333, + "mean_token_accuracy": 0.8419775024056435, + "num_tokens": 225289299.0, + "step": 187260 + }, + { + "entropy": 1.8804281800985336, + "epoch": 0.5805206148056471, + "grad_norm": 2.9244470596313477, + "learning_rate": 3.320358795684515e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8517866969108582, + "num_tokens": 225301373.0, + "step": 187270 + }, + { + "entropy": 1.9121669724583625, + "epoch": 0.5805516139306968, + "grad_norm": 7.696053981781006, + "learning_rate": 3.320270147114185e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8523363262414932, + "num_tokens": 225313048.0, + "step": 187280 + }, + { + "entropy": 1.8642026141285897, + "epoch": 0.5805826130557465, + "grad_norm": 7.293044567108154, + "learning_rate": 3.3201815056438252e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8651653498411178, + "num_tokens": 225324857.0, + "step": 187290 + }, + { + "entropy": 1.8944362625479698, + "epoch": 0.5806136121807962, + "grad_norm": 8.115937232971191, + "learning_rate": 3.3200928712724882e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.8387358605861663, + "num_tokens": 225336672.0, + "step": 187300 + }, + { + "entropy": 1.9070868030190469, + "epoch": 0.5806446113058459, + "grad_norm": 7.693739414215088, + "learning_rate": 3.320004243999225e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8645681887865067, + "num_tokens": 225347958.0, + "step": 187310 + }, + { + "entropy": 1.9648890227079392, + "epoch": 0.5806756104308956, + "grad_norm": 8.766926765441895, + "learning_rate": 3.3199156238230913e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8412608623504638, + "num_tokens": 225358571.0, + "step": 187320 + }, + { + "entropy": 1.8624957248568534, + "epoch": 0.5807066095559453, + "grad_norm": 9.408360481262207, + "learning_rate": 3.319827010743137e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8628850594162941, + "num_tokens": 225370298.0, + "step": 187330 + }, + { + "entropy": 1.8627533257007598, + "epoch": 0.580737608680995, + "grad_norm": 7.75230598449707, + "learning_rate": 3.3197384047584157e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8560406729578972, + "num_tokens": 225381912.0, + "step": 187340 + }, + { + "entropy": 1.8439243629574775, + "epoch": 0.5807686078060447, + "grad_norm": 6.105306625366211, + "learning_rate": 3.319649805867981e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8594496145844459, + "num_tokens": 225394374.0, + "step": 187350 + }, + { + "entropy": 1.8410490676760674, + "epoch": 0.5807996069310943, + "grad_norm": 9.341657638549805, + "learning_rate": 3.319561214070887e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8523242130875588, + "num_tokens": 225406849.0, + "step": 187360 + }, + { + "entropy": 1.8524947792291642, + "epoch": 0.5808306060561441, + "grad_norm": 8.550660133361816, + "learning_rate": 3.3194726293661866e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8669483736157417, + "num_tokens": 225418553.0, + "step": 187370 + }, + { + "entropy": 1.7620634004473685, + "epoch": 0.5808616051811938, + "grad_norm": 4.0590410232543945, + "learning_rate": 3.319384051752933e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8679928451776504, + "num_tokens": 225431710.0, + "step": 187380 + }, + { + "entropy": 1.863413827866316, + "epoch": 0.5808926043062435, + "grad_norm": 8.537755966186523, + "learning_rate": 3.3192954812301805e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8627542704343796, + "num_tokens": 225443800.0, + "step": 187390 + }, + { + "entropy": 1.9017139032483101, + "epoch": 0.5809236034312931, + "grad_norm": 9.76558780670166, + "learning_rate": 3.319206917796984e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.855692045390606, + "num_tokens": 225455170.0, + "step": 187400 + }, + { + "entropy": 1.8679040119051933, + "epoch": 0.5809546025563428, + "grad_norm": 5.080874919891357, + "learning_rate": 3.3191183614523958e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8646040439605713, + "num_tokens": 225466335.0, + "step": 187410 + }, + { + "entropy": 1.8454194337129592, + "epoch": 0.5809856016813926, + "grad_norm": 7.6558003425598145, + "learning_rate": 3.319029812195473e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8634024366736412, + "num_tokens": 225478223.0, + "step": 187420 + }, + { + "entropy": 1.8780192330479621, + "epoch": 0.5810166008064422, + "grad_norm": 6.734908103942871, + "learning_rate": 3.3189412700252675e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8515690982341766, + "num_tokens": 225489362.0, + "step": 187430 + }, + { + "entropy": 1.8287191420793534, + "epoch": 0.5810475999314919, + "grad_norm": 6.768209457397461, + "learning_rate": 3.3188527349408363e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.860761895775795, + "num_tokens": 225501988.0, + "step": 187440 + }, + { + "entropy": 1.895971204340458, + "epoch": 0.5810785990565416, + "grad_norm": 8.12165641784668, + "learning_rate": 3.318764206941233e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.853890660405159, + "num_tokens": 225513151.0, + "step": 187450 + }, + { + "entropy": 1.8142405509948731, + "epoch": 0.5811095981815914, + "grad_norm": 9.434362411499023, + "learning_rate": 3.318675686025512e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8667998522520065, + "num_tokens": 225525252.0, + "step": 187460 + }, + { + "entropy": 1.8143661960959434, + "epoch": 0.581140597306641, + "grad_norm": 4.254508972167969, + "learning_rate": 3.3185871721927317e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8701783046126366, + "num_tokens": 225537767.0, + "step": 187470 + }, + { + "entropy": 1.8940456122159959, + "epoch": 0.5811715964316907, + "grad_norm": 8.721658706665039, + "learning_rate": 3.3184986654419447e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8549002230167388, + "num_tokens": 225549434.0, + "step": 187480 + }, + { + "entropy": 1.8303603425621986, + "epoch": 0.5812025955567404, + "grad_norm": 4.4417405128479, + "learning_rate": 3.318410165772207e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8598802730441093, + "num_tokens": 225561041.0, + "step": 187490 + }, + { + "entropy": 1.9144440218806267, + "epoch": 0.5812335946817901, + "grad_norm": 8.221935272216797, + "learning_rate": 3.3183216731825756e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8425498574972152, + "num_tokens": 225572823.0, + "step": 187500 + }, + { + "entropy": 1.8884352698922158, + "epoch": 0.5812645938068398, + "grad_norm": 8.147493362426758, + "learning_rate": 3.3182331876721063e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8497319743037224, + "num_tokens": 225584616.0, + "step": 187510 + }, + { + "entropy": 1.8196037113666534, + "epoch": 0.5812955929318895, + "grad_norm": 8.769972801208496, + "learning_rate": 3.3181447092398542e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8692084148526191, + "num_tokens": 225596386.0, + "step": 187520 + }, + { + "entropy": 1.8051419720053672, + "epoch": 0.5813265920569392, + "grad_norm": 3.612482786178589, + "learning_rate": 3.3180562378848775e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8601198688149452, + "num_tokens": 225608958.0, + "step": 187530 + }, + { + "entropy": 1.8300196036696434, + "epoch": 0.5813575911819889, + "grad_norm": 3.7968201637268066, + "learning_rate": 3.317967773606231e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8680305197834969, + "num_tokens": 225621216.0, + "step": 187540 + }, + { + "entropy": 1.8044159524142742, + "epoch": 0.5813885903070386, + "grad_norm": 7.892989158630371, + "learning_rate": 3.3178793164029725e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8621145501732826, + "num_tokens": 225634613.0, + "step": 187550 + }, + { + "entropy": 1.872318272292614, + "epoch": 0.5814195894320883, + "grad_norm": 4.278754711151123, + "learning_rate": 3.3177908662741582e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8609319195151329, + "num_tokens": 225646477.0, + "step": 187560 + }, + { + "entropy": 1.8809518799185754, + "epoch": 0.5814505885571379, + "grad_norm": 6.835907459259033, + "learning_rate": 3.3177024232188454e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8569922402501107, + "num_tokens": 225658719.0, + "step": 187570 + }, + { + "entropy": 1.848207938671112, + "epoch": 0.5814815876821877, + "grad_norm": 8.355209350585938, + "learning_rate": 3.317613987236091e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8651770651340485, + "num_tokens": 225670144.0, + "step": 187580 + }, + { + "entropy": 1.9318113803863526, + "epoch": 0.5815125868072374, + "grad_norm": 7.146533012390137, + "learning_rate": 3.3175255583249538e-06, + "loss": 0.519, + "mean_token_accuracy": 0.8475144997239112, + "num_tokens": 225681379.0, + "step": 187590 + }, + { + "entropy": 1.897273415327072, + "epoch": 0.581543585932287, + "grad_norm": 8.656028747558594, + "learning_rate": 3.31743713648449e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8592107653617859, + "num_tokens": 225693244.0, + "step": 187600 + }, + { + "entropy": 1.8229992628097533, + "epoch": 0.5815745850573367, + "grad_norm": 8.393926620483398, + "learning_rate": 3.317348721713758e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8646789059042931, + "num_tokens": 225705254.0, + "step": 187610 + }, + { + "entropy": 1.887227413058281, + "epoch": 0.5816055841823865, + "grad_norm": 8.333544731140137, + "learning_rate": 3.3172603140118146e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8589311584830284, + "num_tokens": 225716548.0, + "step": 187620 + }, + { + "entropy": 1.8871796131134033, + "epoch": 0.5816365833074362, + "grad_norm": 8.231070518493652, + "learning_rate": 3.31717191337772e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8593043237924576, + "num_tokens": 225728445.0, + "step": 187630 + }, + { + "entropy": 1.8334648802876472, + "epoch": 0.5816675824324858, + "grad_norm": 8.094388961791992, + "learning_rate": 3.317083519810531e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8523730918765068, + "num_tokens": 225740029.0, + "step": 187640 + }, + { + "entropy": 1.8685568556189538, + "epoch": 0.5816985815575355, + "grad_norm": 9.478364944458008, + "learning_rate": 3.316995133309307e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8546089068055153, + "num_tokens": 225752464.0, + "step": 187650 + }, + { + "entropy": 1.8670329421758651, + "epoch": 0.5817295806825852, + "grad_norm": 7.892263412475586, + "learning_rate": 3.3169067538731053e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8570248574018479, + "num_tokens": 225764756.0, + "step": 187660 + }, + { + "entropy": 1.8107655197381973, + "epoch": 0.581760579807635, + "grad_norm": 4.070509910583496, + "learning_rate": 3.3168183815009868e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8547146677970886, + "num_tokens": 225777573.0, + "step": 187670 + }, + { + "entropy": 1.890822234749794, + "epoch": 0.5817915789326846, + "grad_norm": 8.35913372039795, + "learning_rate": 3.3167300161920084e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8575819626450538, + "num_tokens": 225788758.0, + "step": 187680 + }, + { + "entropy": 1.8458858624100685, + "epoch": 0.5818225780577343, + "grad_norm": 3.60536789894104, + "learning_rate": 3.3166416579452303e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8596644833683967, + "num_tokens": 225800356.0, + "step": 187690 + }, + { + "entropy": 1.893759785592556, + "epoch": 0.581853577182784, + "grad_norm": 9.219444274902344, + "learning_rate": 3.316553306759712e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8564515098929405, + "num_tokens": 225811643.0, + "step": 187700 + }, + { + "entropy": 1.7977493658661843, + "epoch": 0.5818845763078337, + "grad_norm": 3.17759370803833, + "learning_rate": 3.3164649626345125e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8628740921616554, + "num_tokens": 225824019.0, + "step": 187710 + }, + { + "entropy": 1.86475830078125, + "epoch": 0.5819155754328834, + "grad_norm": 4.8829498291015625, + "learning_rate": 3.316376625568692e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8398260667920112, + "num_tokens": 225836100.0, + "step": 187720 + }, + { + "entropy": 1.859655699133873, + "epoch": 0.5819465745579331, + "grad_norm": 3.9662086963653564, + "learning_rate": 3.31628829556131e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8486672386527061, + "num_tokens": 225848121.0, + "step": 187730 + }, + { + "entropy": 1.8039502635598184, + "epoch": 0.5819775736829828, + "grad_norm": 6.981991767883301, + "learning_rate": 3.316199972611427e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8625715032219887, + "num_tokens": 225861162.0, + "step": 187740 + }, + { + "entropy": 1.9068680822849273, + "epoch": 0.5820085728080325, + "grad_norm": 6.9623894691467285, + "learning_rate": 3.3161116567181025e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8555295959115028, + "num_tokens": 225872355.0, + "step": 187750 + }, + { + "entropy": 1.8862819537520408, + "epoch": 0.5820395719330822, + "grad_norm": 8.505620956420898, + "learning_rate": 3.3160233478803978e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.858185425400734, + "num_tokens": 225884385.0, + "step": 187760 + }, + { + "entropy": 1.8174796253442764, + "epoch": 0.5820705710581319, + "grad_norm": 3.593024253845215, + "learning_rate": 3.3159350460973725e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8744990170001984, + "num_tokens": 225896421.0, + "step": 187770 + }, + { + "entropy": 1.794604268670082, + "epoch": 0.5821015701831815, + "grad_norm": 4.108510971069336, + "learning_rate": 3.3158467513680887e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8654602453112602, + "num_tokens": 225909202.0, + "step": 187780 + }, + { + "entropy": 1.856592944264412, + "epoch": 0.5821325693082313, + "grad_norm": 3.522554636001587, + "learning_rate": 3.315758463691606e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8488976538181305, + "num_tokens": 225921116.0, + "step": 187790 + }, + { + "entropy": 1.7395574852824212, + "epoch": 0.582163568433281, + "grad_norm": 3.735677719116211, + "learning_rate": 3.315670183066986e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8713794186711311, + "num_tokens": 225934922.0, + "step": 187800 + }, + { + "entropy": 1.8028660848736764, + "epoch": 0.5821945675583307, + "grad_norm": 8.561286926269531, + "learning_rate": 3.3155819094932912e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8610808923840523, + "num_tokens": 225947073.0, + "step": 187810 + }, + { + "entropy": 1.9100270748138428, + "epoch": 0.5822255666833803, + "grad_norm": 9.145927429199219, + "learning_rate": 3.3154936429695807e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8617308288812637, + "num_tokens": 225958338.0, + "step": 187820 + }, + { + "entropy": 1.8450745210051536, + "epoch": 0.5822565658084301, + "grad_norm": 3.313938617706299, + "learning_rate": 3.3154053834949173e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8720159530639648, + "num_tokens": 225970594.0, + "step": 187830 + }, + { + "entropy": 1.8195609986782073, + "epoch": 0.5822875649334798, + "grad_norm": 3.9881591796875, + "learning_rate": 3.315317131068364e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8652278319001198, + "num_tokens": 225983382.0, + "step": 187840 + }, + { + "entropy": 1.8461186781525611, + "epoch": 0.5823185640585294, + "grad_norm": 8.813285827636719, + "learning_rate": 3.315228885688981e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8545858830213546, + "num_tokens": 225995123.0, + "step": 187850 + }, + { + "entropy": 1.8090190321207047, + "epoch": 0.5823495631835791, + "grad_norm": 8.754561424255371, + "learning_rate": 3.3151406473558305e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8618172109127045, + "num_tokens": 226008105.0, + "step": 187860 + }, + { + "entropy": 1.867758809030056, + "epoch": 0.5823805623086289, + "grad_norm": 7.915543556213379, + "learning_rate": 3.315052416067976e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8492542281746864, + "num_tokens": 226020743.0, + "step": 187870 + }, + { + "entropy": 1.823201984167099, + "epoch": 0.5824115614336786, + "grad_norm": 8.312398910522461, + "learning_rate": 3.3149641918244797e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8526169434189796, + "num_tokens": 226033165.0, + "step": 187880 + }, + { + "entropy": 1.9004525065422058, + "epoch": 0.5824425605587282, + "grad_norm": 7.280655384063721, + "learning_rate": 3.3148759746244036e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8470793083310127, + "num_tokens": 226045476.0, + "step": 187890 + }, + { + "entropy": 1.7311370939016342, + "epoch": 0.5824735596837779, + "grad_norm": 9.387449264526367, + "learning_rate": 3.3147877644668114e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8730020001530647, + "num_tokens": 226059148.0, + "step": 187900 + }, + { + "entropy": 1.814567594230175, + "epoch": 0.5825045588088276, + "grad_norm": 3.6906380653381348, + "learning_rate": 3.3146995613507654e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.856498584151268, + "num_tokens": 226072217.0, + "step": 187910 + }, + { + "entropy": 1.8509540766477586, + "epoch": 0.5825355579338773, + "grad_norm": 9.102182388305664, + "learning_rate": 3.3146113652753294e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8627394825220108, + "num_tokens": 226084204.0, + "step": 187920 + }, + { + "entropy": 1.7961204290390014, + "epoch": 0.582566557058927, + "grad_norm": 7.360368728637695, + "learning_rate": 3.3145231762395663e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8610507771372795, + "num_tokens": 226097223.0, + "step": 187930 + }, + { + "entropy": 1.9453897044062614, + "epoch": 0.5825975561839767, + "grad_norm": 8.529144287109375, + "learning_rate": 3.3144349942425403e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8498910903930664, + "num_tokens": 226108705.0, + "step": 187940 + }, + { + "entropy": 1.881111888587475, + "epoch": 0.5826285553090264, + "grad_norm": 7.892261505126953, + "learning_rate": 3.3143468192833146e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.867023055255413, + "num_tokens": 226119736.0, + "step": 187950 + }, + { + "entropy": 1.9663893669843673, + "epoch": 0.5826595544340761, + "grad_norm": 9.200639724731445, + "learning_rate": 3.3142586513609527e-06, + "loss": 0.499, + "mean_token_accuracy": 0.841266855597496, + "num_tokens": 226131251.0, + "step": 187960 + }, + { + "entropy": 1.8415462985634803, + "epoch": 0.5826905535591258, + "grad_norm": 7.170725345611572, + "learning_rate": 3.3141704904745196e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8617974206805229, + "num_tokens": 226142855.0, + "step": 187970 + }, + { + "entropy": 1.869658489525318, + "epoch": 0.5827215526841755, + "grad_norm": 7.896175861358643, + "learning_rate": 3.314082336623079e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8597209066152572, + "num_tokens": 226154990.0, + "step": 187980 + }, + { + "entropy": 1.8297441124916076, + "epoch": 0.5827525518092251, + "grad_norm": 3.932016134262085, + "learning_rate": 3.313994189805696e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8598747432231904, + "num_tokens": 226167448.0, + "step": 187990 + }, + { + "entropy": 1.871214209496975, + "epoch": 0.5827835509342749, + "grad_norm": 9.672822952270508, + "learning_rate": 3.3139060500214345e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8542467385530472, + "num_tokens": 226178960.0, + "step": 188000 + }, + { + "entropy": 1.7930740877985953, + "epoch": 0.5828145500593246, + "grad_norm": 8.267410278320312, + "learning_rate": 3.313817917269359e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8633213341236115, + "num_tokens": 226191638.0, + "step": 188010 + }, + { + "entropy": 1.8819244831800461, + "epoch": 0.5828455491843743, + "grad_norm": 8.082293510437012, + "learning_rate": 3.313729791548535e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8530665084719657, + "num_tokens": 226202623.0, + "step": 188020 + }, + { + "entropy": 1.8311271622776986, + "epoch": 0.5828765483094239, + "grad_norm": 11.455146789550781, + "learning_rate": 3.3136416728580277e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8615739867091179, + "num_tokens": 226215309.0, + "step": 188030 + }, + { + "entropy": 1.904327604174614, + "epoch": 0.5829075474344737, + "grad_norm": 7.183170318603516, + "learning_rate": 3.313553561196902e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8606903403997421, + "num_tokens": 226226613.0, + "step": 188040 + }, + { + "entropy": 1.897905382514, + "epoch": 0.5829385465595234, + "grad_norm": 8.285850524902344, + "learning_rate": 3.3134654565642236e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8442597165703773, + "num_tokens": 226238398.0, + "step": 188050 + }, + { + "entropy": 1.8283357247710228, + "epoch": 0.582969545684573, + "grad_norm": 6.005077838897705, + "learning_rate": 3.3133773589590583e-06, + "loss": 0.391, + "mean_token_accuracy": 0.862805712223053, + "num_tokens": 226251131.0, + "step": 188060 + }, + { + "entropy": 1.9413809180259705, + "epoch": 0.5830005448096227, + "grad_norm": 14.281983375549316, + "learning_rate": 3.3132892683804725e-06, + "loss": 0.5778, + "mean_token_accuracy": 0.840830785036087, + "num_tokens": 226262959.0, + "step": 188070 + }, + { + "entropy": 1.8144272148609162, + "epoch": 0.5830315439346725, + "grad_norm": 8.84279727935791, + "learning_rate": 3.313201184827531e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8624243274331093, + "num_tokens": 226276078.0, + "step": 188080 + }, + { + "entropy": 1.9170308411121368, + "epoch": 0.5830625430597222, + "grad_norm": 4.164804935455322, + "learning_rate": 3.3131131082993e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8424900874495507, + "num_tokens": 226287036.0, + "step": 188090 + }, + { + "entropy": 1.8704634562134743, + "epoch": 0.5830935421847718, + "grad_norm": 8.712486267089844, + "learning_rate": 3.3130250387948467e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8427205190062523, + "num_tokens": 226299076.0, + "step": 188100 + }, + { + "entropy": 1.841335417330265, + "epoch": 0.5831245413098215, + "grad_norm": 7.971294403076172, + "learning_rate": 3.3129369763132374e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8621138870716095, + "num_tokens": 226311492.0, + "step": 188110 + }, + { + "entropy": 1.876630488038063, + "epoch": 0.5831555404348713, + "grad_norm": 3.832597255706787, + "learning_rate": 3.312848920853538e-06, + "loss": 0.463, + "mean_token_accuracy": 0.859805490076542, + "num_tokens": 226323785.0, + "step": 188120 + }, + { + "entropy": 1.9168067395687103, + "epoch": 0.583186539559921, + "grad_norm": 8.005231857299805, + "learning_rate": 3.312760872414816e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8578100666403771, + "num_tokens": 226335299.0, + "step": 188130 + }, + { + "entropy": 1.844995990395546, + "epoch": 0.5832175386849706, + "grad_norm": 6.676909923553467, + "learning_rate": 3.3126728309961387e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8538479104638099, + "num_tokens": 226347611.0, + "step": 188140 + }, + { + "entropy": 1.9432258665561677, + "epoch": 0.5832485378100203, + "grad_norm": 3.9236388206481934, + "learning_rate": 3.312584796596573e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8563521727919579, + "num_tokens": 226358832.0, + "step": 188150 + }, + { + "entropy": 1.830638773739338, + "epoch": 0.58327953693507, + "grad_norm": 8.679086685180664, + "learning_rate": 3.312496769215186e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8629469037055969, + "num_tokens": 226372333.0, + "step": 188160 + }, + { + "entropy": 1.9505089595913887, + "epoch": 0.5833105360601197, + "grad_norm": 9.412543296813965, + "learning_rate": 3.312408748851046e-06, + "loss": 0.5211, + "mean_token_accuracy": 0.8360338136553764, + "num_tokens": 226384676.0, + "step": 188170 + }, + { + "entropy": 1.7972653448581695, + "epoch": 0.5833415351851694, + "grad_norm": 6.050276756286621, + "learning_rate": 3.3123207355032193e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8623633801937103, + "num_tokens": 226397428.0, + "step": 188180 + }, + { + "entropy": 1.8783445715904237, + "epoch": 0.5833725343102191, + "grad_norm": 3.4504499435424805, + "learning_rate": 3.312232729170776e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.854879067838192, + "num_tokens": 226409277.0, + "step": 188190 + }, + { + "entropy": 1.8437550991773606, + "epoch": 0.5834035334352687, + "grad_norm": 8.014336585998535, + "learning_rate": 3.3121447298527826e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8607449740171432, + "num_tokens": 226421381.0, + "step": 188200 + }, + { + "entropy": 1.8978667482733727, + "epoch": 0.5834345325603185, + "grad_norm": 7.275966644287109, + "learning_rate": 3.3120567375483074e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8560662552714348, + "num_tokens": 226433314.0, + "step": 188210 + }, + { + "entropy": 1.7705576822161675, + "epoch": 0.5834655316853682, + "grad_norm": 8.13685131072998, + "learning_rate": 3.3119687522564193e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8652193561196327, + "num_tokens": 226447082.0, + "step": 188220 + }, + { + "entropy": 1.8670435428619385, + "epoch": 0.5834965308104179, + "grad_norm": 8.564881324768066, + "learning_rate": 3.3118807739761864e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8523515909910202, + "num_tokens": 226458416.0, + "step": 188230 + }, + { + "entropy": 1.7603057324886322, + "epoch": 0.5835275299354675, + "grad_norm": 8.4653959274292, + "learning_rate": 3.311792802706678e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8672403365373611, + "num_tokens": 226471502.0, + "step": 188240 + }, + { + "entropy": 1.8395553201436996, + "epoch": 0.5835585290605173, + "grad_norm": 8.05081844329834, + "learning_rate": 3.3117048384469636e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8622571364045143, + "num_tokens": 226483666.0, + "step": 188250 + }, + { + "entropy": 1.9094887733459474, + "epoch": 0.583589528185567, + "grad_norm": 7.921069145202637, + "learning_rate": 3.311616881196111e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8507940858602524, + "num_tokens": 226494845.0, + "step": 188260 + }, + { + "entropy": 1.9042211279273034, + "epoch": 0.5836205273106166, + "grad_norm": 9.858515739440918, + "learning_rate": 3.3115289309531896e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8520038589835167, + "num_tokens": 226506098.0, + "step": 188270 + }, + { + "entropy": 1.9129501059651375, + "epoch": 0.5836515264356663, + "grad_norm": 4.229575157165527, + "learning_rate": 3.31144098771727e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8541373431682586, + "num_tokens": 226517333.0, + "step": 188280 + }, + { + "entropy": 1.8737207755446434, + "epoch": 0.5836825255607161, + "grad_norm": 9.133805274963379, + "learning_rate": 3.31135305148742e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8556098580360413, + "num_tokens": 226529232.0, + "step": 188290 + }, + { + "entropy": 1.8534547090530396, + "epoch": 0.5837135246857658, + "grad_norm": 8.339399337768555, + "learning_rate": 3.3112651222627118e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8503861218690872, + "num_tokens": 226541327.0, + "step": 188300 + }, + { + "entropy": 1.8679198250174522, + "epoch": 0.5837445238108154, + "grad_norm": 7.660033226013184, + "learning_rate": 3.3111772000422136e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8588420808315277, + "num_tokens": 226553424.0, + "step": 188310 + }, + { + "entropy": 1.9072294965386392, + "epoch": 0.5837755229358651, + "grad_norm": 7.387581825256348, + "learning_rate": 3.3110892848249966e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8525169312953949, + "num_tokens": 226564494.0, + "step": 188320 + }, + { + "entropy": 1.8538544602692126, + "epoch": 0.5838065220609149, + "grad_norm": 11.310450553894043, + "learning_rate": 3.3110013766101297e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8551127001643181, + "num_tokens": 226577233.0, + "step": 188330 + }, + { + "entropy": 1.9005723729729653, + "epoch": 0.5838375211859645, + "grad_norm": 8.547388076782227, + "learning_rate": 3.310913475396685e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8635548874735832, + "num_tokens": 226588550.0, + "step": 188340 + }, + { + "entropy": 1.828601559996605, + "epoch": 0.5838685203110142, + "grad_norm": 4.200283527374268, + "learning_rate": 3.310825581183732e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8699035704135895, + "num_tokens": 226601722.0, + "step": 188350 + }, + { + "entropy": 1.9584389060735703, + "epoch": 0.5838995194360639, + "grad_norm": 8.566116333007812, + "learning_rate": 3.3107376939703425e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8420336455106735, + "num_tokens": 226612375.0, + "step": 188360 + }, + { + "entropy": 1.930802473425865, + "epoch": 0.5839305185611137, + "grad_norm": 9.12570858001709, + "learning_rate": 3.310649813755587e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8506847187876702, + "num_tokens": 226623213.0, + "step": 188370 + }, + { + "entropy": 1.9112320333719253, + "epoch": 0.5839615176861633, + "grad_norm": 8.679109573364258, + "learning_rate": 3.3105619405385365e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8585551649332046, + "num_tokens": 226634267.0, + "step": 188380 + }, + { + "entropy": 1.9466535836458205, + "epoch": 0.583992516811213, + "grad_norm": 4.623629093170166, + "learning_rate": 3.310474074318262e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.843099795281887, + "num_tokens": 226645782.0, + "step": 188390 + }, + { + "entropy": 1.751372517645359, + "epoch": 0.5840235159362627, + "grad_norm": 3.6573829650878906, + "learning_rate": 3.3103862150938366e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8672532960772514, + "num_tokens": 226659879.0, + "step": 188400 + }, + { + "entropy": 1.878103531897068, + "epoch": 0.5840545150613123, + "grad_norm": 3.6132452487945557, + "learning_rate": 3.310298362864331e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.864518155157566, + "num_tokens": 226671393.0, + "step": 188410 + }, + { + "entropy": 1.8883730322122574, + "epoch": 0.5840855141863621, + "grad_norm": 7.3440327644348145, + "learning_rate": 3.310210517628817e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8584344744682312, + "num_tokens": 226683271.0, + "step": 188420 + }, + { + "entropy": 1.8579667672514915, + "epoch": 0.5841165133114118, + "grad_norm": 3.851297616958618, + "learning_rate": 3.3101226793863665e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8607630103826522, + "num_tokens": 226695544.0, + "step": 188430 + }, + { + "entropy": 1.8530279219150543, + "epoch": 0.5841475124364615, + "grad_norm": 9.353023529052734, + "learning_rate": 3.3100348481360524e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8588374391198158, + "num_tokens": 226707908.0, + "step": 188440 + }, + { + "entropy": 1.9490127682685852, + "epoch": 0.5841785115615111, + "grad_norm": 4.572386741638184, + "learning_rate": 3.3099470238769466e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8485381156206131, + "num_tokens": 226719751.0, + "step": 188450 + }, + { + "entropy": 1.8131704688072205, + "epoch": 0.5842095106865609, + "grad_norm": 7.624702453613281, + "learning_rate": 3.309859206608122e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8713184505701065, + "num_tokens": 226732052.0, + "step": 188460 + }, + { + "entropy": 1.9276420667767524, + "epoch": 0.5842405098116106, + "grad_norm": 7.374622344970703, + "learning_rate": 3.3097713963286505e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8484125405550003, + "num_tokens": 226743129.0, + "step": 188470 + }, + { + "entropy": 1.8226234167814255, + "epoch": 0.5842715089366602, + "grad_norm": 7.201011657714844, + "learning_rate": 3.309683593037606e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8551812097430229, + "num_tokens": 226756342.0, + "step": 188480 + }, + { + "entropy": 1.8935386508703231, + "epoch": 0.5843025080617099, + "grad_norm": 3.1278462409973145, + "learning_rate": 3.309595796734061e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8602985307574272, + "num_tokens": 226768034.0, + "step": 188490 + }, + { + "entropy": 1.897402636706829, + "epoch": 0.5843335071867597, + "grad_norm": 4.209484577178955, + "learning_rate": 3.3095080074170894e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8683418110013008, + "num_tokens": 226779243.0, + "step": 188500 + }, + { + "entropy": 1.7633845642209054, + "epoch": 0.5843645063118094, + "grad_norm": 4.389909267425537, + "learning_rate": 3.309420225085764e-06, + "loss": 0.371, + "mean_token_accuracy": 0.87280925065279, + "num_tokens": 226791987.0, + "step": 188510 + }, + { + "entropy": 1.9105349063873291, + "epoch": 0.584395505436859, + "grad_norm": 8.567315101623535, + "learning_rate": 3.3093324497391587e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8505501300096512, + "num_tokens": 226802929.0, + "step": 188520 + }, + { + "entropy": 1.8890083134174347, + "epoch": 0.5844265045619087, + "grad_norm": 7.157079696655273, + "learning_rate": 3.3092446813763467e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8604133263230324, + "num_tokens": 226814577.0, + "step": 188530 + }, + { + "entropy": 1.93582623898983, + "epoch": 0.5844575036869585, + "grad_norm": 7.214765548706055, + "learning_rate": 3.309156919996403e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8459798738360405, + "num_tokens": 226825129.0, + "step": 188540 + }, + { + "entropy": 1.8311146199703217, + "epoch": 0.5844885028120081, + "grad_norm": 4.586556434631348, + "learning_rate": 3.309069165598401e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8530444666743279, + "num_tokens": 226837973.0, + "step": 188550 + }, + { + "entropy": 1.8696679010987283, + "epoch": 0.5845195019370578, + "grad_norm": 7.468491077423096, + "learning_rate": 3.308981418181415e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8730146139860153, + "num_tokens": 226849471.0, + "step": 188560 + }, + { + "entropy": 1.9135367214679717, + "epoch": 0.5845505010621075, + "grad_norm": 8.727263450622559, + "learning_rate": 3.3088936777445195e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8547710910439491, + "num_tokens": 226860616.0, + "step": 188570 + }, + { + "entropy": 1.8736458599567414, + "epoch": 0.5845815001871573, + "grad_norm": 4.631098747253418, + "learning_rate": 3.3088059442867896e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8680781036615371, + "num_tokens": 226871990.0, + "step": 188580 + }, + { + "entropy": 1.8468430042266846, + "epoch": 0.5846124993122069, + "grad_norm": 4.6543049812316895, + "learning_rate": 3.3087182178072996e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8615953475236893, + "num_tokens": 226884227.0, + "step": 188590 + }, + { + "entropy": 1.8528510972857475, + "epoch": 0.5846434984372566, + "grad_norm": 3.4963319301605225, + "learning_rate": 3.308630498305125e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8522498548030853, + "num_tokens": 226896429.0, + "step": 188600 + }, + { + "entropy": 1.7623168423771858, + "epoch": 0.5846744975623063, + "grad_norm": 3.6100804805755615, + "learning_rate": 3.30854278577934e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8676817521452904, + "num_tokens": 226909499.0, + "step": 188610 + }, + { + "entropy": 1.8766853883862495, + "epoch": 0.584705496687356, + "grad_norm": 4.469761848449707, + "learning_rate": 3.308455080229021e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8567523375153542, + "num_tokens": 226922210.0, + "step": 188620 + }, + { + "entropy": 1.9278080672025681, + "epoch": 0.5847364958124057, + "grad_norm": 7.888815402984619, + "learning_rate": 3.308367381653242e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.851407541334629, + "num_tokens": 226933618.0, + "step": 188630 + }, + { + "entropy": 1.892588023841381, + "epoch": 0.5847674949374554, + "grad_norm": 8.37109661102295, + "learning_rate": 3.3082796900510807e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8398651763796806, + "num_tokens": 226945299.0, + "step": 188640 + }, + { + "entropy": 1.9134973019361496, + "epoch": 0.5847984940625051, + "grad_norm": 8.243192672729492, + "learning_rate": 3.3081920054216115e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8384934589266777, + "num_tokens": 226956769.0, + "step": 188650 + }, + { + "entropy": 1.9360808670520782, + "epoch": 0.5848294931875547, + "grad_norm": 8.460143089294434, + "learning_rate": 3.308104327763911e-06, + "loss": 0.5219, + "mean_token_accuracy": 0.8484696820378304, + "num_tokens": 226967465.0, + "step": 188660 + }, + { + "entropy": 1.8962484419345855, + "epoch": 0.5848604923126045, + "grad_norm": 9.766073226928711, + "learning_rate": 3.308016657077055e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8597484767436981, + "num_tokens": 226978952.0, + "step": 188670 + }, + { + "entropy": 1.8484892681241036, + "epoch": 0.5848914914376542, + "grad_norm": 8.785158157348633, + "learning_rate": 3.3079289933601196e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8503390610218048, + "num_tokens": 226990498.0, + "step": 188680 + }, + { + "entropy": 1.9609992295503615, + "epoch": 0.5849224905627038, + "grad_norm": 8.992911338806152, + "learning_rate": 3.307841336612182e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8564995005726814, + "num_tokens": 227001098.0, + "step": 188690 + }, + { + "entropy": 1.7960527956485748, + "epoch": 0.5849534896877535, + "grad_norm": 4.051634311676025, + "learning_rate": 3.307753686832319e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8742998942732811, + "num_tokens": 227014146.0, + "step": 188700 + }, + { + "entropy": 1.8706871971488, + "epoch": 0.5849844888128033, + "grad_norm": 10.606660842895508, + "learning_rate": 3.3076660440196073e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8497942507266998, + "num_tokens": 227025140.0, + "step": 188710 + }, + { + "entropy": 1.8714196056127548, + "epoch": 0.585015487937853, + "grad_norm": 5.643126964569092, + "learning_rate": 3.307578408173123e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8556253165006638, + "num_tokens": 227036917.0, + "step": 188720 + }, + { + "entropy": 1.7751472130417825, + "epoch": 0.5850464870629026, + "grad_norm": 9.494732856750488, + "learning_rate": 3.307490779291944e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8668629273772239, + "num_tokens": 227049736.0, + "step": 188730 + }, + { + "entropy": 1.8384251773357392, + "epoch": 0.5850774861879523, + "grad_norm": 8.89919376373291, + "learning_rate": 3.307403157375148e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8520896717905998, + "num_tokens": 227061666.0, + "step": 188740 + }, + { + "entropy": 1.8475905492901803, + "epoch": 0.5851084853130021, + "grad_norm": 8.330229759216309, + "learning_rate": 3.307315542421812e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8590241074562073, + "num_tokens": 227073116.0, + "step": 188750 + }, + { + "entropy": 1.7911785036325454, + "epoch": 0.5851394844380517, + "grad_norm": 7.692288398742676, + "learning_rate": 3.307227934431014e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8710220009088516, + "num_tokens": 227085773.0, + "step": 188760 + }, + { + "entropy": 1.8938721969723702, + "epoch": 0.5851704835631014, + "grad_norm": 3.932691812515259, + "learning_rate": 3.3071403334018326e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8493056833744049, + "num_tokens": 227097247.0, + "step": 188770 + }, + { + "entropy": 1.708780439198017, + "epoch": 0.5852014826881511, + "grad_norm": 3.246295690536499, + "learning_rate": 3.307052739333345e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8691340193152428, + "num_tokens": 227111719.0, + "step": 188780 + }, + { + "entropy": 1.897132021188736, + "epoch": 0.5852324818132009, + "grad_norm": 8.147296905517578, + "learning_rate": 3.3069651522246294e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8504904195666313, + "num_tokens": 227123856.0, + "step": 188790 + }, + { + "entropy": 1.9081890538334847, + "epoch": 0.5852634809382505, + "grad_norm": 8.91196346282959, + "learning_rate": 3.3068775720747642e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8543705970048905, + "num_tokens": 227134956.0, + "step": 188800 + }, + { + "entropy": 1.8409344106912613, + "epoch": 0.5852944800633002, + "grad_norm": 7.536337375640869, + "learning_rate": 3.306789998882828e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8481650918722152, + "num_tokens": 227146690.0, + "step": 188810 + }, + { + "entropy": 1.7761472210288047, + "epoch": 0.5853254791883499, + "grad_norm": 9.95036506652832, + "learning_rate": 3.3067024326478997e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8567533954977989, + "num_tokens": 227159423.0, + "step": 188820 + }, + { + "entropy": 1.822265100479126, + "epoch": 0.5853564783133997, + "grad_norm": 4.746108531951904, + "learning_rate": 3.3066148733690585e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8569403976202011, + "num_tokens": 227171637.0, + "step": 188830 + }, + { + "entropy": 1.7964641809463502, + "epoch": 0.5853874774384493, + "grad_norm": 5.9528374671936035, + "learning_rate": 3.306527321045383e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8559579864144325, + "num_tokens": 227184996.0, + "step": 188840 + }, + { + "entropy": 1.8235788196325302, + "epoch": 0.585418476563499, + "grad_norm": 9.29820442199707, + "learning_rate": 3.3064397756759524e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8652059838175774, + "num_tokens": 227196975.0, + "step": 188850 + }, + { + "entropy": 1.9079763412475585, + "epoch": 0.5854494756885487, + "grad_norm": 7.375221252441406, + "learning_rate": 3.3063522372598465e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8551223009824753, + "num_tokens": 227208072.0, + "step": 188860 + }, + { + "entropy": 1.8686128735542298, + "epoch": 0.5854804748135984, + "grad_norm": 3.882939338684082, + "learning_rate": 3.3062647057961455e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8613992780447006, + "num_tokens": 227219452.0, + "step": 188870 + }, + { + "entropy": 1.8170276552438736, + "epoch": 0.5855114739386481, + "grad_norm": 8.03610897064209, + "learning_rate": 3.306177181283928e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8691373020410538, + "num_tokens": 227232288.0, + "step": 188880 + }, + { + "entropy": 1.7830303296446801, + "epoch": 0.5855424730636978, + "grad_norm": 4.951107978820801, + "learning_rate": 3.3060896637222738e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8494992271065712, + "num_tokens": 227245140.0, + "step": 188890 + }, + { + "entropy": 1.808831486105919, + "epoch": 0.5855734721887474, + "grad_norm": 9.004613876342773, + "learning_rate": 3.3060021531102636e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8603759378194809, + "num_tokens": 227258195.0, + "step": 188900 + }, + { + "entropy": 1.8654738262295723, + "epoch": 0.5856044713137971, + "grad_norm": 8.35360336303711, + "learning_rate": 3.3059146494469785e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8470656216144562, + "num_tokens": 227269403.0, + "step": 188910 + }, + { + "entropy": 1.90666244328022, + "epoch": 0.5856354704388469, + "grad_norm": 7.269579887390137, + "learning_rate": 3.3058271527314976e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8671371981501579, + "num_tokens": 227281068.0, + "step": 188920 + }, + { + "entropy": 1.8575873374938965, + "epoch": 0.5856664695638966, + "grad_norm": 7.192220211029053, + "learning_rate": 3.3057396629629024e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8495822846889496, + "num_tokens": 227292522.0, + "step": 188930 + }, + { + "entropy": 1.8657715290784835, + "epoch": 0.5856974686889462, + "grad_norm": 7.0419135093688965, + "learning_rate": 3.305652180140273e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8614326119422913, + "num_tokens": 227304770.0, + "step": 188940 + }, + { + "entropy": 1.9066298857331276, + "epoch": 0.5857284678139959, + "grad_norm": 7.648441791534424, + "learning_rate": 3.3055647042626904e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8575715780258178, + "num_tokens": 227315975.0, + "step": 188950 + }, + { + "entropy": 1.8731012284755706, + "epoch": 0.5857594669390457, + "grad_norm": 6.847329139709473, + "learning_rate": 3.305477235329237e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8527000054717064, + "num_tokens": 227327413.0, + "step": 188960 + }, + { + "entropy": 1.8332221895456313, + "epoch": 0.5857904660640953, + "grad_norm": 9.892985343933105, + "learning_rate": 3.305389773338992e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8629180341959, + "num_tokens": 227339150.0, + "step": 188970 + }, + { + "entropy": 1.88137661293149, + "epoch": 0.585821465189145, + "grad_norm": 8.720916748046875, + "learning_rate": 3.305302318291039e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8541831061244011, + "num_tokens": 227350744.0, + "step": 188980 + }, + { + "entropy": 1.7486733600497246, + "epoch": 0.5858524643141947, + "grad_norm": 8.791574478149414, + "learning_rate": 3.3052148701844576e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.87051263153553, + "num_tokens": 227364249.0, + "step": 188990 + }, + { + "entropy": 1.8991805881261825, + "epoch": 0.5858834634392445, + "grad_norm": 7.9490461349487305, + "learning_rate": 3.3051274290183317e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8558553963899612, + "num_tokens": 227375754.0, + "step": 189000 + }, + { + "entropy": 1.895850320160389, + "epoch": 0.5859144625642941, + "grad_norm": 8.310858726501465, + "learning_rate": 3.305039994791741e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.85080985724926, + "num_tokens": 227387025.0, + "step": 189010 + }, + { + "entropy": 1.8519131153821946, + "epoch": 0.5859454616893438, + "grad_norm": 2.4337899684906006, + "learning_rate": 3.3049525675037696e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8626031950116158, + "num_tokens": 227399366.0, + "step": 189020 + }, + { + "entropy": 1.8202516600489616, + "epoch": 0.5859764608143935, + "grad_norm": 3.6557323932647705, + "learning_rate": 3.3048651471535e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8586086541414261, + "num_tokens": 227411475.0, + "step": 189030 + }, + { + "entropy": 1.7547407761216163, + "epoch": 0.5860074599394433, + "grad_norm": 6.2841997146606445, + "learning_rate": 3.304777733740012e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8678300872445106, + "num_tokens": 227425090.0, + "step": 189040 + }, + { + "entropy": 1.876267357170582, + "epoch": 0.5860384590644929, + "grad_norm": 4.17917537689209, + "learning_rate": 3.304690327262391e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8612060084939003, + "num_tokens": 227436823.0, + "step": 189050 + }, + { + "entropy": 1.8834017142653465, + "epoch": 0.5860694581895426, + "grad_norm": 11.105561256408691, + "learning_rate": 3.304602927719719e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8533930122852326, + "num_tokens": 227447772.0, + "step": 189060 + }, + { + "entropy": 1.8341946393251418, + "epoch": 0.5861004573145923, + "grad_norm": 8.324958801269531, + "learning_rate": 3.3045155351110786e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.859440878033638, + "num_tokens": 227459616.0, + "step": 189070 + }, + { + "entropy": 1.9266793727874756, + "epoch": 0.586131456439642, + "grad_norm": 8.78128719329834, + "learning_rate": 3.304428149435554e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8393941059708595, + "num_tokens": 227470147.0, + "step": 189080 + }, + { + "entropy": 1.8669157862663268, + "epoch": 0.5861624555646917, + "grad_norm": 3.7182395458221436, + "learning_rate": 3.304340770692227e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8524700611829757, + "num_tokens": 227481869.0, + "step": 189090 + }, + { + "entropy": 1.8480639606714249, + "epoch": 0.5861934546897414, + "grad_norm": 7.5054168701171875, + "learning_rate": 3.3042533988801816e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8661205500364304, + "num_tokens": 227493852.0, + "step": 189100 + }, + { + "entropy": 1.8628501027822495, + "epoch": 0.586224453814791, + "grad_norm": 3.835216522216797, + "learning_rate": 3.304166033998502e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8646493136882782, + "num_tokens": 227505056.0, + "step": 189110 + }, + { + "entropy": 1.8842681765556335, + "epoch": 0.5862554529398408, + "grad_norm": 4.247763633728027, + "learning_rate": 3.304078676046273e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.854621236026287, + "num_tokens": 227516359.0, + "step": 189120 + }, + { + "entropy": 1.8450437039136887, + "epoch": 0.5862864520648905, + "grad_norm": 7.098427772521973, + "learning_rate": 3.303991325022576e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8628740802407264, + "num_tokens": 227528633.0, + "step": 189130 + }, + { + "entropy": 1.8995537489652634, + "epoch": 0.5863174511899402, + "grad_norm": 9.48597526550293, + "learning_rate": 3.3039039809264976e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8529835850000381, + "num_tokens": 227539740.0, + "step": 189140 + }, + { + "entropy": 1.8391615077853203, + "epoch": 0.5863484503149898, + "grad_norm": 8.266154289245605, + "learning_rate": 3.303816643757121e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8522128477692604, + "num_tokens": 227552532.0, + "step": 189150 + }, + { + "entropy": 1.8922418750822545, + "epoch": 0.5863794494400395, + "grad_norm": 8.158439636230469, + "learning_rate": 3.30372931351353e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8632095634937287, + "num_tokens": 227564255.0, + "step": 189160 + }, + { + "entropy": 1.9275053888559341, + "epoch": 0.5864104485650893, + "grad_norm": 9.102051734924316, + "learning_rate": 3.3036419901948117e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.851666758954525, + "num_tokens": 227574967.0, + "step": 189170 + }, + { + "entropy": 1.8686299338936805, + "epoch": 0.586441447690139, + "grad_norm": 7.848930835723877, + "learning_rate": 3.303554673800049e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8474835962057113, + "num_tokens": 227587486.0, + "step": 189180 + }, + { + "entropy": 1.9148722842335701, + "epoch": 0.5864724468151886, + "grad_norm": 7.397693157196045, + "learning_rate": 3.3034673643283273e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8615544393658638, + "num_tokens": 227598667.0, + "step": 189190 + }, + { + "entropy": 1.8950233474373817, + "epoch": 0.5865034459402383, + "grad_norm": 6.918297290802002, + "learning_rate": 3.3033800617787316e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8527185708284378, + "num_tokens": 227610380.0, + "step": 189200 + }, + { + "entropy": 1.9290294289588927, + "epoch": 0.5865344450652881, + "grad_norm": 3.6919219493865967, + "learning_rate": 3.303292766150348e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8398040190339089, + "num_tokens": 227621658.0, + "step": 189210 + }, + { + "entropy": 1.7984000250697136, + "epoch": 0.5865654441903377, + "grad_norm": 7.641847133636475, + "learning_rate": 3.303205477442261e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8632738545536995, + "num_tokens": 227634019.0, + "step": 189220 + }, + { + "entropy": 1.8095714911818503, + "epoch": 0.5865964433153874, + "grad_norm": 8.667887687683105, + "learning_rate": 3.303118195653558e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8591258674860001, + "num_tokens": 227648132.0, + "step": 189230 + }, + { + "entropy": 1.8297943592071533, + "epoch": 0.5866274424404371, + "grad_norm": 9.270956993103027, + "learning_rate": 3.3030309207833233e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8597560048103332, + "num_tokens": 227660944.0, + "step": 189240 + }, + { + "entropy": 1.870998741686344, + "epoch": 0.5866584415654869, + "grad_norm": 9.251309394836426, + "learning_rate": 3.3029436528306434e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.853928117454052, + "num_tokens": 227673235.0, + "step": 189250 + }, + { + "entropy": 1.8127895906567573, + "epoch": 0.5866894406905365, + "grad_norm": 8.351231575012207, + "learning_rate": 3.302856391794605e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8543845057487488, + "num_tokens": 227686452.0, + "step": 189260 + }, + { + "entropy": 1.7898150980472565, + "epoch": 0.5867204398155862, + "grad_norm": 2.87815523147583, + "learning_rate": 3.3027691376742943e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8682875841856003, + "num_tokens": 227699484.0, + "step": 189270 + }, + { + "entropy": 1.961062216758728, + "epoch": 0.5867514389406359, + "grad_norm": 4.274531364440918, + "learning_rate": 3.3026818904687975e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8424155503511429, + "num_tokens": 227710193.0, + "step": 189280 + }, + { + "entropy": 1.8331158310174942, + "epoch": 0.5867824380656856, + "grad_norm": 7.7590012550354, + "learning_rate": 3.3025946501772012e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8739141911268234, + "num_tokens": 227722944.0, + "step": 189290 + }, + { + "entropy": 1.8074917957186698, + "epoch": 0.5868134371907353, + "grad_norm": 2.69134259223938, + "learning_rate": 3.3025074167985928e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8565651878714562, + "num_tokens": 227735625.0, + "step": 189300 + }, + { + "entropy": 1.8668998405337334, + "epoch": 0.586844436315785, + "grad_norm": 8.62458610534668, + "learning_rate": 3.3024201903320586e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8737411797046661, + "num_tokens": 227746528.0, + "step": 189310 + }, + { + "entropy": 1.785508194565773, + "epoch": 0.5868754354408346, + "grad_norm": 8.008922576904297, + "learning_rate": 3.3023329707766865e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8677184686064721, + "num_tokens": 227759227.0, + "step": 189320 + }, + { + "entropy": 1.8027926161885262, + "epoch": 0.5869064345658844, + "grad_norm": 6.850667953491211, + "learning_rate": 3.3022457581315642e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8741619393229485, + "num_tokens": 227772084.0, + "step": 189330 + }, + { + "entropy": 1.7575479179620743, + "epoch": 0.5869374336909341, + "grad_norm": 3.98160719871521, + "learning_rate": 3.302158552395779e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8698368772864342, + "num_tokens": 227785177.0, + "step": 189340 + }, + { + "entropy": 1.8547125473618506, + "epoch": 0.5869684328159838, + "grad_norm": 9.128921508789062, + "learning_rate": 3.302071353568418e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8492949277162551, + "num_tokens": 227797306.0, + "step": 189350 + }, + { + "entropy": 1.8572920322418214, + "epoch": 0.5869994319410334, + "grad_norm": 8.80993938446045, + "learning_rate": 3.3019841616485705e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8457432359457016, + "num_tokens": 227809558.0, + "step": 189360 + }, + { + "entropy": 1.8914913788437844, + "epoch": 0.5870304310660832, + "grad_norm": 8.738089561462402, + "learning_rate": 3.3018969766353227e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8606926470994949, + "num_tokens": 227820941.0, + "step": 189370 + }, + { + "entropy": 1.917687328159809, + "epoch": 0.5870614301911329, + "grad_norm": 7.077282428741455, + "learning_rate": 3.3018097985277643e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8651439309120178, + "num_tokens": 227832317.0, + "step": 189380 + }, + { + "entropy": 1.8793218448758124, + "epoch": 0.5870924293161826, + "grad_norm": 9.505925178527832, + "learning_rate": 3.301722627324983e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8499870494008064, + "num_tokens": 227844172.0, + "step": 189390 + }, + { + "entropy": 1.9014255598187446, + "epoch": 0.5871234284412322, + "grad_norm": 8.800775527954102, + "learning_rate": 3.3016354630260677e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8577530473470688, + "num_tokens": 227855598.0, + "step": 189400 + }, + { + "entropy": 1.9350775092840196, + "epoch": 0.5871544275662819, + "grad_norm": 8.164290428161621, + "learning_rate": 3.3015483056301072e-06, + "loss": 0.5224, + "mean_token_accuracy": 0.8480610802769661, + "num_tokens": 227866126.0, + "step": 189410 + }, + { + "entropy": 1.8243651062250137, + "epoch": 0.5871854266913317, + "grad_norm": 11.792317390441895, + "learning_rate": 3.3014611551361896e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8601853996515274, + "num_tokens": 227878353.0, + "step": 189420 + }, + { + "entropy": 1.8115493163466454, + "epoch": 0.5872164258163813, + "grad_norm": 2.0419137477874756, + "learning_rate": 3.3013740115434056e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8694847837090492, + "num_tokens": 227891678.0, + "step": 189430 + }, + { + "entropy": 1.808267669379711, + "epoch": 0.587247424941431, + "grad_norm": 7.618503570556641, + "learning_rate": 3.301286874850844e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8591461583971978, + "num_tokens": 227903837.0, + "step": 189440 + }, + { + "entropy": 1.875668992102146, + "epoch": 0.5872784240664807, + "grad_norm": 8.09911060333252, + "learning_rate": 3.301199745057593e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8624072283506393, + "num_tokens": 227915966.0, + "step": 189450 + }, + { + "entropy": 1.9233120754361153, + "epoch": 0.5873094231915305, + "grad_norm": 8.465187072753906, + "learning_rate": 3.3011126221627424e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8484439864754677, + "num_tokens": 227927615.0, + "step": 189460 + }, + { + "entropy": 1.8734351724386216, + "epoch": 0.5873404223165801, + "grad_norm": 8.21156120300293, + "learning_rate": 3.301025506165383e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8689304769039154, + "num_tokens": 227939154.0, + "step": 189470 + }, + { + "entropy": 1.802188740670681, + "epoch": 0.5873714214416298, + "grad_norm": 8.768073081970215, + "learning_rate": 3.3009383970646043e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8656351312994957, + "num_tokens": 227951955.0, + "step": 189480 + }, + { + "entropy": 1.8910615742206573, + "epoch": 0.5874024205666795, + "grad_norm": 8.669722557067871, + "learning_rate": 3.3008512948594968e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8541586175560951, + "num_tokens": 227963163.0, + "step": 189490 + }, + { + "entropy": 1.8721488162875175, + "epoch": 0.5874334196917292, + "grad_norm": 2.82273530960083, + "learning_rate": 3.300764199549149e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8580455660820008, + "num_tokens": 227975840.0, + "step": 189500 + }, + { + "entropy": 1.8127721384167672, + "epoch": 0.5874644188167789, + "grad_norm": 7.7262115478515625, + "learning_rate": 3.300677111132654e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8592531457543373, + "num_tokens": 227988830.0, + "step": 189510 + }, + { + "entropy": 1.9156973659992218, + "epoch": 0.5874954179418286, + "grad_norm": 9.007680892944336, + "learning_rate": 3.3005900296091005e-06, + "loss": 0.502, + "mean_token_accuracy": 0.84843979626894, + "num_tokens": 227999884.0, + "step": 189520 + }, + { + "entropy": 1.8657537907361985, + "epoch": 0.5875264170668782, + "grad_norm": 7.236568450927734, + "learning_rate": 3.3005029549775797e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8456703305244446, + "num_tokens": 228012514.0, + "step": 189530 + }, + { + "entropy": 1.8161449700593948, + "epoch": 0.587557416191928, + "grad_norm": 6.984272480010986, + "learning_rate": 3.3004158872371827e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8517293512821198, + "num_tokens": 228025517.0, + "step": 189540 + }, + { + "entropy": 1.8823605373501777, + "epoch": 0.5875884153169777, + "grad_norm": 8.093108177185059, + "learning_rate": 3.300328826387e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.85855952501297, + "num_tokens": 228037560.0, + "step": 189550 + }, + { + "entropy": 1.9040069863200189, + "epoch": 0.5876194144420274, + "grad_norm": 7.7805399894714355, + "learning_rate": 3.300241772426124e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8509999126195907, + "num_tokens": 228049640.0, + "step": 189560 + }, + { + "entropy": 1.828610323369503, + "epoch": 0.587650413567077, + "grad_norm": 7.373439788818359, + "learning_rate": 3.3001547253536457e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8533773705363273, + "num_tokens": 228062020.0, + "step": 189570 + }, + { + "entropy": 1.8179884567856788, + "epoch": 0.5876814126921268, + "grad_norm": 6.4519782066345215, + "learning_rate": 3.3000676851686556e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8558628976345062, + "num_tokens": 228073965.0, + "step": 189580 + }, + { + "entropy": 1.9012035757303238, + "epoch": 0.5877124118171765, + "grad_norm": 7.507246017456055, + "learning_rate": 3.2999806518702484e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8569813266396522, + "num_tokens": 228084587.0, + "step": 189590 + }, + { + "entropy": 1.8052480682730674, + "epoch": 0.5877434109422262, + "grad_norm": 8.202247619628906, + "learning_rate": 3.2998936254575126e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8647924482822418, + "num_tokens": 228097593.0, + "step": 189600 + }, + { + "entropy": 1.8566293939948082, + "epoch": 0.5877744100672758, + "grad_norm": 8.915870666503906, + "learning_rate": 3.299806605929542e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8599877521395684, + "num_tokens": 228109710.0, + "step": 189610 + }, + { + "entropy": 1.876941440999508, + "epoch": 0.5878054091923256, + "grad_norm": 7.945460796356201, + "learning_rate": 3.2997195932854287e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8425981119275093, + "num_tokens": 228121176.0, + "step": 189620 + }, + { + "entropy": 1.8251575097441672, + "epoch": 0.5878364083173753, + "grad_norm": 7.691454887390137, + "learning_rate": 3.2996325875242646e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.859329092502594, + "num_tokens": 228133888.0, + "step": 189630 + }, + { + "entropy": 1.7917707473039628, + "epoch": 0.5878674074424249, + "grad_norm": 7.17902946472168, + "learning_rate": 3.299545588645143e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8672048881649971, + "num_tokens": 228146650.0, + "step": 189640 + }, + { + "entropy": 1.897452747821808, + "epoch": 0.5878984065674746, + "grad_norm": 9.511686325073242, + "learning_rate": 3.299458596647157e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8457076713442803, + "num_tokens": 228158498.0, + "step": 189650 + }, + { + "entropy": 1.8302209421992301, + "epoch": 0.5879294056925243, + "grad_norm": 7.85939359664917, + "learning_rate": 3.2993716115293996e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8569305628538132, + "num_tokens": 228171347.0, + "step": 189660 + }, + { + "entropy": 1.8603603541851044, + "epoch": 0.5879604048175741, + "grad_norm": 8.279330253601074, + "learning_rate": 3.299284633290962e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.850348000228405, + "num_tokens": 228183313.0, + "step": 189670 + }, + { + "entropy": 1.8520802944898604, + "epoch": 0.5879914039426237, + "grad_norm": 6.35352897644043, + "learning_rate": 3.299197661930939e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8683344289660454, + "num_tokens": 228195055.0, + "step": 189680 + }, + { + "entropy": 1.8788090363144874, + "epoch": 0.5880224030676734, + "grad_norm": 9.007162094116211, + "learning_rate": 3.2991106974484244e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8503051429986954, + "num_tokens": 228206727.0, + "step": 189690 + }, + { + "entropy": 1.9124977201223374, + "epoch": 0.5880534021927231, + "grad_norm": 7.327961444854736, + "learning_rate": 3.299023739842512e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8595570877194405, + "num_tokens": 228218308.0, + "step": 189700 + }, + { + "entropy": 1.8749097779393196, + "epoch": 0.5880844013177728, + "grad_norm": 8.713927268981934, + "learning_rate": 3.2989367891122944e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8532299667596817, + "num_tokens": 228229913.0, + "step": 189710 + }, + { + "entropy": 1.7920349642634392, + "epoch": 0.5881154004428225, + "grad_norm": 4.948719024658203, + "learning_rate": 3.2988498452568652e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.869075883924961, + "num_tokens": 228242445.0, + "step": 189720 + }, + { + "entropy": 1.890162006020546, + "epoch": 0.5881463995678722, + "grad_norm": 7.9683356285095215, + "learning_rate": 3.29876290827532e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8604482188820839, + "num_tokens": 228253723.0, + "step": 189730 + }, + { + "entropy": 1.8672292068600655, + "epoch": 0.5881773986929218, + "grad_norm": 3.6988465785980225, + "learning_rate": 3.298675978166752e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8574666917324066, + "num_tokens": 228265723.0, + "step": 189740 + }, + { + "entropy": 1.9488049060106278, + "epoch": 0.5882083978179716, + "grad_norm": 9.950399398803711, + "learning_rate": 3.298589054930257e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.8396835044026375, + "num_tokens": 228276302.0, + "step": 189750 + }, + { + "entropy": 1.923870074748993, + "epoch": 0.5882393969430213, + "grad_norm": 10.66832160949707, + "learning_rate": 3.298502138564928e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.8430263265967369, + "num_tokens": 228287766.0, + "step": 189760 + }, + { + "entropy": 1.8855630785226822, + "epoch": 0.588270396068071, + "grad_norm": 8.5703125, + "learning_rate": 3.298415229069861e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8629995420575142, + "num_tokens": 228298917.0, + "step": 189770 + }, + { + "entropy": 1.9425514459609985, + "epoch": 0.5883013951931206, + "grad_norm": 8.09411907196045, + "learning_rate": 3.29832832644415e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8434894353151321, + "num_tokens": 228310478.0, + "step": 189780 + }, + { + "entropy": 1.860796995460987, + "epoch": 0.5883323943181704, + "grad_norm": 5.034776210784912, + "learning_rate": 3.2982414306868905e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8519127145409584, + "num_tokens": 228322635.0, + "step": 189790 + }, + { + "entropy": 1.9245415329933167, + "epoch": 0.5883633934432201, + "grad_norm": 3.561757802963257, + "learning_rate": 3.2981545417971785e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8493681281805039, + "num_tokens": 228334274.0, + "step": 189800 + }, + { + "entropy": 1.9446907877922057, + "epoch": 0.5883943925682698, + "grad_norm": 6.972831726074219, + "learning_rate": 3.2980676597741084e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8560405924916268, + "num_tokens": 228345048.0, + "step": 189810 + }, + { + "entropy": 1.8980532869696618, + "epoch": 0.5884253916933194, + "grad_norm": 7.752938747406006, + "learning_rate": 3.297980784616776e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.861751252412796, + "num_tokens": 228356615.0, + "step": 189820 + }, + { + "entropy": 1.8524247661232949, + "epoch": 0.5884563908183692, + "grad_norm": 3.781128168106079, + "learning_rate": 3.297893916324278e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8605551645159721, + "num_tokens": 228368712.0, + "step": 189830 + }, + { + "entropy": 1.7883019983768462, + "epoch": 0.5884873899434189, + "grad_norm": 7.834524154663086, + "learning_rate": 3.2978070548957085e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8655453786253929, + "num_tokens": 228381568.0, + "step": 189840 + }, + { + "entropy": 1.8673112966120242, + "epoch": 0.5885183890684685, + "grad_norm": 7.856258869171143, + "learning_rate": 3.297720200330167e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8590942695736885, + "num_tokens": 228393610.0, + "step": 189850 + }, + { + "entropy": 1.934214359521866, + "epoch": 0.5885493881935182, + "grad_norm": 8.630729675292969, + "learning_rate": 3.297633352626745e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.8391479954123497, + "num_tokens": 228404573.0, + "step": 189860 + }, + { + "entropy": 1.8712112307548523, + "epoch": 0.588580387318568, + "grad_norm": 7.566702842712402, + "learning_rate": 3.2975465117845427e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8533918172121048, + "num_tokens": 228416198.0, + "step": 189870 + }, + { + "entropy": 1.8967519655823708, + "epoch": 0.5886113864436177, + "grad_norm": 6.956945896148682, + "learning_rate": 3.2974596778026564e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8500938445329667, + "num_tokens": 228428108.0, + "step": 189880 + }, + { + "entropy": 1.8681173652410508, + "epoch": 0.5886423855686673, + "grad_norm": 8.703429222106934, + "learning_rate": 3.2973728506801805e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8636728703975678, + "num_tokens": 228439922.0, + "step": 189890 + }, + { + "entropy": 1.87923151999712, + "epoch": 0.588673384693717, + "grad_norm": 8.526222229003906, + "learning_rate": 3.297286030416215e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8607040166854858, + "num_tokens": 228451960.0, + "step": 189900 + }, + { + "entropy": 1.8602784745395184, + "epoch": 0.5887043838187667, + "grad_norm": 8.467865943908691, + "learning_rate": 3.2971992170098548e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8517221093177796, + "num_tokens": 228464279.0, + "step": 189910 + }, + { + "entropy": 1.7735490411520005, + "epoch": 0.5887353829438164, + "grad_norm": 7.633058547973633, + "learning_rate": 3.2971124104601977e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8725137710571289, + "num_tokens": 228477690.0, + "step": 189920 + }, + { + "entropy": 1.926728144288063, + "epoch": 0.5887663820688661, + "grad_norm": 8.148148536682129, + "learning_rate": 3.297025610766342e-06, + "loss": 0.5202, + "mean_token_accuracy": 0.8435298100113868, + "num_tokens": 228488250.0, + "step": 189930 + }, + { + "entropy": 1.8616325289011002, + "epoch": 0.5887973811939158, + "grad_norm": 3.8072237968444824, + "learning_rate": 3.2969388179273836e-06, + "loss": 0.5177, + "mean_token_accuracy": 0.8439561665058136, + "num_tokens": 228501099.0, + "step": 189940 + }, + { + "entropy": 1.9076742500066757, + "epoch": 0.5888283803189654, + "grad_norm": 7.454319477081299, + "learning_rate": 3.2968520319424223e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8690858289599419, + "num_tokens": 228512982.0, + "step": 189950 + }, + { + "entropy": 1.8784878611564637, + "epoch": 0.5888593794440152, + "grad_norm": 7.8391032218933105, + "learning_rate": 3.2967652528105547e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8554589852690697, + "num_tokens": 228524707.0, + "step": 189960 + }, + { + "entropy": 1.8729647427797318, + "epoch": 0.5888903785690649, + "grad_norm": 8.286604881286621, + "learning_rate": 3.29667848053088e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8640352576971054, + "num_tokens": 228537429.0, + "step": 189970 + }, + { + "entropy": 1.8447729453444481, + "epoch": 0.5889213776941146, + "grad_norm": 8.21757698059082, + "learning_rate": 3.2965917151024953e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8646952509880066, + "num_tokens": 228549779.0, + "step": 189980 + }, + { + "entropy": 1.8539592325687408, + "epoch": 0.5889523768191642, + "grad_norm": 8.536675453186035, + "learning_rate": 3.2965049565244996e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8582399547100067, + "num_tokens": 228561851.0, + "step": 189990 + }, + { + "entropy": 1.862364935874939, + "epoch": 0.588983375944214, + "grad_norm": 7.077996253967285, + "learning_rate": 3.2964182047959915e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8641496703028679, + "num_tokens": 228573974.0, + "step": 190000 + }, + { + "entropy": 1.8656501933932303, + "epoch": 0.5890143750692637, + "grad_norm": 8.515663146972656, + "learning_rate": 3.29633145991607e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.865130890905857, + "num_tokens": 228586362.0, + "step": 190010 + }, + { + "entropy": 1.7867624640464783, + "epoch": 0.5890453741943134, + "grad_norm": 4.023524284362793, + "learning_rate": 3.2962447218838334e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8638389229774475, + "num_tokens": 228598677.0, + "step": 190020 + }, + { + "entropy": 1.9330232799053193, + "epoch": 0.589076373319363, + "grad_norm": 7.86823844909668, + "learning_rate": 3.2961579906983814e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8452364414930343, + "num_tokens": 228609772.0, + "step": 190030 + }, + { + "entropy": 1.8227289929986, + "epoch": 0.5891073724444128, + "grad_norm": 7.341483116149902, + "learning_rate": 3.2960712663588136e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8582392573356629, + "num_tokens": 228622421.0, + "step": 190040 + }, + { + "entropy": 1.8125467792153358, + "epoch": 0.5891383715694625, + "grad_norm": 2.910619020462036, + "learning_rate": 3.2959845488642283e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8680084839463234, + "num_tokens": 228635235.0, + "step": 190050 + }, + { + "entropy": 1.8080111667513847, + "epoch": 0.5891693706945121, + "grad_norm": 3.3258447647094727, + "learning_rate": 3.2958978382137264e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8626864954829216, + "num_tokens": 228648296.0, + "step": 190060 + }, + { + "entropy": 1.8418749034404756, + "epoch": 0.5892003698195618, + "grad_norm": 9.099149703979492, + "learning_rate": 3.295811134406407e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8486756533384323, + "num_tokens": 228660808.0, + "step": 190070 + }, + { + "entropy": 1.9027037620544434, + "epoch": 0.5892313689446116, + "grad_norm": 3.7178332805633545, + "learning_rate": 3.29572443744137e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8494066312909127, + "num_tokens": 228672586.0, + "step": 190080 + }, + { + "entropy": 1.8233142986893653, + "epoch": 0.5892623680696613, + "grad_norm": 7.8997721672058105, + "learning_rate": 3.295637747317715e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8714895188808441, + "num_tokens": 228684573.0, + "step": 190090 + }, + { + "entropy": 1.8038986414670943, + "epoch": 0.5892933671947109, + "grad_norm": 7.1145548820495605, + "learning_rate": 3.2955510640345435e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8632494986057282, + "num_tokens": 228697371.0, + "step": 190100 + }, + { + "entropy": 1.8754192054271699, + "epoch": 0.5893243663197606, + "grad_norm": 8.730184555053711, + "learning_rate": 3.295464387590955e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8609462678432465, + "num_tokens": 228709207.0, + "step": 190110 + }, + { + "entropy": 1.7887951895594596, + "epoch": 0.5893553654448104, + "grad_norm": 9.219457626342773, + "learning_rate": 3.2953777179860507e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8643956810235978, + "num_tokens": 228722841.0, + "step": 190120 + }, + { + "entropy": 1.856288492679596, + "epoch": 0.58938636456986, + "grad_norm": 9.556690216064453, + "learning_rate": 3.295291055218931e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8619930043816566, + "num_tokens": 228735293.0, + "step": 190130 + }, + { + "entropy": 1.8539328455924988, + "epoch": 0.5894173636949097, + "grad_norm": 9.75001049041748, + "learning_rate": 3.2952043992886974e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8486363276839256, + "num_tokens": 228747448.0, + "step": 190140 + }, + { + "entropy": 1.8937363922595978, + "epoch": 0.5894483628199594, + "grad_norm": 8.638582229614258, + "learning_rate": 3.2951177501944498e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8536964386701584, + "num_tokens": 228758391.0, + "step": 190150 + }, + { + "entropy": 1.8685229420661926, + "epoch": 0.589479361945009, + "grad_norm": 7.673697471618652, + "learning_rate": 3.2950311079352905e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8545270696282387, + "num_tokens": 228769201.0, + "step": 190160 + }, + { + "entropy": 1.7864877060055733, + "epoch": 0.5895103610700588, + "grad_norm": 4.983564853668213, + "learning_rate": 3.294944472510321e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8641117364168167, + "num_tokens": 228782125.0, + "step": 190170 + }, + { + "entropy": 1.9448559939861298, + "epoch": 0.5895413601951085, + "grad_norm": 6.925676345825195, + "learning_rate": 3.294857843918642e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8519876688718796, + "num_tokens": 228793264.0, + "step": 190180 + }, + { + "entropy": 1.8454899474978448, + "epoch": 0.5895723593201582, + "grad_norm": 8.729484558105469, + "learning_rate": 3.294771222159356e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8515693292021751, + "num_tokens": 228805524.0, + "step": 190190 + }, + { + "entropy": 1.9692073345184327, + "epoch": 0.5896033584452078, + "grad_norm": 8.359795570373535, + "learning_rate": 3.2946846072315646e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8440068036317825, + "num_tokens": 228816416.0, + "step": 190200 + }, + { + "entropy": 1.894757117331028, + "epoch": 0.5896343575702576, + "grad_norm": 7.804634094238281, + "learning_rate": 3.2945979991343706e-06, + "loss": 0.5185, + "mean_token_accuracy": 0.8510986045002937, + "num_tokens": 228829075.0, + "step": 190210 + }, + { + "entropy": 1.860536876320839, + "epoch": 0.5896653566953073, + "grad_norm": 2.497119426727295, + "learning_rate": 3.2945113978668752e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8606500387191772, + "num_tokens": 228841126.0, + "step": 190220 + }, + { + "entropy": 1.8744609951972961, + "epoch": 0.589696355820357, + "grad_norm": 8.66141128540039, + "learning_rate": 3.294424803428181e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.855774176120758, + "num_tokens": 228853330.0, + "step": 190230 + }, + { + "entropy": 1.9277845978736878, + "epoch": 0.5897273549454066, + "grad_norm": 7.061468124389648, + "learning_rate": 3.2943382158173916e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.858029393851757, + "num_tokens": 228864546.0, + "step": 190240 + }, + { + "entropy": 1.8712523072957992, + "epoch": 0.5897583540704564, + "grad_norm": 7.561338424682617, + "learning_rate": 3.2942516350336085e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8526345446705819, + "num_tokens": 228876361.0, + "step": 190250 + }, + { + "entropy": 1.8795302122831345, + "epoch": 0.5897893531955061, + "grad_norm": 4.068550109863281, + "learning_rate": 3.2941650610759356e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8507021918892861, + "num_tokens": 228888145.0, + "step": 190260 + }, + { + "entropy": 1.8870377585291862, + "epoch": 0.5898203523205557, + "grad_norm": 7.460421085357666, + "learning_rate": 3.2940784939434754e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8494185790419578, + "num_tokens": 228900354.0, + "step": 190270 + }, + { + "entropy": 1.9352505058050156, + "epoch": 0.5898513514456054, + "grad_norm": 7.420712947845459, + "learning_rate": 3.2939919336353315e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8464471936225891, + "num_tokens": 228911066.0, + "step": 190280 + }, + { + "entropy": 1.8705057039856912, + "epoch": 0.5898823505706552, + "grad_norm": 6.3799028396606445, + "learning_rate": 3.293905380150607e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8655120372772217, + "num_tokens": 228922702.0, + "step": 190290 + }, + { + "entropy": 1.8629606902599334, + "epoch": 0.5899133496957049, + "grad_norm": 9.938080787658691, + "learning_rate": 3.2938188334884057e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8527127936482429, + "num_tokens": 228934867.0, + "step": 190300 + }, + { + "entropy": 1.783542599529028, + "epoch": 0.5899443488207545, + "grad_norm": 7.353618621826172, + "learning_rate": 3.293732293647831e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8648853242397309, + "num_tokens": 228947899.0, + "step": 190310 + }, + { + "entropy": 1.891326430439949, + "epoch": 0.5899753479458042, + "grad_norm": 9.56852912902832, + "learning_rate": 3.293645760627988e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8535473197698593, + "num_tokens": 228959022.0, + "step": 190320 + }, + { + "entropy": 1.8293341875076294, + "epoch": 0.590006347070854, + "grad_norm": 9.369425773620605, + "learning_rate": 3.293559234427979e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8616625502705574, + "num_tokens": 228971424.0, + "step": 190330 + }, + { + "entropy": 1.8950300842523575, + "epoch": 0.5900373461959036, + "grad_norm": 8.347681045532227, + "learning_rate": 3.2934727150469092e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8472575053572655, + "num_tokens": 228982796.0, + "step": 190340 + }, + { + "entropy": 1.890362760424614, + "epoch": 0.5900683453209533, + "grad_norm": 3.295876979827881, + "learning_rate": 3.2933862024838826e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.852965684235096, + "num_tokens": 228994264.0, + "step": 190350 + }, + { + "entropy": 1.8526059299707414, + "epoch": 0.590099344446003, + "grad_norm": 3.781344175338745, + "learning_rate": 3.293299696738004e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.847343610227108, + "num_tokens": 229006040.0, + "step": 190360 + }, + { + "entropy": 1.8531452476978303, + "epoch": 0.5901303435710528, + "grad_norm": 8.642162322998047, + "learning_rate": 3.29321319780838e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8518374875187874, + "num_tokens": 229018075.0, + "step": 190370 + }, + { + "entropy": 1.8752460539340974, + "epoch": 0.5901613426961024, + "grad_norm": 3.7689857482910156, + "learning_rate": 3.293126705694112e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8554120779037475, + "num_tokens": 229029496.0, + "step": 190380 + }, + { + "entropy": 1.8158239707350732, + "epoch": 0.5901923418211521, + "grad_norm": 8.14268970489502, + "learning_rate": 3.293040220394307e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8686467096209526, + "num_tokens": 229041879.0, + "step": 190390 + }, + { + "entropy": 1.8639475405216217, + "epoch": 0.5902233409462018, + "grad_norm": 7.9564642906188965, + "learning_rate": 3.29295374190807e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8520817801356315, + "num_tokens": 229053955.0, + "step": 190400 + }, + { + "entropy": 1.8774761408567429, + "epoch": 0.5902543400712514, + "grad_norm": 7.807625770568848, + "learning_rate": 3.2928672702345065e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8621849209070206, + "num_tokens": 229065976.0, + "step": 190410 + }, + { + "entropy": 1.933225655555725, + "epoch": 0.5902853391963012, + "grad_norm": 8.07839298248291, + "learning_rate": 3.292780805372722e-06, + "loss": 0.5244, + "mean_token_accuracy": 0.8469988331198692, + "num_tokens": 229076710.0, + "step": 190420 + }, + { + "entropy": 1.8818949446082116, + "epoch": 0.5903163383213509, + "grad_norm": 4.203948020935059, + "learning_rate": 3.2926943473218215e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8559348538517952, + "num_tokens": 229088535.0, + "step": 190430 + }, + { + "entropy": 1.8088610842823982, + "epoch": 0.5903473374464006, + "grad_norm": 8.226593971252441, + "learning_rate": 3.2926078960809123e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8699506267905235, + "num_tokens": 229101332.0, + "step": 190440 + }, + { + "entropy": 1.8730618119239808, + "epoch": 0.5903783365714502, + "grad_norm": 7.754554271697998, + "learning_rate": 3.2925214516490994e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8541584327816963, + "num_tokens": 229113342.0, + "step": 190450 + }, + { + "entropy": 1.854330986738205, + "epoch": 0.5904093356965, + "grad_norm": 8.528146743774414, + "learning_rate": 3.2924350140254895e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8653483808040618, + "num_tokens": 229125410.0, + "step": 190460 + }, + { + "entropy": 1.8535527095198632, + "epoch": 0.5904403348215497, + "grad_norm": 7.2377848625183105, + "learning_rate": 3.2923485832091883e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.85265993475914, + "num_tokens": 229137915.0, + "step": 190470 + }, + { + "entropy": 1.8914847001433372, + "epoch": 0.5904713339465993, + "grad_norm": 8.46504020690918, + "learning_rate": 3.2922621591993033e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8679022789001465, + "num_tokens": 229149136.0, + "step": 190480 + }, + { + "entropy": 1.842303329706192, + "epoch": 0.590502333071649, + "grad_norm": 2.873488664627075, + "learning_rate": 3.2921757419949406e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8661413252353668, + "num_tokens": 229161955.0, + "step": 190490 + }, + { + "entropy": 1.9058812111616135, + "epoch": 0.5905333321966988, + "grad_norm": 7.14345645904541, + "learning_rate": 3.2920893315952072e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8542837470769882, + "num_tokens": 229172882.0, + "step": 190500 + }, + { + "entropy": 1.8523188918828963, + "epoch": 0.5905643313217485, + "grad_norm": 3.61691951751709, + "learning_rate": 3.2920029279992097e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8538831368088722, + "num_tokens": 229185438.0, + "step": 190510 + }, + { + "entropy": 1.9006501093506813, + "epoch": 0.5905953304467981, + "grad_norm": 7.883707523345947, + "learning_rate": 3.291916531206057e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8513767853379249, + "num_tokens": 229197140.0, + "step": 190520 + }, + { + "entropy": 1.7014507532119751, + "epoch": 0.5906263295718478, + "grad_norm": 7.430270671844482, + "learning_rate": 3.2918301412148545e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8773944795131683, + "num_tokens": 229210947.0, + "step": 190530 + }, + { + "entropy": 1.8675128430128098, + "epoch": 0.5906573286968976, + "grad_norm": 7.121996879577637, + "learning_rate": 3.2917437580247104e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8630870550870895, + "num_tokens": 229222512.0, + "step": 190540 + }, + { + "entropy": 1.736918868124485, + "epoch": 0.5906883278219472, + "grad_norm": 8.349037170410156, + "learning_rate": 3.291657381634732e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8642535865306854, + "num_tokens": 229236428.0, + "step": 190550 + }, + { + "entropy": 1.8738708287477492, + "epoch": 0.5907193269469969, + "grad_norm": 5.51838493347168, + "learning_rate": 3.291571012044028e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8621172949671745, + "num_tokens": 229248565.0, + "step": 190560 + }, + { + "entropy": 1.9014319479465485, + "epoch": 0.5907503260720466, + "grad_norm": 7.950676918029785, + "learning_rate": 3.2914846492517054e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8562928780913353, + "num_tokens": 229260100.0, + "step": 190570 + }, + { + "entropy": 1.8517659470438956, + "epoch": 0.5907813251970964, + "grad_norm": 3.7238996028900146, + "learning_rate": 3.2913982932568738e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8641832873225213, + "num_tokens": 229272556.0, + "step": 190580 + }, + { + "entropy": 1.8479650676250459, + "epoch": 0.590812324322146, + "grad_norm": 3.849630117416382, + "learning_rate": 3.2913119440586404e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8724173724651336, + "num_tokens": 229283852.0, + "step": 190590 + }, + { + "entropy": 1.8994674652814865, + "epoch": 0.5908433234471957, + "grad_norm": 9.459546089172363, + "learning_rate": 3.2912256016561138e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8530774980783462, + "num_tokens": 229296603.0, + "step": 190600 + }, + { + "entropy": 1.7782090470194816, + "epoch": 0.5908743225722454, + "grad_norm": 3.8931798934936523, + "learning_rate": 3.2911392660484033e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8701503276824951, + "num_tokens": 229309684.0, + "step": 190610 + }, + { + "entropy": 1.8570687502622605, + "epoch": 0.590905321697295, + "grad_norm": 7.796317100524902, + "learning_rate": 3.291052937234617e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8513424873352051, + "num_tokens": 229321331.0, + "step": 190620 + }, + { + "entropy": 1.8146572425961494, + "epoch": 0.5909363208223448, + "grad_norm": 4.251481056213379, + "learning_rate": 3.290966615213865e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8561153456568718, + "num_tokens": 229334264.0, + "step": 190630 + }, + { + "entropy": 1.7932671368122102, + "epoch": 0.5909673199473945, + "grad_norm": 8.984890937805176, + "learning_rate": 3.2908802999852547e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8709327802062035, + "num_tokens": 229346949.0, + "step": 190640 + }, + { + "entropy": 1.9233350574970245, + "epoch": 0.5909983190724442, + "grad_norm": 8.20529556274414, + "learning_rate": 3.2907939915478963e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8593816369771957, + "num_tokens": 229358034.0, + "step": 190650 + }, + { + "entropy": 1.8050723016262054, + "epoch": 0.5910293181974938, + "grad_norm": 8.480942726135254, + "learning_rate": 3.2907076899009004e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8727429389953614, + "num_tokens": 229370748.0, + "step": 190660 + }, + { + "entropy": 1.8822183892130853, + "epoch": 0.5910603173225436, + "grad_norm": 8.092206954956055, + "learning_rate": 3.2906213950433745e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8524337723851204, + "num_tokens": 229382446.0, + "step": 190670 + }, + { + "entropy": 1.9217692241072655, + "epoch": 0.5910913164475933, + "grad_norm": 9.312432289123535, + "learning_rate": 3.2905351069744307e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8461758613586425, + "num_tokens": 229393634.0, + "step": 190680 + }, + { + "entropy": 1.8857564225792884, + "epoch": 0.5911223155726429, + "grad_norm": 9.134812355041504, + "learning_rate": 3.2904488256931777e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8554666772484779, + "num_tokens": 229406173.0, + "step": 190690 + }, + { + "entropy": 1.8799413040280342, + "epoch": 0.5911533146976926, + "grad_norm": 8.584431648254395, + "learning_rate": 3.2903625511987254e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8597801223397254, + "num_tokens": 229417486.0, + "step": 190700 + }, + { + "entropy": 1.8675891801714897, + "epoch": 0.5911843138227424, + "grad_norm": 7.617621898651123, + "learning_rate": 3.290276283490185e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8503276690840721, + "num_tokens": 229430100.0, + "step": 190710 + }, + { + "entropy": 1.9346825338900089, + "epoch": 0.5912153129477921, + "grad_norm": 10.103340148925781, + "learning_rate": 3.290190022566666e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8546615943312645, + "num_tokens": 229441996.0, + "step": 190720 + }, + { + "entropy": 1.8633850425481797, + "epoch": 0.5912463120728417, + "grad_norm": 8.875619888305664, + "learning_rate": 3.2901037684272796e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8545124664902687, + "num_tokens": 229454182.0, + "step": 190730 + }, + { + "entropy": 1.8230377197265626, + "epoch": 0.5912773111978914, + "grad_norm": 7.591728210449219, + "learning_rate": 3.2900175210711366e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8573815956711769, + "num_tokens": 229465840.0, + "step": 190740 + }, + { + "entropy": 1.8421626284718513, + "epoch": 0.5913083103229412, + "grad_norm": 8.975364685058594, + "learning_rate": 3.2899312804973477e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8599640503525734, + "num_tokens": 229478483.0, + "step": 190750 + }, + { + "entropy": 1.8469863578677177, + "epoch": 0.5913393094479908, + "grad_norm": 7.199525356292725, + "learning_rate": 3.289845046705025e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.860556535422802, + "num_tokens": 229490958.0, + "step": 190760 + }, + { + "entropy": 1.941561996936798, + "epoch": 0.5913703085730405, + "grad_norm": 6.5112080574035645, + "learning_rate": 3.289758819693278e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.855010262131691, + "num_tokens": 229501945.0, + "step": 190770 + }, + { + "entropy": 1.830227905511856, + "epoch": 0.5914013076980902, + "grad_norm": 7.796418190002441, + "learning_rate": 3.28967259946122e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8564648866653443, + "num_tokens": 229514727.0, + "step": 190780 + }, + { + "entropy": 1.856962652504444, + "epoch": 0.59143230682314, + "grad_norm": 8.053735733032227, + "learning_rate": 3.2895863860079615e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8695003658533096, + "num_tokens": 229527296.0, + "step": 190790 + }, + { + "entropy": 1.879108041524887, + "epoch": 0.5914633059481896, + "grad_norm": 8.106398582458496, + "learning_rate": 3.2895001793326137e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8647478729486465, + "num_tokens": 229538967.0, + "step": 190800 + }, + { + "entropy": 1.8307623445987702, + "epoch": 0.5914943050732393, + "grad_norm": 7.531067848205566, + "learning_rate": 3.2894139794342905e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8509333118796348, + "num_tokens": 229551817.0, + "step": 190810 + }, + { + "entropy": 1.871297837793827, + "epoch": 0.591525304198289, + "grad_norm": 7.942951202392578, + "learning_rate": 3.289327786312102e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8627708986401558, + "num_tokens": 229563622.0, + "step": 190820 + }, + { + "entropy": 1.849818505346775, + "epoch": 0.5915563033233388, + "grad_norm": 4.293807506561279, + "learning_rate": 3.2892415999651623e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8529814571142197, + "num_tokens": 229575703.0, + "step": 190830 + }, + { + "entropy": 1.8120758205652236, + "epoch": 0.5915873024483884, + "grad_norm": 4.3140869140625, + "learning_rate": 3.2891554203925823e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8529886394739151, + "num_tokens": 229588734.0, + "step": 190840 + }, + { + "entropy": 1.8439616784453392, + "epoch": 0.5916183015734381, + "grad_norm": 3.7536003589630127, + "learning_rate": 3.289069247593475e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8631413847208023, + "num_tokens": 229601210.0, + "step": 190850 + }, + { + "entropy": 1.8744244769215583, + "epoch": 0.5916493006984878, + "grad_norm": 7.325197696685791, + "learning_rate": 3.2889830815669536e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8581292122602463, + "num_tokens": 229613374.0, + "step": 190860 + }, + { + "entropy": 1.8809070125222207, + "epoch": 0.5916802998235374, + "grad_norm": 4.105676651000977, + "learning_rate": 3.288896922312131e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8529094144701957, + "num_tokens": 229625494.0, + "step": 190870 + }, + { + "entropy": 1.900406464934349, + "epoch": 0.5917112989485872, + "grad_norm": 6.932359218597412, + "learning_rate": 3.2888107698281193e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.842863355576992, + "num_tokens": 229637447.0, + "step": 190880 + }, + { + "entropy": 1.899899123609066, + "epoch": 0.5917422980736369, + "grad_norm": 8.196488380432129, + "learning_rate": 3.288724624114033e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8505870133638382, + "num_tokens": 229648952.0, + "step": 190890 + }, + { + "entropy": 1.8124980226159095, + "epoch": 0.5917732971986865, + "grad_norm": 8.71292495727539, + "learning_rate": 3.2886384851689847e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8594872117042541, + "num_tokens": 229661520.0, + "step": 190900 + }, + { + "entropy": 1.9170294746756553, + "epoch": 0.5918042963237362, + "grad_norm": 8.061601638793945, + "learning_rate": 3.2885523529920883e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8494271531701088, + "num_tokens": 229673586.0, + "step": 190910 + }, + { + "entropy": 1.947436010837555, + "epoch": 0.591835295448786, + "grad_norm": 10.752445220947266, + "learning_rate": 3.2884662275824575e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8480764240026474, + "num_tokens": 229684280.0, + "step": 190920 + }, + { + "entropy": 1.8770150147378444, + "epoch": 0.5918662945738357, + "grad_norm": 7.548092365264893, + "learning_rate": 3.2883801089392058e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8523095726966858, + "num_tokens": 229696570.0, + "step": 190930 + }, + { + "entropy": 1.8843964487314224, + "epoch": 0.5918972936988853, + "grad_norm": 8.857001304626465, + "learning_rate": 3.2882939970614485e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8598011612892151, + "num_tokens": 229708816.0, + "step": 190940 + }, + { + "entropy": 1.8774077758193015, + "epoch": 0.591928292823935, + "grad_norm": 8.422423362731934, + "learning_rate": 3.2882078919482983e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8628498017787933, + "num_tokens": 229720697.0, + "step": 190950 + }, + { + "entropy": 1.8349227026104926, + "epoch": 0.5919592919489848, + "grad_norm": 8.86723518371582, + "learning_rate": 3.28812179359887e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.864958544075489, + "num_tokens": 229733597.0, + "step": 190960 + }, + { + "entropy": 1.8025419846177102, + "epoch": 0.5919902910740344, + "grad_norm": 3.7912731170654297, + "learning_rate": 3.288035702012279e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8578298717737198, + "num_tokens": 229746676.0, + "step": 190970 + }, + { + "entropy": 1.9060227274894714, + "epoch": 0.5920212901990841, + "grad_norm": 9.238999366760254, + "learning_rate": 3.2879496171876383e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8610510796308517, + "num_tokens": 229757868.0, + "step": 190980 + }, + { + "entropy": 1.8276602059602738, + "epoch": 0.5920522893241338, + "grad_norm": 6.9514970779418945, + "learning_rate": 3.287863539124065e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8673168778419494, + "num_tokens": 229770018.0, + "step": 190990 + }, + { + "entropy": 1.8806076273322105, + "epoch": 0.5920832884491836, + "grad_norm": 9.204923629760742, + "learning_rate": 3.287777467820672e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8578602224588394, + "num_tokens": 229781623.0, + "step": 191000 + }, + { + "entropy": 1.8622824847698212, + "epoch": 0.5921142875742332, + "grad_norm": 8.241448402404785, + "learning_rate": 3.2876914032765763e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8582434579730034, + "num_tokens": 229793960.0, + "step": 191010 + }, + { + "entropy": 1.9200577780604362, + "epoch": 0.5921452866992829, + "grad_norm": 8.937591552734375, + "learning_rate": 3.2876053454908915e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8509643584489822, + "num_tokens": 229805175.0, + "step": 191020 + }, + { + "entropy": 1.846642728149891, + "epoch": 0.5921762858243326, + "grad_norm": 9.971576690673828, + "learning_rate": 3.287519294462734e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8499289125204086, + "num_tokens": 229817090.0, + "step": 191030 + }, + { + "entropy": 1.8406170830130577, + "epoch": 0.5922072849493824, + "grad_norm": 3.3949577808380127, + "learning_rate": 3.2874332501912204e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8627133294939995, + "num_tokens": 229828824.0, + "step": 191040 + }, + { + "entropy": 1.7291911885142326, + "epoch": 0.592238284074432, + "grad_norm": 3.691608190536499, + "learning_rate": 3.2873472126754647e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8592000618577004, + "num_tokens": 229843350.0, + "step": 191050 + }, + { + "entropy": 1.806246455013752, + "epoch": 0.5922692831994817, + "grad_norm": 9.855522155761719, + "learning_rate": 3.2872611819145838e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8735313445329667, + "num_tokens": 229855513.0, + "step": 191060 + }, + { + "entropy": 1.9019072502851486, + "epoch": 0.5923002823245314, + "grad_norm": 7.412065029144287, + "learning_rate": 3.2871751579076937e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8572008535265923, + "num_tokens": 229866228.0, + "step": 191070 + }, + { + "entropy": 1.8569483637809754, + "epoch": 0.5923312814495811, + "grad_norm": 4.520204067230225, + "learning_rate": 3.287089140653912e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8527514040470123, + "num_tokens": 229878084.0, + "step": 191080 + }, + { + "entropy": 1.8153726771473884, + "epoch": 0.5923622805746308, + "grad_norm": 3.624915599822998, + "learning_rate": 3.2870031301523534e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8487672790884971, + "num_tokens": 229890969.0, + "step": 191090 + }, + { + "entropy": 1.8336610078811646, + "epoch": 0.5923932796996805, + "grad_norm": 7.657143592834473, + "learning_rate": 3.2869171264021356e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8640960767865181, + "num_tokens": 229902960.0, + "step": 191100 + }, + { + "entropy": 1.843455323576927, + "epoch": 0.5924242788247301, + "grad_norm": 7.258844375610352, + "learning_rate": 3.2868311294023743e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8586787343025207, + "num_tokens": 229915427.0, + "step": 191110 + }, + { + "entropy": 1.871065580844879, + "epoch": 0.5924552779497798, + "grad_norm": 8.253884315490723, + "learning_rate": 3.286745139152187e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8717275485396385, + "num_tokens": 229926864.0, + "step": 191120 + }, + { + "entropy": 1.8746835321187973, + "epoch": 0.5924862770748296, + "grad_norm": 4.194708347320557, + "learning_rate": 3.2866591556506915e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8504206746816635, + "num_tokens": 229938889.0, + "step": 191130 + }, + { + "entropy": 1.9504939645528794, + "epoch": 0.5925172761998793, + "grad_norm": 8.435033798217773, + "learning_rate": 3.2865731788970053e-06, + "loss": 0.5223, + "mean_token_accuracy": 0.8394033655524253, + "num_tokens": 229950278.0, + "step": 191140 + }, + { + "entropy": 1.83246139138937, + "epoch": 0.5925482753249289, + "grad_norm": 8.544919967651367, + "learning_rate": 3.2864872088902443e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8616950988769532, + "num_tokens": 229963221.0, + "step": 191150 + }, + { + "entropy": 1.8219051674008369, + "epoch": 0.5925792744499786, + "grad_norm": 4.832451343536377, + "learning_rate": 3.286401245629527e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8639433071017265, + "num_tokens": 229976076.0, + "step": 191160 + }, + { + "entropy": 1.8256654024124146, + "epoch": 0.5926102735750284, + "grad_norm": 8.011951446533203, + "learning_rate": 3.2863152891139717e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8585153207182884, + "num_tokens": 229988966.0, + "step": 191170 + }, + { + "entropy": 1.925898441672325, + "epoch": 0.592641272700078, + "grad_norm": 7.606298923492432, + "learning_rate": 3.2862293393426953e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8489936038851738, + "num_tokens": 230000081.0, + "step": 191180 + }, + { + "entropy": 1.807943683117628, + "epoch": 0.5926722718251277, + "grad_norm": 7.411771297454834, + "learning_rate": 3.2861433963148164e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8664886385202408, + "num_tokens": 230013462.0, + "step": 191190 + }, + { + "entropy": 1.9161637112498284, + "epoch": 0.5927032709501774, + "grad_norm": 8.881155967712402, + "learning_rate": 3.2860574600294535e-06, + "loss": 0.5271, + "mean_token_accuracy": 0.8394664421677589, + "num_tokens": 230024516.0, + "step": 191200 + }, + { + "entropy": 1.8874651074409485, + "epoch": 0.5927342700752272, + "grad_norm": 3.6384761333465576, + "learning_rate": 3.2859715304857254e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8549251481890678, + "num_tokens": 230036074.0, + "step": 191210 + }, + { + "entropy": 1.9186082750558853, + "epoch": 0.5927652692002768, + "grad_norm": 7.823178768157959, + "learning_rate": 3.285885607682749e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8534322634339333, + "num_tokens": 230047336.0, + "step": 191220 + }, + { + "entropy": 1.87621361464262, + "epoch": 0.5927962683253265, + "grad_norm": 9.075730323791504, + "learning_rate": 3.285799691619645e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8539279267191887, + "num_tokens": 230058593.0, + "step": 191230 + }, + { + "entropy": 1.816857473552227, + "epoch": 0.5928272674503762, + "grad_norm": 8.063228607177734, + "learning_rate": 3.285713782295531e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8562188908457756, + "num_tokens": 230071437.0, + "step": 191240 + }, + { + "entropy": 1.806934006512165, + "epoch": 0.592858266575426, + "grad_norm": 7.675832271575928, + "learning_rate": 3.2856278797095264e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8649509236216545, + "num_tokens": 230084144.0, + "step": 191250 + }, + { + "entropy": 1.9102825194597244, + "epoch": 0.5928892657004756, + "grad_norm": 9.0030517578125, + "learning_rate": 3.2855419838607507e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8502855435013771, + "num_tokens": 230095428.0, + "step": 191260 + }, + { + "entropy": 1.9246444031596184, + "epoch": 0.5929202648255253, + "grad_norm": 9.100739479064941, + "learning_rate": 3.2854560947483234e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8507225677371025, + "num_tokens": 230106120.0, + "step": 191270 + }, + { + "entropy": 1.8004204019904138, + "epoch": 0.592951263950575, + "grad_norm": 7.986629009246826, + "learning_rate": 3.2853702123713637e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8648659467697144, + "num_tokens": 230119224.0, + "step": 191280 + }, + { + "entropy": 1.899955762922764, + "epoch": 0.5929822630756247, + "grad_norm": 7.1050262451171875, + "learning_rate": 3.285284336728991e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8590516731142998, + "num_tokens": 230130967.0, + "step": 191290 + }, + { + "entropy": 1.8669502303004264, + "epoch": 0.5930132622006744, + "grad_norm": 7.009920597076416, + "learning_rate": 3.2851984678203264e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8672806769609451, + "num_tokens": 230143119.0, + "step": 191300 + }, + { + "entropy": 1.8921653680503367, + "epoch": 0.5930442613257241, + "grad_norm": 4.443764686584473, + "learning_rate": 3.2851126056444887e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8577714130282402, + "num_tokens": 230154483.0, + "step": 191310 + }, + { + "entropy": 1.8913510665297508, + "epoch": 0.5930752604507737, + "grad_norm": 3.9651403427124023, + "learning_rate": 3.2850267502005983e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8589453801512719, + "num_tokens": 230166104.0, + "step": 191320 + }, + { + "entropy": 1.82257222533226, + "epoch": 0.5931062595758235, + "grad_norm": 4.177800178527832, + "learning_rate": 3.284940901487776e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8736421540379524, + "num_tokens": 230179512.0, + "step": 191330 + }, + { + "entropy": 1.9297788351774217, + "epoch": 0.5931372587008732, + "grad_norm": 9.562280654907227, + "learning_rate": 3.284855059505142e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8502230435609818, + "num_tokens": 230191047.0, + "step": 191340 + }, + { + "entropy": 1.9412023738026618, + "epoch": 0.5931682578259229, + "grad_norm": 7.020253658294678, + "learning_rate": 3.284769224251817e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8489518001675606, + "num_tokens": 230202789.0, + "step": 191350 + }, + { + "entropy": 1.9434513285756112, + "epoch": 0.5931992569509725, + "grad_norm": 8.415234565734863, + "learning_rate": 3.284683395726922e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8568918332457542, + "num_tokens": 230213774.0, + "step": 191360 + }, + { + "entropy": 1.8855139076709748, + "epoch": 0.5932302560760222, + "grad_norm": 10.343971252441406, + "learning_rate": 3.284597573929578e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8502753600478172, + "num_tokens": 230225552.0, + "step": 191370 + }, + { + "entropy": 1.8967630013823509, + "epoch": 0.593261255201072, + "grad_norm": 8.149733543395996, + "learning_rate": 3.284511758858906e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8509375154972076, + "num_tokens": 230237427.0, + "step": 191380 + }, + { + "entropy": 1.7960798382759093, + "epoch": 0.5932922543261216, + "grad_norm": 7.275712966918945, + "learning_rate": 3.2844259505140276e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8680211365222931, + "num_tokens": 230250950.0, + "step": 191390 + }, + { + "entropy": 1.8419205382466317, + "epoch": 0.5933232534511713, + "grad_norm": 3.7994577884674072, + "learning_rate": 3.284340148894064e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8641535311937332, + "num_tokens": 230263043.0, + "step": 191400 + }, + { + "entropy": 1.736213345825672, + "epoch": 0.593354252576221, + "grad_norm": 7.778532028198242, + "learning_rate": 3.2842543539981364e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8742548227310181, + "num_tokens": 230276722.0, + "step": 191410 + }, + { + "entropy": 1.8282306969165802, + "epoch": 0.5933852517012708, + "grad_norm": 9.659984588623047, + "learning_rate": 3.284168565825368e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8552992671728135, + "num_tokens": 230289280.0, + "step": 191420 + }, + { + "entropy": 1.9203779965639114, + "epoch": 0.5934162508263204, + "grad_norm": 7.255051612854004, + "learning_rate": 3.2840827843748797e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8464539498090744, + "num_tokens": 230300136.0, + "step": 191430 + }, + { + "entropy": 1.7909793078899383, + "epoch": 0.5934472499513701, + "grad_norm": 3.7259464263916016, + "learning_rate": 3.2839970096457935e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8664010047912598, + "num_tokens": 230313917.0, + "step": 191440 + }, + { + "entropy": 1.9096889346837997, + "epoch": 0.5934782490764198, + "grad_norm": 9.46761417388916, + "learning_rate": 3.2839112416372327e-06, + "loss": 0.5224, + "mean_token_accuracy": 0.8417908117175102, + "num_tokens": 230324881.0, + "step": 191450 + }, + { + "entropy": 1.9355971276760102, + "epoch": 0.5935092482014696, + "grad_norm": 8.894039154052734, + "learning_rate": 3.2838254803483185e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8509115308523179, + "num_tokens": 230335923.0, + "step": 191460 + }, + { + "entropy": 1.906863410770893, + "epoch": 0.5935402473265192, + "grad_norm": 7.58919620513916, + "learning_rate": 3.283739725778174e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8512718111276627, + "num_tokens": 230347235.0, + "step": 191470 + }, + { + "entropy": 1.9059718355536461, + "epoch": 0.5935712464515689, + "grad_norm": 8.579503059387207, + "learning_rate": 3.2836539779259224e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8537722200155258, + "num_tokens": 230358488.0, + "step": 191480 + }, + { + "entropy": 1.819216100871563, + "epoch": 0.5936022455766186, + "grad_norm": 8.80203914642334, + "learning_rate": 3.2835682367906855e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8643426224589348, + "num_tokens": 230370597.0, + "step": 191490 + }, + { + "entropy": 1.8791495144367218, + "epoch": 0.5936332447016683, + "grad_norm": 9.201972007751465, + "learning_rate": 3.2834825023715877e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8502372041344642, + "num_tokens": 230382760.0, + "step": 191500 + }, + { + "entropy": 1.819232840836048, + "epoch": 0.593664243826718, + "grad_norm": 7.602062702178955, + "learning_rate": 3.2833967746677518e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8715303301811218, + "num_tokens": 230395032.0, + "step": 191510 + }, + { + "entropy": 1.8747132286429404, + "epoch": 0.5936952429517677, + "grad_norm": 4.803540229797363, + "learning_rate": 3.2833110536783016e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8512809172272682, + "num_tokens": 230406951.0, + "step": 191520 + }, + { + "entropy": 1.8536844834685327, + "epoch": 0.5937262420768173, + "grad_norm": 9.080991744995117, + "learning_rate": 3.2832253394023596e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8655486524105072, + "num_tokens": 230419249.0, + "step": 191530 + }, + { + "entropy": 1.9990214914083482, + "epoch": 0.5937572412018671, + "grad_norm": 7.607864856719971, + "learning_rate": 3.2831396318390503e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8464673578739166, + "num_tokens": 230430199.0, + "step": 191540 + }, + { + "entropy": 1.7617909386754036, + "epoch": 0.5937882403269168, + "grad_norm": 3.5683536529541016, + "learning_rate": 3.283053930987497e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8725787281990052, + "num_tokens": 230443895.0, + "step": 191550 + }, + { + "entropy": 1.9190792486071586, + "epoch": 0.5938192394519665, + "grad_norm": 7.397488594055176, + "learning_rate": 3.2829682368468247e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.854629234969616, + "num_tokens": 230455306.0, + "step": 191560 + }, + { + "entropy": 1.776638814806938, + "epoch": 0.5938502385770161, + "grad_norm": 7.879034042358398, + "learning_rate": 3.282882549416157e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.868142431974411, + "num_tokens": 230468425.0, + "step": 191570 + }, + { + "entropy": 1.872198860347271, + "epoch": 0.5938812377020659, + "grad_norm": 8.502291679382324, + "learning_rate": 3.282796868694618e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8636263519525528, + "num_tokens": 230479390.0, + "step": 191580 + }, + { + "entropy": 1.8519338369369507, + "epoch": 0.5939122368271156, + "grad_norm": 4.0390543937683105, + "learning_rate": 3.2827111946813322e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8555379435420036, + "num_tokens": 230491954.0, + "step": 191590 + }, + { + "entropy": 1.8225978150963784, + "epoch": 0.5939432359521652, + "grad_norm": 8.881861686706543, + "learning_rate": 3.282625527375426e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8637348249554634, + "num_tokens": 230504269.0, + "step": 191600 + }, + { + "entropy": 1.8848466604948044, + "epoch": 0.5939742350772149, + "grad_norm": 7.539337635040283, + "learning_rate": 3.2825398667760223e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8589835464954376, + "num_tokens": 230516153.0, + "step": 191610 + }, + { + "entropy": 1.9843410074710846, + "epoch": 0.5940052342022646, + "grad_norm": 9.129307746887207, + "learning_rate": 3.282454212882246e-06, + "loss": 0.5621, + "mean_token_accuracy": 0.8272468775510788, + "num_tokens": 230526588.0, + "step": 191620 + }, + { + "entropy": 1.8471686720848084, + "epoch": 0.5940362333273144, + "grad_norm": 7.236810684204102, + "learning_rate": 3.2823685656932235e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8503222376108169, + "num_tokens": 230538252.0, + "step": 191630 + }, + { + "entropy": 1.848535105586052, + "epoch": 0.594067232452364, + "grad_norm": 9.330757141113281, + "learning_rate": 3.28228292520808e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8530674621462822, + "num_tokens": 230550476.0, + "step": 191640 + }, + { + "entropy": 1.8827123567461967, + "epoch": 0.5940982315774137, + "grad_norm": 8.938699722290039, + "learning_rate": 3.2821972914259404e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8543351233005524, + "num_tokens": 230562174.0, + "step": 191650 + }, + { + "entropy": 1.842447192966938, + "epoch": 0.5941292307024634, + "grad_norm": 7.114195346832275, + "learning_rate": 3.2821116643459306e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8618290036916733, + "num_tokens": 230574113.0, + "step": 191660 + }, + { + "entropy": 1.8264191061258317, + "epoch": 0.5941602298275132, + "grad_norm": 3.526878595352173, + "learning_rate": 3.282026043967176e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8595398098230362, + "num_tokens": 230587543.0, + "step": 191670 + }, + { + "entropy": 1.871898628771305, + "epoch": 0.5941912289525628, + "grad_norm": 3.80003023147583, + "learning_rate": 3.2819404302888037e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8516824513673782, + "num_tokens": 230600047.0, + "step": 191680 + }, + { + "entropy": 1.87267697006464, + "epoch": 0.5942222280776125, + "grad_norm": 8.057140350341797, + "learning_rate": 3.281854823309938e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8602946862578392, + "num_tokens": 230612330.0, + "step": 191690 + }, + { + "entropy": 1.819714893400669, + "epoch": 0.5942532272026622, + "grad_norm": 5.560348987579346, + "learning_rate": 3.2817692230297066e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8662676781415939, + "num_tokens": 230624973.0, + "step": 191700 + }, + { + "entropy": 1.8511559277772904, + "epoch": 0.5942842263277119, + "grad_norm": 10.996221542358398, + "learning_rate": 3.281683629447236e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8555448532104493, + "num_tokens": 230637494.0, + "step": 191710 + }, + { + "entropy": 1.8133092552423478, + "epoch": 0.5943152254527616, + "grad_norm": 4.265011310577393, + "learning_rate": 3.2815980425616522e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8670377910137177, + "num_tokens": 230650747.0, + "step": 191720 + }, + { + "entropy": 1.7965379044413567, + "epoch": 0.5943462245778113, + "grad_norm": 4.251617908477783, + "learning_rate": 3.2815124623720825e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8509297609329224, + "num_tokens": 230663926.0, + "step": 191730 + }, + { + "entropy": 1.8378178104758263, + "epoch": 0.594377223702861, + "grad_norm": 8.80667781829834, + "learning_rate": 3.281426888877653e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8531530737876892, + "num_tokens": 230676509.0, + "step": 191740 + }, + { + "entropy": 1.9434722900390624, + "epoch": 0.5944082228279107, + "grad_norm": 8.150884628295898, + "learning_rate": 3.2813413220774917e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.849103407561779, + "num_tokens": 230687164.0, + "step": 191750 + }, + { + "entropy": 1.746304377913475, + "epoch": 0.5944392219529604, + "grad_norm": 8.53097152709961, + "learning_rate": 3.2812557619707245e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8571577414870262, + "num_tokens": 230700490.0, + "step": 191760 + }, + { + "entropy": 1.8631037756800652, + "epoch": 0.5944702210780101, + "grad_norm": 3.721904993057251, + "learning_rate": 3.281170208556481e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8637348189949989, + "num_tokens": 230712164.0, + "step": 191770 + }, + { + "entropy": 1.8626684874296189, + "epoch": 0.5945012202030597, + "grad_norm": 9.87203311920166, + "learning_rate": 3.2810846618338865e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8456865876913071, + "num_tokens": 230723576.0, + "step": 191780 + }, + { + "entropy": 1.846021082997322, + "epoch": 0.5945322193281095, + "grad_norm": 8.10145092010498, + "learning_rate": 3.28099912180207e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.84685700237751, + "num_tokens": 230735756.0, + "step": 191790 + }, + { + "entropy": 1.9466848850250245, + "epoch": 0.5945632184531592, + "grad_norm": 8.779047012329102, + "learning_rate": 3.2809135884601595e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8415557697415352, + "num_tokens": 230747089.0, + "step": 191800 + }, + { + "entropy": 1.8106557488441468, + "epoch": 0.5945942175782089, + "grad_norm": 7.083572864532471, + "learning_rate": 3.2808280618072817e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8683284193277359, + "num_tokens": 230759971.0, + "step": 191810 + }, + { + "entropy": 1.755371056497097, + "epoch": 0.5946252167032585, + "grad_norm": 4.735435485839844, + "learning_rate": 3.2807425418425672e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.869535180926323, + "num_tokens": 230773520.0, + "step": 191820 + }, + { + "entropy": 1.9257057636976243, + "epoch": 0.5946562158283083, + "grad_norm": 8.006759643554688, + "learning_rate": 3.280657028565142e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8602747738361358, + "num_tokens": 230784640.0, + "step": 191830 + }, + { + "entropy": 1.904049114882946, + "epoch": 0.594687214953358, + "grad_norm": 8.111559867858887, + "learning_rate": 3.2805715219741352e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8505662351846695, + "num_tokens": 230795955.0, + "step": 191840 + }, + { + "entropy": 1.7928750395774842, + "epoch": 0.5947182140784076, + "grad_norm": 3.7698416709899902, + "learning_rate": 3.280486022068676e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8676718294620513, + "num_tokens": 230808581.0, + "step": 191850 + }, + { + "entropy": 1.8332011282444, + "epoch": 0.5947492132034573, + "grad_norm": 9.309735298156738, + "learning_rate": 3.280400528847893e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8579959213733673, + "num_tokens": 230821353.0, + "step": 191860 + }, + { + "entropy": 1.8673254698514938, + "epoch": 0.594780212328507, + "grad_norm": 3.900421619415283, + "learning_rate": 3.280315042310916e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8639498367905617, + "num_tokens": 230832972.0, + "step": 191870 + }, + { + "entropy": 1.799913428723812, + "epoch": 0.5948112114535568, + "grad_norm": 3.741468906402588, + "learning_rate": 3.2802295624568725e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8695779353380203, + "num_tokens": 230845058.0, + "step": 191880 + }, + { + "entropy": 1.8533201590180397, + "epoch": 0.5948422105786064, + "grad_norm": 4.252591133117676, + "learning_rate": 3.2801440892848922e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8506780445575715, + "num_tokens": 230856930.0, + "step": 191890 + }, + { + "entropy": 1.82045723721385, + "epoch": 0.5948732097036561, + "grad_norm": 9.829673767089844, + "learning_rate": 3.2800586227941063e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8705439165234565, + "num_tokens": 230868606.0, + "step": 191900 + }, + { + "entropy": 1.9105019956827163, + "epoch": 0.5949042088287058, + "grad_norm": 7.704622268676758, + "learning_rate": 3.279973162983642e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8548673078417778, + "num_tokens": 230879612.0, + "step": 191910 + }, + { + "entropy": 1.7643791824579238, + "epoch": 0.5949352079537555, + "grad_norm": 8.07160472869873, + "learning_rate": 3.279887709852631e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8598277315497398, + "num_tokens": 230893181.0, + "step": 191920 + }, + { + "entropy": 1.8020456477999687, + "epoch": 0.5949662070788052, + "grad_norm": 8.120401382446289, + "learning_rate": 3.2798022634002025e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8596922606229782, + "num_tokens": 230905797.0, + "step": 191930 + }, + { + "entropy": 1.8268871188163758, + "epoch": 0.5949972062038549, + "grad_norm": 8.7583589553833, + "learning_rate": 3.2797168236254867e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8658729165792465, + "num_tokens": 230917719.0, + "step": 191940 + }, + { + "entropy": 1.8274653524160385, + "epoch": 0.5950282053289045, + "grad_norm": 8.776835441589355, + "learning_rate": 3.2796313905276135e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8572709947824478, + "num_tokens": 230930520.0, + "step": 191950 + }, + { + "entropy": 1.7833070874214172, + "epoch": 0.5950592044539543, + "grad_norm": 8.960201263427734, + "learning_rate": 3.2795459641057135e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8575366869568825, + "num_tokens": 230944152.0, + "step": 191960 + }, + { + "entropy": 1.9216564670205116, + "epoch": 0.595090203579004, + "grad_norm": 5.831974506378174, + "learning_rate": 3.2794605443589176e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8623504295945168, + "num_tokens": 230955827.0, + "step": 191970 + }, + { + "entropy": 1.9013028174638749, + "epoch": 0.5951212027040537, + "grad_norm": 8.19417667388916, + "learning_rate": 3.2793751312863564e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8450792387127877, + "num_tokens": 230968253.0, + "step": 191980 + }, + { + "entropy": 1.913487869501114, + "epoch": 0.5951522018291033, + "grad_norm": 7.69880485534668, + "learning_rate": 3.2792897248871604e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8569048374891282, + "num_tokens": 230979258.0, + "step": 191990 + }, + { + "entropy": 1.9196341052651404, + "epoch": 0.5951832009541531, + "grad_norm": 7.564324378967285, + "learning_rate": 3.279204325160461e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.861096502840519, + "num_tokens": 230990249.0, + "step": 192000 + }, + { + "entropy": 1.9277661710977554, + "epoch": 0.5952142000792028, + "grad_norm": 8.377885818481445, + "learning_rate": 3.2791189321053897e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8573820039629936, + "num_tokens": 231000863.0, + "step": 192010 + }, + { + "entropy": 1.7512979969382285, + "epoch": 0.5952451992042525, + "grad_norm": 4.907512187957764, + "learning_rate": 3.2790335457210778e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.86670580804348, + "num_tokens": 231014360.0, + "step": 192020 + }, + { + "entropy": 1.8931189358234406, + "epoch": 0.5952761983293021, + "grad_norm": 7.5218377113342285, + "learning_rate": 3.2789481660066556e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8527333214879036, + "num_tokens": 231025991.0, + "step": 192030 + }, + { + "entropy": 1.8755558401346206, + "epoch": 0.5953071974543519, + "grad_norm": 7.976413726806641, + "learning_rate": 3.2788627929612564e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8503453806042671, + "num_tokens": 231037580.0, + "step": 192040 + }, + { + "entropy": 1.9751273036003112, + "epoch": 0.5953381965794016, + "grad_norm": 8.17392349243164, + "learning_rate": 3.2787774265840118e-06, + "loss": 0.5357, + "mean_token_accuracy": 0.8392359703779221, + "num_tokens": 231048287.0, + "step": 192050 + }, + { + "entropy": 1.8563969075679778, + "epoch": 0.5953691957044512, + "grad_norm": 8.1824951171875, + "learning_rate": 3.2786920668740523e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8573218896985054, + "num_tokens": 231060684.0, + "step": 192060 + }, + { + "entropy": 1.8792769759893417, + "epoch": 0.5954001948295009, + "grad_norm": 8.142267227172852, + "learning_rate": 3.2786067138305128e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8620775952935219, + "num_tokens": 231072491.0, + "step": 192070 + }, + { + "entropy": 1.8542990058660507, + "epoch": 0.5954311939545507, + "grad_norm": 4.228769779205322, + "learning_rate": 3.278521367452523e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8524388536810875, + "num_tokens": 231084915.0, + "step": 192080 + }, + { + "entropy": 1.8490941271185874, + "epoch": 0.5954621930796004, + "grad_norm": 6.09950590133667, + "learning_rate": 3.2784360277392156e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8661493092775345, + "num_tokens": 231097243.0, + "step": 192090 + }, + { + "entropy": 1.8824499115347861, + "epoch": 0.59549319220465, + "grad_norm": 10.269917488098145, + "learning_rate": 3.2783506946897257e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8613364815711975, + "num_tokens": 231109082.0, + "step": 192100 + }, + { + "entropy": 1.8853991687297822, + "epoch": 0.5955241913296997, + "grad_norm": 8.798211097717285, + "learning_rate": 3.2782653683031833e-06, + "loss": 0.467, + "mean_token_accuracy": 0.851449416577816, + "num_tokens": 231121501.0, + "step": 192110 + }, + { + "entropy": 1.8421639442443847, + "epoch": 0.5955551904547494, + "grad_norm": 9.831924438476562, + "learning_rate": 3.278180048578723e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8697717979550361, + "num_tokens": 231133864.0, + "step": 192120 + }, + { + "entropy": 1.9034344598650932, + "epoch": 0.5955861895797991, + "grad_norm": 8.559617042541504, + "learning_rate": 3.2780947355154772e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8431790366768837, + "num_tokens": 231145502.0, + "step": 192130 + }, + { + "entropy": 1.927782055735588, + "epoch": 0.5956171887048488, + "grad_norm": 7.649790287017822, + "learning_rate": 3.2780094291125804e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8572627499699592, + "num_tokens": 231156738.0, + "step": 192140 + }, + { + "entropy": 1.7562423720955849, + "epoch": 0.5956481878298985, + "grad_norm": 3.492551326751709, + "learning_rate": 3.277924129369164e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8618337422609329, + "num_tokens": 231170841.0, + "step": 192150 + }, + { + "entropy": 1.8300852000713348, + "epoch": 0.5956791869549481, + "grad_norm": 4.240060806274414, + "learning_rate": 3.277838836284362e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8771671429276466, + "num_tokens": 231182659.0, + "step": 192160 + }, + { + "entropy": 1.8263467103242874, + "epoch": 0.5957101860799979, + "grad_norm": 4.0324788093566895, + "learning_rate": 3.2777535498573097e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8442503765225411, + "num_tokens": 231195832.0, + "step": 192170 + }, + { + "entropy": 1.8414695873856544, + "epoch": 0.5957411852050476, + "grad_norm": 8.079511642456055, + "learning_rate": 3.2776682700871396e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8633176237344742, + "num_tokens": 231208416.0, + "step": 192180 + }, + { + "entropy": 1.9129913032054902, + "epoch": 0.5957721843300973, + "grad_norm": 7.3883585929870605, + "learning_rate": 3.2775829969729867e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.859006418287754, + "num_tokens": 231219018.0, + "step": 192190 + }, + { + "entropy": 1.9273998066782951, + "epoch": 0.5958031834551469, + "grad_norm": 7.80372953414917, + "learning_rate": 3.2774977305139842e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8586554944515228, + "num_tokens": 231230542.0, + "step": 192200 + }, + { + "entropy": 1.9060659855604172, + "epoch": 0.5958341825801967, + "grad_norm": 8.776111602783203, + "learning_rate": 3.2774124707092676e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8529020965099334, + "num_tokens": 231242139.0, + "step": 192210 + }, + { + "entropy": 1.8044569581747054, + "epoch": 0.5958651817052464, + "grad_norm": 8.237578392028809, + "learning_rate": 3.2773272175579697e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8572305858135223, + "num_tokens": 231255179.0, + "step": 192220 + }, + { + "entropy": 1.8218290351331234, + "epoch": 0.595896180830296, + "grad_norm": 9.003937721252441, + "learning_rate": 3.2772419710592275e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8563547030091285, + "num_tokens": 231268401.0, + "step": 192230 + }, + { + "entropy": 1.8903609573841096, + "epoch": 0.5959271799553457, + "grad_norm": 7.921415328979492, + "learning_rate": 3.2771567312121737e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8533718347549438, + "num_tokens": 231279496.0, + "step": 192240 + }, + { + "entropy": 1.9152607202529908, + "epoch": 0.5959581790803955, + "grad_norm": 6.411677837371826, + "learning_rate": 3.277071498015945e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8491850972175599, + "num_tokens": 231290709.0, + "step": 192250 + }, + { + "entropy": 1.9035677805542945, + "epoch": 0.5959891782054452, + "grad_norm": 9.079903602600098, + "learning_rate": 3.2769862714696755e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8487890064716339, + "num_tokens": 231302222.0, + "step": 192260 + }, + { + "entropy": 1.9371833622455596, + "epoch": 0.5960201773304948, + "grad_norm": 7.364938259124756, + "learning_rate": 3.2769010515725007e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.850156307220459, + "num_tokens": 231313394.0, + "step": 192270 + }, + { + "entropy": 1.8407243877649306, + "epoch": 0.5960511764555445, + "grad_norm": 8.76639175415039, + "learning_rate": 3.276815838323557e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8598857238888741, + "num_tokens": 231325783.0, + "step": 192280 + }, + { + "entropy": 1.8932856723666192, + "epoch": 0.5960821755805943, + "grad_norm": 8.80115795135498, + "learning_rate": 3.276730631721979e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8356119051575661, + "num_tokens": 231337818.0, + "step": 192290 + }, + { + "entropy": 1.7974711254239082, + "epoch": 0.596113174705644, + "grad_norm": 4.231179714202881, + "learning_rate": 3.2766454317669018e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8690388962626457, + "num_tokens": 231350995.0, + "step": 192300 + }, + { + "entropy": 1.8709260776638985, + "epoch": 0.5961441738306936, + "grad_norm": 6.723526477813721, + "learning_rate": 3.2765602384574635e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8658482566475868, + "num_tokens": 231363054.0, + "step": 192310 + }, + { + "entropy": 1.8911598384380341, + "epoch": 0.5961751729557433, + "grad_norm": 7.90709924697876, + "learning_rate": 3.276475051792798e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8582043945789337, + "num_tokens": 231374130.0, + "step": 192320 + }, + { + "entropy": 1.8854842871427535, + "epoch": 0.5962061720807931, + "grad_norm": 8.477952003479004, + "learning_rate": 3.2763898717720433e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8625700756907463, + "num_tokens": 231385137.0, + "step": 192330 + }, + { + "entropy": 1.8439275979995728, + "epoch": 0.5962371712058427, + "grad_norm": 8.775875091552734, + "learning_rate": 3.2763046983943347e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8571325197815896, + "num_tokens": 231397818.0, + "step": 192340 + }, + { + "entropy": 1.8874992370605468, + "epoch": 0.5962681703308924, + "grad_norm": 8.183721542358398, + "learning_rate": 3.276219531658809e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8506439417600632, + "num_tokens": 231409817.0, + "step": 192350 + }, + { + "entropy": 1.886862662434578, + "epoch": 0.5962991694559421, + "grad_norm": 9.884288787841797, + "learning_rate": 3.276134371564604e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8503222018480301, + "num_tokens": 231421405.0, + "step": 192360 + }, + { + "entropy": 1.9534014105796813, + "epoch": 0.5963301685809917, + "grad_norm": 9.145366668701172, + "learning_rate": 3.2760492181108543e-06, + "loss": 0.5263, + "mean_token_accuracy": 0.8373357936739921, + "num_tokens": 231432345.0, + "step": 192370 + }, + { + "entropy": 1.8393455937504768, + "epoch": 0.5963611677060415, + "grad_norm": 7.934825420379639, + "learning_rate": 3.2759640712966994e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8589689537882805, + "num_tokens": 231444501.0, + "step": 192380 + }, + { + "entropy": 1.7986860394477844, + "epoch": 0.5963921668310912, + "grad_norm": 9.304616928100586, + "learning_rate": 3.2758789311212754e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8577401593327523, + "num_tokens": 231457103.0, + "step": 192390 + }, + { + "entropy": 1.9137774541974069, + "epoch": 0.5964231659561409, + "grad_norm": 9.727755546569824, + "learning_rate": 3.2757937975837195e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8480470091104507, + "num_tokens": 231468352.0, + "step": 192400 + }, + { + "entropy": 1.9033278197050094, + "epoch": 0.5964541650811905, + "grad_norm": 8.264392852783203, + "learning_rate": 3.275708670683169e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8584694027900696, + "num_tokens": 231479992.0, + "step": 192410 + }, + { + "entropy": 1.936842668056488, + "epoch": 0.5964851642062403, + "grad_norm": 8.501169204711914, + "learning_rate": 3.275623550418762e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8506403654813767, + "num_tokens": 231491144.0, + "step": 192420 + }, + { + "entropy": 1.8134740129113198, + "epoch": 0.59651616333129, + "grad_norm": 9.514694213867188, + "learning_rate": 3.2755384367896375e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8578062415122986, + "num_tokens": 231503499.0, + "step": 192430 + }, + { + "entropy": 1.8479931145906447, + "epoch": 0.5965471624563397, + "grad_norm": 8.95875072479248, + "learning_rate": 3.275453329794932e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8631100043654442, + "num_tokens": 231515536.0, + "step": 192440 + }, + { + "entropy": 1.8025830656290054, + "epoch": 0.5965781615813893, + "grad_norm": 3.1973462104797363, + "learning_rate": 3.2753682294337836e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8669339135289192, + "num_tokens": 231528253.0, + "step": 192450 + }, + { + "entropy": 1.9666550129652023, + "epoch": 0.5966091607064391, + "grad_norm": 7.123218059539795, + "learning_rate": 3.2752831357053304e-06, + "loss": 0.5224, + "mean_token_accuracy": 0.8414567679166793, + "num_tokens": 231539229.0, + "step": 192460 + }, + { + "entropy": 1.8350554794073104, + "epoch": 0.5966401598314888, + "grad_norm": 9.063071250915527, + "learning_rate": 3.2751980486087127e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8557635545730591, + "num_tokens": 231551262.0, + "step": 192470 + }, + { + "entropy": 1.84403538107872, + "epoch": 0.5966711589565384, + "grad_norm": 7.305988788604736, + "learning_rate": 3.275112968143067e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8625670328736306, + "num_tokens": 231563485.0, + "step": 192480 + }, + { + "entropy": 1.929225890338421, + "epoch": 0.5967021580815881, + "grad_norm": 8.14575481414795, + "learning_rate": 3.2750278943075337e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8594361409544945, + "num_tokens": 231574806.0, + "step": 192490 + }, + { + "entropy": 1.9129980459809304, + "epoch": 0.5967331572066379, + "grad_norm": 7.192739009857178, + "learning_rate": 3.2749428271012503e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8492211416363716, + "num_tokens": 231586108.0, + "step": 192500 + }, + { + "entropy": 1.7690053448081016, + "epoch": 0.5967641563316876, + "grad_norm": 3.324474573135376, + "learning_rate": 3.274857766523356e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8734697043895722, + "num_tokens": 231599945.0, + "step": 192510 + }, + { + "entropy": 1.8933797150850296, + "epoch": 0.5967951554567372, + "grad_norm": 7.506561756134033, + "learning_rate": 3.2747727125729916e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8573498860001564, + "num_tokens": 231611932.0, + "step": 192520 + }, + { + "entropy": 1.9005317762494087, + "epoch": 0.5968261545817869, + "grad_norm": 8.153799057006836, + "learning_rate": 3.2746876652492956e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8526677310466766, + "num_tokens": 231623838.0, + "step": 192530 + }, + { + "entropy": 1.9433410674333573, + "epoch": 0.5968571537068367, + "grad_norm": 7.4116530418396, + "learning_rate": 3.2746026245514067e-06, + "loss": 0.5235, + "mean_token_accuracy": 0.8457577064633369, + "num_tokens": 231634466.0, + "step": 192540 + }, + { + "entropy": 1.8406395971775056, + "epoch": 0.5968881528318863, + "grad_norm": 8.905058860778809, + "learning_rate": 3.274517590478466e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8675936937332154, + "num_tokens": 231646117.0, + "step": 192550 + }, + { + "entropy": 1.8168965771794319, + "epoch": 0.596919151956936, + "grad_norm": 7.075071811676025, + "learning_rate": 3.2744325630296127e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8673455089330673, + "num_tokens": 231658932.0, + "step": 192560 + }, + { + "entropy": 1.854390124976635, + "epoch": 0.5969501510819857, + "grad_norm": 7.545094966888428, + "learning_rate": 3.2743475422039867e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8597999036312103, + "num_tokens": 231670845.0, + "step": 192570 + }, + { + "entropy": 1.9084011524915696, + "epoch": 0.5969811502070355, + "grad_norm": 8.939079284667969, + "learning_rate": 3.2742625280007286e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8635304853320122, + "num_tokens": 231681984.0, + "step": 192580 + }, + { + "entropy": 1.7502815291285514, + "epoch": 0.5970121493320851, + "grad_norm": 3.648104667663574, + "learning_rate": 3.2741775204189778e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8664402410387992, + "num_tokens": 231695315.0, + "step": 192590 + }, + { + "entropy": 1.915536816418171, + "epoch": 0.5970431484571348, + "grad_norm": 7.727672100067139, + "learning_rate": 3.274092519457876e-06, + "loss": 0.5206, + "mean_token_accuracy": 0.827784538269043, + "num_tokens": 231707239.0, + "step": 192600 + }, + { + "entropy": 1.869948796927929, + "epoch": 0.5970741475821845, + "grad_norm": 8.682625770568848, + "learning_rate": 3.274007525116563e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8611716479063034, + "num_tokens": 231719236.0, + "step": 192610 + }, + { + "entropy": 1.9523400247097016, + "epoch": 0.5971051467072341, + "grad_norm": 9.861188888549805, + "learning_rate": 3.2739225373941804e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8510428488254547, + "num_tokens": 231729734.0, + "step": 192620 + }, + { + "entropy": 1.8546181842684746, + "epoch": 0.5971361458322839, + "grad_norm": 8.405597686767578, + "learning_rate": 3.2738375562898684e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8611133188009262, + "num_tokens": 231741463.0, + "step": 192630 + }, + { + "entropy": 1.8450936824083328, + "epoch": 0.5971671449573336, + "grad_norm": 7.496282577514648, + "learning_rate": 3.273752581802769e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8597393468022346, + "num_tokens": 231753316.0, + "step": 192640 + }, + { + "entropy": 1.8092393189668656, + "epoch": 0.5971981440823833, + "grad_norm": 8.165148735046387, + "learning_rate": 3.2736676139320224e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8632441446185112, + "num_tokens": 231766156.0, + "step": 192650 + }, + { + "entropy": 1.9509679853916169, + "epoch": 0.5972291432074329, + "grad_norm": 10.11458969116211, + "learning_rate": 3.2735826526767705e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8565096750855445, + "num_tokens": 231777012.0, + "step": 192660 + }, + { + "entropy": 1.912872165441513, + "epoch": 0.5972601423324827, + "grad_norm": 8.525450706481934, + "learning_rate": 3.2734976980361547e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.84669349193573, + "num_tokens": 231787575.0, + "step": 192670 + }, + { + "entropy": 1.7871014818549156, + "epoch": 0.5972911414575324, + "grad_norm": 8.310676574707031, + "learning_rate": 3.2734127500093175e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8582663521170616, + "num_tokens": 231800421.0, + "step": 192680 + }, + { + "entropy": 1.9158703058958053, + "epoch": 0.597322140582582, + "grad_norm": 8.641186714172363, + "learning_rate": 3.2733278085954e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8641590908169746, + "num_tokens": 231811362.0, + "step": 192690 + }, + { + "entropy": 1.8585405111312867, + "epoch": 0.5973531397076317, + "grad_norm": 9.843905448913574, + "learning_rate": 3.2732428737935443e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8423675552010537, + "num_tokens": 231823920.0, + "step": 192700 + }, + { + "entropy": 1.9212913721799851, + "epoch": 0.5973841388326815, + "grad_norm": 7.2393341064453125, + "learning_rate": 3.273157945602893e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8591304495930672, + "num_tokens": 231834874.0, + "step": 192710 + }, + { + "entropy": 1.9177624106407165, + "epoch": 0.5974151379577312, + "grad_norm": 7.788320064544678, + "learning_rate": 3.2730730240225896e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8575213506817818, + "num_tokens": 231845743.0, + "step": 192720 + }, + { + "entropy": 1.750645785033703, + "epoch": 0.5974461370827808, + "grad_norm": 8.398082733154297, + "learning_rate": 3.2729881090517734e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8724553629755973, + "num_tokens": 231858233.0, + "step": 192730 + }, + { + "entropy": 1.8802256792783738, + "epoch": 0.5974771362078305, + "grad_norm": 7.55579948425293, + "learning_rate": 3.27290320068959e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8634855419397354, + "num_tokens": 231870089.0, + "step": 192740 + }, + { + "entropy": 1.8600758731365203, + "epoch": 0.5975081353328803, + "grad_norm": 7.949321746826172, + "learning_rate": 3.272818298935181e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8537335246801376, + "num_tokens": 231881937.0, + "step": 192750 + }, + { + "entropy": 1.931334674358368, + "epoch": 0.5975391344579299, + "grad_norm": 6.258232593536377, + "learning_rate": 3.2727334037876896e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8499545440077781, + "num_tokens": 231892789.0, + "step": 192760 + }, + { + "entropy": 1.9782539188861847, + "epoch": 0.5975701335829796, + "grad_norm": 8.786333084106445, + "learning_rate": 3.2726485152462585e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.84947439879179, + "num_tokens": 231903402.0, + "step": 192770 + }, + { + "entropy": 1.9479021221399306, + "epoch": 0.5976011327080293, + "grad_norm": 11.152770042419434, + "learning_rate": 3.2725636333100318e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8418626189231873, + "num_tokens": 231914878.0, + "step": 192780 + }, + { + "entropy": 1.870383796095848, + "epoch": 0.5976321318330791, + "grad_norm": 9.35821533203125, + "learning_rate": 3.272478757978153e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8648263871669769, + "num_tokens": 231927587.0, + "step": 192790 + }, + { + "entropy": 1.9339314162731172, + "epoch": 0.5976631309581287, + "grad_norm": 12.332832336425781, + "learning_rate": 3.272393889249766e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8536783277988433, + "num_tokens": 231939020.0, + "step": 192800 + }, + { + "entropy": 1.867577140033245, + "epoch": 0.5976941300831784, + "grad_norm": 3.920532703399658, + "learning_rate": 3.2723090271240126e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8560645163059235, + "num_tokens": 231951443.0, + "step": 192810 + }, + { + "entropy": 1.874250377714634, + "epoch": 0.5977251292082281, + "grad_norm": 11.359745979309082, + "learning_rate": 3.272224171600038e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8662922665476799, + "num_tokens": 231963672.0, + "step": 192820 + }, + { + "entropy": 1.8122723251581192, + "epoch": 0.5977561283332778, + "grad_norm": 4.158001899719238, + "learning_rate": 3.2721393226769865e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8700658664107322, + "num_tokens": 231976624.0, + "step": 192830 + }, + { + "entropy": 1.8762271001935005, + "epoch": 0.5977871274583275, + "grad_norm": 5.985019207000732, + "learning_rate": 3.2720544803540026e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8615304440259933, + "num_tokens": 231988561.0, + "step": 192840 + }, + { + "entropy": 1.948697453737259, + "epoch": 0.5978181265833772, + "grad_norm": 3.944962978363037, + "learning_rate": 3.27196964463023e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8495228439569473, + "num_tokens": 231999874.0, + "step": 192850 + }, + { + "entropy": 1.8740996643900871, + "epoch": 0.5978491257084269, + "grad_norm": 9.939412117004395, + "learning_rate": 3.2718848155048135e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8526767179369926, + "num_tokens": 232012109.0, + "step": 192860 + }, + { + "entropy": 1.8960766091942787, + "epoch": 0.5978801248334765, + "grad_norm": 8.182035446166992, + "learning_rate": 3.271799992976898e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8591491937637329, + "num_tokens": 232023642.0, + "step": 192870 + }, + { + "entropy": 1.9175554364919662, + "epoch": 0.5979111239585263, + "grad_norm": 8.83017635345459, + "learning_rate": 3.2717151770456274e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8541746884584427, + "num_tokens": 232035340.0, + "step": 192880 + }, + { + "entropy": 1.8813014537096024, + "epoch": 0.597942123083576, + "grad_norm": 8.69917106628418, + "learning_rate": 3.271630367710148e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8582179397344589, + "num_tokens": 232047284.0, + "step": 192890 + }, + { + "entropy": 1.9298100471496582, + "epoch": 0.5979731222086256, + "grad_norm": 7.181746006011963, + "learning_rate": 3.271545564969604e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8552373722195625, + "num_tokens": 232058105.0, + "step": 192900 + }, + { + "entropy": 1.835771170258522, + "epoch": 0.5980041213336753, + "grad_norm": 8.301854133605957, + "learning_rate": 3.2714607688231415e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8567116022109985, + "num_tokens": 232070445.0, + "step": 192910 + }, + { + "entropy": 1.803771485388279, + "epoch": 0.5980351204587251, + "grad_norm": 3.411482095718384, + "learning_rate": 3.2713759792699057e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8670433759689331, + "num_tokens": 232083201.0, + "step": 192920 + }, + { + "entropy": 1.8091656923294068, + "epoch": 0.5980661195837748, + "grad_norm": 3.935356378555298, + "learning_rate": 3.2712911963090414e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8612916901707649, + "num_tokens": 232095640.0, + "step": 192930 + }, + { + "entropy": 1.7774908185005187, + "epoch": 0.5980971187088244, + "grad_norm": 4.483081340789795, + "learning_rate": 3.271206419939695e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8757447093725205, + "num_tokens": 232108961.0, + "step": 192940 + }, + { + "entropy": 1.857834528386593, + "epoch": 0.5981281178338741, + "grad_norm": 13.082944869995117, + "learning_rate": 3.2711216501610145e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8536162137985229, + "num_tokens": 232120922.0, + "step": 192950 + }, + { + "entropy": 1.9390349090099335, + "epoch": 0.5981591169589239, + "grad_norm": 7.945557594299316, + "learning_rate": 3.271036886972142e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.854901310801506, + "num_tokens": 232132214.0, + "step": 192960 + }, + { + "entropy": 1.9223468393087386, + "epoch": 0.5981901160839735, + "grad_norm": 8.340877532958984, + "learning_rate": 3.2709521303722264e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8535726457834244, + "num_tokens": 232143217.0, + "step": 192970 + }, + { + "entropy": 1.8658886596560478, + "epoch": 0.5982211152090232, + "grad_norm": 9.085555076599121, + "learning_rate": 3.2708673803604135e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8573013827204704, + "num_tokens": 232155836.0, + "step": 192980 + }, + { + "entropy": 1.7965003371238708, + "epoch": 0.5982521143340729, + "grad_norm": 8.895267486572266, + "learning_rate": 3.2707826369358496e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.856090834736824, + "num_tokens": 232168724.0, + "step": 192990 + }, + { + "entropy": 1.8233628869056702, + "epoch": 0.5982831134591227, + "grad_norm": 3.5606913566589355, + "learning_rate": 3.2706979000976823e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8582077980041504, + "num_tokens": 232180804.0, + "step": 193000 + }, + { + "entropy": 1.8934690535068512, + "epoch": 0.5983141125841723, + "grad_norm": 9.63917064666748, + "learning_rate": 3.270613169845057e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.846235491335392, + "num_tokens": 232192284.0, + "step": 193010 + }, + { + "entropy": 1.8537084929645062, + "epoch": 0.598345111709222, + "grad_norm": 3.254624605178833, + "learning_rate": 3.2705284461771226e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.864462012052536, + "num_tokens": 232204849.0, + "step": 193020 + }, + { + "entropy": 1.849136011302471, + "epoch": 0.5983761108342717, + "grad_norm": 7.761828422546387, + "learning_rate": 3.2704437290930247e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8548069670796394, + "num_tokens": 232216877.0, + "step": 193030 + }, + { + "entropy": 1.8947343826293945, + "epoch": 0.5984071099593214, + "grad_norm": 7.728636264801025, + "learning_rate": 3.270359018591911e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8585354000329971, + "num_tokens": 232228648.0, + "step": 193040 + }, + { + "entropy": 1.8421491459012032, + "epoch": 0.5984381090843711, + "grad_norm": 1.6748687028884888, + "learning_rate": 3.270274314672929e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8583970040082931, + "num_tokens": 232241457.0, + "step": 193050 + }, + { + "entropy": 1.9403438106179238, + "epoch": 0.5984691082094208, + "grad_norm": 7.910982131958008, + "learning_rate": 3.2701896173352277e-06, + "loss": 0.523, + "mean_token_accuracy": 0.842846755683422, + "num_tokens": 232252793.0, + "step": 193060 + }, + { + "entropy": 1.8780175030231476, + "epoch": 0.5985001073344705, + "grad_norm": 8.453100204467773, + "learning_rate": 3.2701049265779523e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.866106778383255, + "num_tokens": 232265169.0, + "step": 193070 + }, + { + "entropy": 1.779895555973053, + "epoch": 0.5985311064595202, + "grad_norm": 5.00262975692749, + "learning_rate": 3.2700202424002527e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8611460939049721, + "num_tokens": 232278699.0, + "step": 193080 + }, + { + "entropy": 1.8304154217243194, + "epoch": 0.5985621055845699, + "grad_norm": 7.103408336639404, + "learning_rate": 3.2699355648012763e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8728136703372001, + "num_tokens": 232291597.0, + "step": 193090 + }, + { + "entropy": 1.923523449897766, + "epoch": 0.5985931047096196, + "grad_norm": 8.71379280090332, + "learning_rate": 3.2698508937801714e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8502437174320221, + "num_tokens": 232303268.0, + "step": 193100 + }, + { + "entropy": 1.9225520223379136, + "epoch": 0.5986241038346692, + "grad_norm": 8.151899337768555, + "learning_rate": 3.2697662293360872e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8552662536501885, + "num_tokens": 232314225.0, + "step": 193110 + }, + { + "entropy": 1.871592454612255, + "epoch": 0.5986551029597189, + "grad_norm": 7.62418270111084, + "learning_rate": 3.2696815714681707e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8585105463862419, + "num_tokens": 232326953.0, + "step": 193120 + }, + { + "entropy": 1.8937001720070838, + "epoch": 0.5986861020847687, + "grad_norm": 8.169354438781738, + "learning_rate": 3.2695969201755716e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8637664049863816, + "num_tokens": 232338445.0, + "step": 193130 + }, + { + "entropy": 1.9185884833335876, + "epoch": 0.5987171012098184, + "grad_norm": 9.113455772399902, + "learning_rate": 3.2695122754574392e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8630929663777351, + "num_tokens": 232349797.0, + "step": 193140 + }, + { + "entropy": 1.676446057856083, + "epoch": 0.598748100334868, + "grad_norm": 6.821518898010254, + "learning_rate": 3.269427637312922e-06, + "loss": 0.2483, + "mean_token_accuracy": 0.8841329157352448, + "num_tokens": 232364387.0, + "step": 193150 + }, + { + "entropy": 1.8946543186903, + "epoch": 0.5987790994599177, + "grad_norm": 7.599132061004639, + "learning_rate": 3.269343005741169e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.857475508749485, + "num_tokens": 232376474.0, + "step": 193160 + }, + { + "entropy": 1.8924265503883362, + "epoch": 0.5988100985849675, + "grad_norm": 8.127379417419434, + "learning_rate": 3.2692583807413296e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8496007680892944, + "num_tokens": 232388628.0, + "step": 193170 + }, + { + "entropy": 1.8490541279315948, + "epoch": 0.5988410977100171, + "grad_norm": 7.713603973388672, + "learning_rate": 3.269173762312554e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8569965809583664, + "num_tokens": 232401302.0, + "step": 193180 + }, + { + "entropy": 1.894689080119133, + "epoch": 0.5988720968350668, + "grad_norm": 6.935965538024902, + "learning_rate": 3.2690891504539906e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8526157677173615, + "num_tokens": 232413830.0, + "step": 193190 + }, + { + "entropy": 1.916493308544159, + "epoch": 0.5989030959601165, + "grad_norm": 9.567176818847656, + "learning_rate": 3.26900454516479e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8588901504874229, + "num_tokens": 232424966.0, + "step": 193200 + }, + { + "entropy": 1.8126317217946053, + "epoch": 0.5989340950851663, + "grad_norm": 3.7845699787139893, + "learning_rate": 3.2689199464441022e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8520424425601959, + "num_tokens": 232438169.0, + "step": 193210 + }, + { + "entropy": 1.8341682583093644, + "epoch": 0.5989650942102159, + "grad_norm": 8.897905349731445, + "learning_rate": 3.268835354291078e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8573813617229462, + "num_tokens": 232451220.0, + "step": 193220 + }, + { + "entropy": 1.936905488371849, + "epoch": 0.5989960933352656, + "grad_norm": 9.211556434631348, + "learning_rate": 3.2687507687048653e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8555907920002938, + "num_tokens": 232462258.0, + "step": 193230 + }, + { + "entropy": 1.9206456869840622, + "epoch": 0.5990270924603153, + "grad_norm": 8.207487106323242, + "learning_rate": 3.2686661896846167e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8534928426146507, + "num_tokens": 232473643.0, + "step": 193240 + }, + { + "entropy": 1.9361522287130355, + "epoch": 0.599058091585365, + "grad_norm": 7.724515438079834, + "learning_rate": 3.2685816172294825e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.8487791284918785, + "num_tokens": 232484346.0, + "step": 193250 + }, + { + "entropy": 1.8511110663414, + "epoch": 0.5990890907104147, + "grad_norm": 2.388965606689453, + "learning_rate": 3.2684970513386126e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8582392647862435, + "num_tokens": 232496624.0, + "step": 193260 + }, + { + "entropy": 1.8527045264840125, + "epoch": 0.5991200898354644, + "grad_norm": 7.476731777191162, + "learning_rate": 3.2684124920111587e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8627937600016594, + "num_tokens": 232508749.0, + "step": 193270 + }, + { + "entropy": 1.7666074097156526, + "epoch": 0.5991510889605141, + "grad_norm": 4.153000354766846, + "learning_rate": 3.268327939246271e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8549092411994934, + "num_tokens": 232522585.0, + "step": 193280 + }, + { + "entropy": 1.8363301545381545, + "epoch": 0.5991820880855638, + "grad_norm": 10.776314735412598, + "learning_rate": 3.2682433930431013e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8624247089028358, + "num_tokens": 232535038.0, + "step": 193290 + }, + { + "entropy": 1.8398231253027917, + "epoch": 0.5992130872106135, + "grad_norm": 7.152562141418457, + "learning_rate": 3.2681588534008004e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8504539951682091, + "num_tokens": 232548491.0, + "step": 193300 + }, + { + "entropy": 1.8838460862636566, + "epoch": 0.5992440863356632, + "grad_norm": 10.93239974975586, + "learning_rate": 3.2680743203185205e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8576446041464806, + "num_tokens": 232559957.0, + "step": 193310 + }, + { + "entropy": 1.9226768806576728, + "epoch": 0.5992750854607128, + "grad_norm": 7.900559902191162, + "learning_rate": 3.267989793795413e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8387135237455368, + "num_tokens": 232570903.0, + "step": 193320 + }, + { + "entropy": 1.8913242816925049, + "epoch": 0.5993060845857626, + "grad_norm": 7.149113178253174, + "learning_rate": 3.2679052738306294e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8567343667149544, + "num_tokens": 232582496.0, + "step": 193330 + }, + { + "entropy": 1.8473687127232552, + "epoch": 0.5993370837108123, + "grad_norm": 8.293903350830078, + "learning_rate": 3.2678207604233227e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8724023550748825, + "num_tokens": 232594664.0, + "step": 193340 + }, + { + "entropy": 1.9113451212644577, + "epoch": 0.599368082835862, + "grad_norm": 7.854452610015869, + "learning_rate": 3.267736253572643e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8739783897995949, + "num_tokens": 232605471.0, + "step": 193350 + }, + { + "entropy": 1.9115384712815284, + "epoch": 0.5993990819609116, + "grad_norm": 6.98056697845459, + "learning_rate": 3.267651753277744e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8557354226708412, + "num_tokens": 232617080.0, + "step": 193360 + }, + { + "entropy": 1.9489524781703949, + "epoch": 0.5994300810859613, + "grad_norm": 7.385791301727295, + "learning_rate": 3.2675672595377785e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.847708423435688, + "num_tokens": 232627867.0, + "step": 193370 + }, + { + "entropy": 1.8183632001280785, + "epoch": 0.5994610802110111, + "grad_norm": 4.137541770935059, + "learning_rate": 3.2674827723518975e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8466948196291924, + "num_tokens": 232640614.0, + "step": 193380 + }, + { + "entropy": 1.8530121505260468, + "epoch": 0.5994920793360607, + "grad_norm": 7.538222312927246, + "learning_rate": 3.2673982917192556e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.861600874364376, + "num_tokens": 232652014.0, + "step": 193390 + }, + { + "entropy": 1.8623303756117822, + "epoch": 0.5995230784611104, + "grad_norm": 7.082211494445801, + "learning_rate": 3.267313817639004e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8642869338393211, + "num_tokens": 232664199.0, + "step": 193400 + }, + { + "entropy": 1.7694115832448005, + "epoch": 0.5995540775861601, + "grad_norm": 3.6996982097625732, + "learning_rate": 3.2672293501102966e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8665571510791779, + "num_tokens": 232677707.0, + "step": 193410 + }, + { + "entropy": 1.8033221870660783, + "epoch": 0.5995850767112099, + "grad_norm": 6.799063682556152, + "learning_rate": 3.267144889132287e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8535528868436814, + "num_tokens": 232690686.0, + "step": 193420 + }, + { + "entropy": 1.8295499309897423, + "epoch": 0.5996160758362595, + "grad_norm": 2.946974515914917, + "learning_rate": 3.2670604347041273e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8552382230758667, + "num_tokens": 232703540.0, + "step": 193430 + }, + { + "entropy": 1.937794703245163, + "epoch": 0.5996470749613092, + "grad_norm": 8.346149444580078, + "learning_rate": 3.2669759868249717e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8573932707309723, + "num_tokens": 232714394.0, + "step": 193440 + }, + { + "entropy": 1.872834388911724, + "epoch": 0.5996780740863589, + "grad_norm": 7.327252388000488, + "learning_rate": 3.2668915454939737e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8600111588835716, + "num_tokens": 232725531.0, + "step": 193450 + }, + { + "entropy": 1.9216588035225868, + "epoch": 0.5997090732114086, + "grad_norm": 5.6127400398254395, + "learning_rate": 3.266807110710288e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8471625164151192, + "num_tokens": 232736811.0, + "step": 193460 + }, + { + "entropy": 1.8705299958586692, + "epoch": 0.5997400723364583, + "grad_norm": 7.6800971031188965, + "learning_rate": 3.2667226824730674e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8520657777786255, + "num_tokens": 232748796.0, + "step": 193470 + }, + { + "entropy": 1.9085370868444442, + "epoch": 0.599771071461508, + "grad_norm": 8.688849449157715, + "learning_rate": 3.2666382607814663e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8572714865207672, + "num_tokens": 232760237.0, + "step": 193480 + }, + { + "entropy": 1.935324090719223, + "epoch": 0.5998020705865577, + "grad_norm": 5.7981791496276855, + "learning_rate": 3.2665538456346385e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8494788527488708, + "num_tokens": 232771006.0, + "step": 193490 + }, + { + "entropy": 1.8078247852623464, + "epoch": 0.5998330697116074, + "grad_norm": 3.719273090362549, + "learning_rate": 3.266469437031739e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8646056219935417, + "num_tokens": 232784079.0, + "step": 193500 + }, + { + "entropy": 1.802126082777977, + "epoch": 0.5998640688366571, + "grad_norm": 8.91614818572998, + "learning_rate": 3.2663850349719223e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8643121868371964, + "num_tokens": 232796465.0, + "step": 193510 + }, + { + "entropy": 1.9151442632079125, + "epoch": 0.5998950679617068, + "grad_norm": 4.269763946533203, + "learning_rate": 3.2663006394543432e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8613762855529785, + "num_tokens": 232808088.0, + "step": 193520 + }, + { + "entropy": 1.8307126134634018, + "epoch": 0.5999260670867564, + "grad_norm": 7.482618808746338, + "learning_rate": 3.266216250478157e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8658735737204551, + "num_tokens": 232820793.0, + "step": 193530 + }, + { + "entropy": 1.834550380706787, + "epoch": 0.5999570662118062, + "grad_norm": 7.851871013641357, + "learning_rate": 3.2661318680425176e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8613473907113075, + "num_tokens": 232833220.0, + "step": 193540 + }, + { + "entropy": 1.8679144203662872, + "epoch": 0.5999880653368559, + "grad_norm": 8.374329566955566, + "learning_rate": 3.26604749214658e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8531968653202057, + "num_tokens": 232845400.0, + "step": 193550 + }, + { + "entropy": 1.9155082762241364, + "epoch": 0.6000190644619056, + "grad_norm": 8.865435600280762, + "learning_rate": 3.2659631227895016e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8567243069410324, + "num_tokens": 232856561.0, + "step": 193560 + }, + { + "entropy": 1.7923461616039276, + "epoch": 0.6000500635869552, + "grad_norm": 3.426719903945923, + "learning_rate": 3.265878759970436e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8709788426756859, + "num_tokens": 232869397.0, + "step": 193570 + }, + { + "entropy": 1.8624560877680778, + "epoch": 0.600081062712005, + "grad_norm": 7.726009845733643, + "learning_rate": 3.2657944036885394e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8586367711424827, + "num_tokens": 232881337.0, + "step": 193580 + }, + { + "entropy": 1.873550534248352, + "epoch": 0.6001120618370547, + "grad_norm": 8.238359451293945, + "learning_rate": 3.265710053942967e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8538247391581535, + "num_tokens": 232893398.0, + "step": 193590 + }, + { + "entropy": 1.8931268915534019, + "epoch": 0.6001430609621043, + "grad_norm": 7.079104423522949, + "learning_rate": 3.2656257107328758e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8740960642695427, + "num_tokens": 232904771.0, + "step": 193600 + }, + { + "entropy": 1.8737851038575173, + "epoch": 0.600174060087154, + "grad_norm": 7.745109558105469, + "learning_rate": 3.265541374057421e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.850721350312233, + "num_tokens": 232916506.0, + "step": 193610 + }, + { + "entropy": 1.8829128816723824, + "epoch": 0.6002050592122037, + "grad_norm": 8.40649127960205, + "learning_rate": 3.2654570439157595e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8492335632443428, + "num_tokens": 232928689.0, + "step": 193620 + }, + { + "entropy": 1.8552207678556443, + "epoch": 0.6002360583372535, + "grad_norm": 7.093789577484131, + "learning_rate": 3.265372720307048e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8560128346085548, + "num_tokens": 232941993.0, + "step": 193630 + }, + { + "entropy": 1.8739019855856895, + "epoch": 0.6002670574623031, + "grad_norm": 7.4782490730285645, + "learning_rate": 3.2652884032304424e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8601368397474289, + "num_tokens": 232953269.0, + "step": 193640 + }, + { + "entropy": 1.8960627764463425, + "epoch": 0.6002980565873528, + "grad_norm": 6.894164085388184, + "learning_rate": 3.265204092685098e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8524527877569199, + "num_tokens": 232964859.0, + "step": 193650 + }, + { + "entropy": 1.8895342394709587, + "epoch": 0.6003290557124025, + "grad_norm": 7.591185092926025, + "learning_rate": 3.2651197886701742e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8550887897610664, + "num_tokens": 232976895.0, + "step": 193660 + }, + { + "entropy": 1.8984116226434709, + "epoch": 0.6003600548374523, + "grad_norm": 7.582576274871826, + "learning_rate": 3.2650354911848266e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8485193282365799, + "num_tokens": 232989028.0, + "step": 193670 + }, + { + "entropy": 1.8505906209349632, + "epoch": 0.6003910539625019, + "grad_norm": 3.8775970935821533, + "learning_rate": 3.2649512002282124e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8600415006279946, + "num_tokens": 233001436.0, + "step": 193680 + }, + { + "entropy": 1.8285743162035941, + "epoch": 0.6004220530875516, + "grad_norm": 9.559125900268555, + "learning_rate": 3.2648669157994896e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8612663343548774, + "num_tokens": 233014192.0, + "step": 193690 + }, + { + "entropy": 1.9036530345678329, + "epoch": 0.6004530522126013, + "grad_norm": 7.049911975860596, + "learning_rate": 3.2647826378978148e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.8517943635582924, + "num_tokens": 233026158.0, + "step": 193700 + }, + { + "entropy": 1.889918527007103, + "epoch": 0.600484051337651, + "grad_norm": 3.9879229068756104, + "learning_rate": 3.2646983665223462e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8560131743550301, + "num_tokens": 233037390.0, + "step": 193710 + }, + { + "entropy": 1.8171194300055504, + "epoch": 0.6005150504627007, + "grad_norm": 4.150033473968506, + "learning_rate": 3.264614101672241e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8582745015621185, + "num_tokens": 233050147.0, + "step": 193720 + }, + { + "entropy": 1.9315490901470185, + "epoch": 0.6005460495877504, + "grad_norm": 9.391478538513184, + "learning_rate": 3.264529843346658e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.838477349281311, + "num_tokens": 233061368.0, + "step": 193730 + }, + { + "entropy": 1.8814868345856666, + "epoch": 0.6005770487128, + "grad_norm": 7.952883243560791, + "learning_rate": 3.2644455915447548e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8509133666753769, + "num_tokens": 233073284.0, + "step": 193740 + }, + { + "entropy": 1.850820629298687, + "epoch": 0.6006080478378498, + "grad_norm": 8.097213745117188, + "learning_rate": 3.264361346265689e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8518119260668755, + "num_tokens": 233085186.0, + "step": 193750 + }, + { + "entropy": 1.845947080850601, + "epoch": 0.6006390469628995, + "grad_norm": 4.145138263702393, + "learning_rate": 3.2642771075086203e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8572747737169266, + "num_tokens": 233097277.0, + "step": 193760 + }, + { + "entropy": 1.9382601469755172, + "epoch": 0.6006700460879492, + "grad_norm": 7.650824069976807, + "learning_rate": 3.2641928752727066e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8393212363123894, + "num_tokens": 233108532.0, + "step": 193770 + }, + { + "entropy": 1.9437370300292969, + "epoch": 0.6007010452129988, + "grad_norm": 9.946969032287598, + "learning_rate": 3.2641086495571056e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8489497140049934, + "num_tokens": 233119148.0, + "step": 193780 + }, + { + "entropy": 1.8955764845013618, + "epoch": 0.6007320443380486, + "grad_norm": 9.539673805236816, + "learning_rate": 3.2640244303609774e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8624850213527679, + "num_tokens": 233130697.0, + "step": 193790 + }, + { + "entropy": 1.868457528948784, + "epoch": 0.6007630434630983, + "grad_norm": 8.330771446228027, + "learning_rate": 3.2639402176834805e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8593797326087952, + "num_tokens": 233142920.0, + "step": 193800 + }, + { + "entropy": 1.834128810465336, + "epoch": 0.600794042588148, + "grad_norm": 8.034552574157715, + "learning_rate": 3.263856011523774e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8655689373612404, + "num_tokens": 233155201.0, + "step": 193810 + }, + { + "entropy": 1.8737445190548896, + "epoch": 0.6008250417131976, + "grad_norm": 8.31792163848877, + "learning_rate": 3.2637718118810175e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8651196718215942, + "num_tokens": 233166612.0, + "step": 193820 + }, + { + "entropy": 1.9548739314079284, + "epoch": 0.6008560408382473, + "grad_norm": 8.683969497680664, + "learning_rate": 3.2636876187543705e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8405161753296853, + "num_tokens": 233177679.0, + "step": 193830 + }, + { + "entropy": 1.816799834370613, + "epoch": 0.6008870399632971, + "grad_norm": 8.057676315307617, + "learning_rate": 3.2636034321429916e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8585867390036583, + "num_tokens": 233191105.0, + "step": 193840 + }, + { + "entropy": 1.7605865061283112, + "epoch": 0.6009180390883467, + "grad_norm": 10.099117279052734, + "learning_rate": 3.263519252046042e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8697127535939216, + "num_tokens": 233205120.0, + "step": 193850 + }, + { + "entropy": 1.8630171611905098, + "epoch": 0.6009490382133964, + "grad_norm": 11.458540916442871, + "learning_rate": 3.2634350784626803e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.854158864915371, + "num_tokens": 233217172.0, + "step": 193860 + }, + { + "entropy": 1.7534714862704277, + "epoch": 0.6009800373384461, + "grad_norm": 8.782004356384277, + "learning_rate": 3.263350911392067e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8709551870822907, + "num_tokens": 233230844.0, + "step": 193870 + }, + { + "entropy": 1.757567247748375, + "epoch": 0.6010110364634959, + "grad_norm": 3.572934865951538, + "learning_rate": 3.2632667508333627e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8731027454137802, + "num_tokens": 233243904.0, + "step": 193880 + }, + { + "entropy": 1.7916569873690604, + "epoch": 0.6010420355885455, + "grad_norm": 8.961586952209473, + "learning_rate": 3.2631825967857267e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8586719200015068, + "num_tokens": 233256370.0, + "step": 193890 + }, + { + "entropy": 1.8887753248214723, + "epoch": 0.6010730347135952, + "grad_norm": 8.99552059173584, + "learning_rate": 3.2630984492483208e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8502195388078689, + "num_tokens": 233268291.0, + "step": 193900 + }, + { + "entropy": 1.905160166323185, + "epoch": 0.6011040338386449, + "grad_norm": 8.513335227966309, + "learning_rate": 3.2630143082203054e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8540363058447837, + "num_tokens": 233279493.0, + "step": 193910 + }, + { + "entropy": 1.8477469071745873, + "epoch": 0.6011350329636946, + "grad_norm": 9.18237018585205, + "learning_rate": 3.26293017370084e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8550464868545532, + "num_tokens": 233291971.0, + "step": 193920 + }, + { + "entropy": 1.932934196293354, + "epoch": 0.6011660320887443, + "grad_norm": 8.203780174255371, + "learning_rate": 3.262846045689087e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.856448483467102, + "num_tokens": 233303256.0, + "step": 193930 + }, + { + "entropy": 1.8909889429807663, + "epoch": 0.601197031213794, + "grad_norm": 8.730911254882812, + "learning_rate": 3.2627619241842075e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8476616650819778, + "num_tokens": 233315468.0, + "step": 193940 + }, + { + "entropy": 1.9053794205188752, + "epoch": 0.6012280303388436, + "grad_norm": 8.247823715209961, + "learning_rate": 3.2626778091853617e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8605243265628815, + "num_tokens": 233327021.0, + "step": 193950 + }, + { + "entropy": 1.873201458156109, + "epoch": 0.6012590294638934, + "grad_norm": 4.534573078155518, + "learning_rate": 3.262593700691711e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8496430322527886, + "num_tokens": 233339811.0, + "step": 193960 + }, + { + "entropy": 1.8513876274228096, + "epoch": 0.6012900285889431, + "grad_norm": 3.71006178855896, + "learning_rate": 3.2625095987024186e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8635744020342827, + "num_tokens": 233351662.0, + "step": 193970 + }, + { + "entropy": 1.9053289726376534, + "epoch": 0.6013210277139928, + "grad_norm": 8.673333168029785, + "learning_rate": 3.2624255032166445e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.846896342933178, + "num_tokens": 233363924.0, + "step": 193980 + }, + { + "entropy": 1.875174443423748, + "epoch": 0.6013520268390424, + "grad_norm": 7.574441909790039, + "learning_rate": 3.2623414142335513e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8469843864440918, + "num_tokens": 233376185.0, + "step": 193990 + }, + { + "entropy": 1.8693370588123799, + "epoch": 0.6013830259640922, + "grad_norm": 7.561769962310791, + "learning_rate": 3.2622573317523008e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8566849261522294, + "num_tokens": 233388303.0, + "step": 194000 + }, + { + "entropy": 1.9676036804914474, + "epoch": 0.6014140250891419, + "grad_norm": 9.288419723510742, + "learning_rate": 3.262173255772056e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8490712627768516, + "num_tokens": 233399248.0, + "step": 194010 + }, + { + "entropy": 1.8525095939636231, + "epoch": 0.6014450242141915, + "grad_norm": 9.906126022338867, + "learning_rate": 3.2620891862919774e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8561065092682838, + "num_tokens": 233411943.0, + "step": 194020 + }, + { + "entropy": 1.8668144181370736, + "epoch": 0.6014760233392412, + "grad_norm": 7.300792694091797, + "learning_rate": 3.2620051233112295e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8610492289066315, + "num_tokens": 233424386.0, + "step": 194030 + }, + { + "entropy": 1.8017012298107147, + "epoch": 0.601507022464291, + "grad_norm": 8.391036033630371, + "learning_rate": 3.2619210668289734e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8658035784959793, + "num_tokens": 233437193.0, + "step": 194040 + }, + { + "entropy": 1.857760213315487, + "epoch": 0.6015380215893407, + "grad_norm": 7.2276530265808105, + "learning_rate": 3.2618370168443723e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8575306445360183, + "num_tokens": 233449580.0, + "step": 194050 + }, + { + "entropy": 1.8119343966245651, + "epoch": 0.6015690207143903, + "grad_norm": 8.690643310546875, + "learning_rate": 3.2617529733565897e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8623786956071854, + "num_tokens": 233462399.0, + "step": 194060 + }, + { + "entropy": 1.7857703417539597, + "epoch": 0.60160001983944, + "grad_norm": 7.535576343536377, + "learning_rate": 3.261668936364788e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.850919260084629, + "num_tokens": 233476237.0, + "step": 194070 + }, + { + "entropy": 1.767759621143341, + "epoch": 0.6016310189644897, + "grad_norm": 8.007060050964355, + "learning_rate": 3.2615849058681304e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8724671140313148, + "num_tokens": 233489544.0, + "step": 194080 + }, + { + "entropy": 1.929178735613823, + "epoch": 0.6016620180895395, + "grad_norm": 7.4237775802612305, + "learning_rate": 3.2615008818657813e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8556190207600594, + "num_tokens": 233500500.0, + "step": 194090 + }, + { + "entropy": 1.8788953140377997, + "epoch": 0.6016930172145891, + "grad_norm": 9.347229957580566, + "learning_rate": 3.261416864356902e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8497606843709946, + "num_tokens": 233512190.0, + "step": 194100 + }, + { + "entropy": 1.7931357741355896, + "epoch": 0.6017240163396388, + "grad_norm": 6.776369571685791, + "learning_rate": 3.2613328533406585e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8622545897960663, + "num_tokens": 233525905.0, + "step": 194110 + }, + { + "entropy": 1.8501682430505753, + "epoch": 0.6017550154646885, + "grad_norm": 7.778374195098877, + "learning_rate": 3.261248848816214e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8576924055814743, + "num_tokens": 233538057.0, + "step": 194120 + }, + { + "entropy": 1.803722159564495, + "epoch": 0.6017860145897382, + "grad_norm": 7.2863054275512695, + "learning_rate": 3.2611648507827317e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8544072508811951, + "num_tokens": 233550730.0, + "step": 194130 + }, + { + "entropy": 1.766814012825489, + "epoch": 0.6018170137147879, + "grad_norm": 8.607423782348633, + "learning_rate": 3.2610808592393763e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8650653749704361, + "num_tokens": 233564293.0, + "step": 194140 + }, + { + "entropy": 1.9077049940824509, + "epoch": 0.6018480128398376, + "grad_norm": 8.215561866760254, + "learning_rate": 3.2609968741853123e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8469783037900924, + "num_tokens": 233576154.0, + "step": 194150 + }, + { + "entropy": 1.8505355820059777, + "epoch": 0.6018790119648872, + "grad_norm": 3.6293749809265137, + "learning_rate": 3.2609128956197027e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8624635130167008, + "num_tokens": 233588765.0, + "step": 194160 + }, + { + "entropy": 1.9129736453294754, + "epoch": 0.601910011089937, + "grad_norm": 3.988649606704712, + "learning_rate": 3.260828923541714e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8539815843105316, + "num_tokens": 233600721.0, + "step": 194170 + }, + { + "entropy": 1.751829355955124, + "epoch": 0.6019410102149867, + "grad_norm": 7.624534606933594, + "learning_rate": 3.26074495795051e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8725816965103149, + "num_tokens": 233614563.0, + "step": 194180 + }, + { + "entropy": 1.8846061840653419, + "epoch": 0.6019720093400364, + "grad_norm": 9.135648727416992, + "learning_rate": 3.2606609988452565e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8573798984289169, + "num_tokens": 233626732.0, + "step": 194190 + }, + { + "entropy": 1.9097521618008613, + "epoch": 0.602003008465086, + "grad_norm": 9.23742389678955, + "learning_rate": 3.260577046225117e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8530338272452355, + "num_tokens": 233638787.0, + "step": 194200 + }, + { + "entropy": 1.900249882042408, + "epoch": 0.6020340075901358, + "grad_norm": 8.547508239746094, + "learning_rate": 3.260493100089257e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8495122745633126, + "num_tokens": 233650856.0, + "step": 194210 + }, + { + "entropy": 1.8656728267669678, + "epoch": 0.6020650067151855, + "grad_norm": 9.300488471984863, + "learning_rate": 3.2604091604368428e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8569801360368728, + "num_tokens": 233663014.0, + "step": 194220 + }, + { + "entropy": 1.9170267432928085, + "epoch": 0.6020960058402351, + "grad_norm": 8.057255744934082, + "learning_rate": 3.2603252272670386e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.851267957687378, + "num_tokens": 233674576.0, + "step": 194230 + }, + { + "entropy": 1.8948960661888123, + "epoch": 0.6021270049652848, + "grad_norm": 8.496610641479492, + "learning_rate": 3.2602413005790114e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8641295149922371, + "num_tokens": 233686986.0, + "step": 194240 + }, + { + "entropy": 1.8183800637722016, + "epoch": 0.6021580040903346, + "grad_norm": 8.398859024047852, + "learning_rate": 3.2601573803719268e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8668906956911087, + "num_tokens": 233699610.0, + "step": 194250 + }, + { + "entropy": 1.830089993774891, + "epoch": 0.6021890032153843, + "grad_norm": 4.63645601272583, + "learning_rate": 3.260073466644949e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.855503711104393, + "num_tokens": 233713020.0, + "step": 194260 + }, + { + "entropy": 1.8949781507253647, + "epoch": 0.6022200023404339, + "grad_norm": 9.201313018798828, + "learning_rate": 3.2599895593972454e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8466179206967354, + "num_tokens": 233724971.0, + "step": 194270 + }, + { + "entropy": 1.857119870185852, + "epoch": 0.6022510014654836, + "grad_norm": 7.717209815979004, + "learning_rate": 3.2599056586279833e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8609469637274743, + "num_tokens": 233737558.0, + "step": 194280 + }, + { + "entropy": 1.8890058800578118, + "epoch": 0.6022820005905334, + "grad_norm": 2.933504104614258, + "learning_rate": 3.2598217643363266e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8636812314391136, + "num_tokens": 233749352.0, + "step": 194290 + }, + { + "entropy": 1.8319082364439965, + "epoch": 0.602312999715583, + "grad_norm": 3.977686643600464, + "learning_rate": 3.2597378765214437e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8596528321504593, + "num_tokens": 233761268.0, + "step": 194300 + }, + { + "entropy": 1.9153360083699227, + "epoch": 0.6023439988406327, + "grad_norm": 8.767776489257812, + "learning_rate": 3.2596539951825013e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8494154572486877, + "num_tokens": 233772804.0, + "step": 194310 + }, + { + "entropy": 1.8809309303760529, + "epoch": 0.6023749979656824, + "grad_norm": 8.12421989440918, + "learning_rate": 3.2595701203186654e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8539055183529853, + "num_tokens": 233785351.0, + "step": 194320 + }, + { + "entropy": 1.8863722085952759, + "epoch": 0.6024059970907321, + "grad_norm": 3.445355176925659, + "learning_rate": 3.2594862519291025e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8564076155424118, + "num_tokens": 233796945.0, + "step": 194330 + }, + { + "entropy": 1.8581660106778144, + "epoch": 0.6024369962157818, + "grad_norm": 8.705437660217285, + "learning_rate": 3.2594023900129813e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.856708000600338, + "num_tokens": 233808677.0, + "step": 194340 + }, + { + "entropy": 1.8240687146782875, + "epoch": 0.6024679953408315, + "grad_norm": 8.276046752929688, + "learning_rate": 3.259318534569468e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8530595406889916, + "num_tokens": 233821656.0, + "step": 194350 + }, + { + "entropy": 1.8797230839729309, + "epoch": 0.6024989944658812, + "grad_norm": 7.852595329284668, + "learning_rate": 3.2592346855977304e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8535345628857612, + "num_tokens": 233833787.0, + "step": 194360 + }, + { + "entropy": 1.8471519738435744, + "epoch": 0.6025299935909308, + "grad_norm": 8.075080871582031, + "learning_rate": 3.259150843096936e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8740817040205002, + "num_tokens": 233845912.0, + "step": 194370 + }, + { + "entropy": 1.8665089890360833, + "epoch": 0.6025609927159806, + "grad_norm": 4.186502456665039, + "learning_rate": 3.2590670070662517e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8563824415206909, + "num_tokens": 233858116.0, + "step": 194380 + }, + { + "entropy": 1.867053084075451, + "epoch": 0.6025919918410303, + "grad_norm": 7.1243133544921875, + "learning_rate": 3.258983177504847e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8526987448334694, + "num_tokens": 233870411.0, + "step": 194390 + }, + { + "entropy": 1.8570081070065498, + "epoch": 0.60262299096608, + "grad_norm": 5.597967624664307, + "learning_rate": 3.258899354411889e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8594145223498344, + "num_tokens": 233883552.0, + "step": 194400 + }, + { + "entropy": 1.8842222318053246, + "epoch": 0.6026539900911296, + "grad_norm": 9.581622123718262, + "learning_rate": 3.258815537786546e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8546239584684372, + "num_tokens": 233895529.0, + "step": 194410 + }, + { + "entropy": 1.8748262420296669, + "epoch": 0.6026849892161794, + "grad_norm": 4.644106864929199, + "learning_rate": 3.2587317276279862e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8466273933649063, + "num_tokens": 233907783.0, + "step": 194420 + }, + { + "entropy": 1.8434445157647132, + "epoch": 0.6027159883412291, + "grad_norm": 9.612750053405762, + "learning_rate": 3.258647923935378e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8550927296280861, + "num_tokens": 233918985.0, + "step": 194430 + }, + { + "entropy": 1.8526080772280693, + "epoch": 0.6027469874662787, + "grad_norm": 7.647796154022217, + "learning_rate": 3.258564126707891e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8636227637529373, + "num_tokens": 233931333.0, + "step": 194440 + }, + { + "entropy": 1.8961267843842506, + "epoch": 0.6027779865913284, + "grad_norm": 7.142885684967041, + "learning_rate": 3.2584803359446926e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8610294297337532, + "num_tokens": 233942827.0, + "step": 194450 + }, + { + "entropy": 1.8752504363656044, + "epoch": 0.6028089857163782, + "grad_norm": 8.758703231811523, + "learning_rate": 3.258396551644953e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8480131477117538, + "num_tokens": 233955077.0, + "step": 194460 + }, + { + "entropy": 1.9181806400418282, + "epoch": 0.6028399848414279, + "grad_norm": 8.899065017700195, + "learning_rate": 3.2583127738078397e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8472186312079429, + "num_tokens": 233966300.0, + "step": 194470 + }, + { + "entropy": 1.9128697007894515, + "epoch": 0.6028709839664775, + "grad_norm": 8.163961410522461, + "learning_rate": 3.258229002432523e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8505765497684479, + "num_tokens": 233977795.0, + "step": 194480 + }, + { + "entropy": 1.8266849026083947, + "epoch": 0.6029019830915272, + "grad_norm": 3.2791669368743896, + "learning_rate": 3.2581452375181723e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8646172285079956, + "num_tokens": 233989904.0, + "step": 194490 + }, + { + "entropy": 1.8116347730159759, + "epoch": 0.602932982216577, + "grad_norm": 7.465194225311279, + "learning_rate": 3.2580614790639577e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8606933891773224, + "num_tokens": 234002395.0, + "step": 194500 + }, + { + "entropy": 1.7665009267628193, + "epoch": 0.6029639813416267, + "grad_norm": 9.060105323791504, + "learning_rate": 3.2579777270690473e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8632282540202141, + "num_tokens": 234016437.0, + "step": 194510 + }, + { + "entropy": 1.8606160417199136, + "epoch": 0.6029949804666763, + "grad_norm": 6.732377052307129, + "learning_rate": 3.2578939815326116e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8659750744700432, + "num_tokens": 234028357.0, + "step": 194520 + }, + { + "entropy": 1.8387962013483048, + "epoch": 0.603025979591726, + "grad_norm": 7.758693695068359, + "learning_rate": 3.2578102424538213e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8687957838177681, + "num_tokens": 234040845.0, + "step": 194530 + }, + { + "entropy": 1.8796296492218971, + "epoch": 0.6030569787167758, + "grad_norm": 8.825248718261719, + "learning_rate": 3.257726509831845e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8598513454198837, + "num_tokens": 234051837.0, + "step": 194540 + }, + { + "entropy": 1.9274076849222184, + "epoch": 0.6030879778418254, + "grad_norm": 9.172769546508789, + "learning_rate": 3.2576427836658545e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8573598250746727, + "num_tokens": 234062981.0, + "step": 194550 + }, + { + "entropy": 1.8512599140405654, + "epoch": 0.6031189769668751, + "grad_norm": 7.954952716827393, + "learning_rate": 3.25755906395502e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8574766382575035, + "num_tokens": 234075485.0, + "step": 194560 + }, + { + "entropy": 1.82259241938591, + "epoch": 0.6031499760919248, + "grad_norm": 8.998270034790039, + "learning_rate": 3.257475350698512e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8661994159221649, + "num_tokens": 234088067.0, + "step": 194570 + }, + { + "entropy": 1.79864399433136, + "epoch": 0.6031809752169744, + "grad_norm": 6.640564918518066, + "learning_rate": 3.2573916438955004e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8569759652018547, + "num_tokens": 234101864.0, + "step": 194580 + }, + { + "entropy": 1.9484706163406371, + "epoch": 0.6032119743420242, + "grad_norm": 5.720554828643799, + "learning_rate": 3.257307943545156e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8616125836968422, + "num_tokens": 234112736.0, + "step": 194590 + }, + { + "entropy": 1.9250062137842179, + "epoch": 0.6032429734670739, + "grad_norm": 7.0284318923950195, + "learning_rate": 3.2572242496466517e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8584776639938354, + "num_tokens": 234123876.0, + "step": 194600 + }, + { + "entropy": 1.9174670904874802, + "epoch": 0.6032739725921236, + "grad_norm": 8.020894050598145, + "learning_rate": 3.257140562199157e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8567168533802032, + "num_tokens": 234134673.0, + "step": 194610 + }, + { + "entropy": 1.888125415146351, + "epoch": 0.6033049717171732, + "grad_norm": 8.607301712036133, + "learning_rate": 3.257056881201843e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8639842256903648, + "num_tokens": 234146458.0, + "step": 194620 + }, + { + "entropy": 1.867220088839531, + "epoch": 0.603335970842223, + "grad_norm": 9.075867652893066, + "learning_rate": 3.256973206653882e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.861547501385212, + "num_tokens": 234158546.0, + "step": 194630 + }, + { + "entropy": 1.860349379479885, + "epoch": 0.6033669699672727, + "grad_norm": 4.725429058074951, + "learning_rate": 3.2568895385544454e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8553501516580582, + "num_tokens": 234171213.0, + "step": 194640 + }, + { + "entropy": 1.8892497256398202, + "epoch": 0.6033979690923224, + "grad_norm": 6.847842216491699, + "learning_rate": 3.256805876902705e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8541723743081093, + "num_tokens": 234183250.0, + "step": 194650 + }, + { + "entropy": 1.9206036776304245, + "epoch": 0.603428968217372, + "grad_norm": 9.059221267700195, + "learning_rate": 3.2567222216978333e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8496464163064956, + "num_tokens": 234193842.0, + "step": 194660 + }, + { + "entropy": 1.845395915210247, + "epoch": 0.6034599673424218, + "grad_norm": 8.204310417175293, + "learning_rate": 3.2566385729390017e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8577623799443245, + "num_tokens": 234206952.0, + "step": 194670 + }, + { + "entropy": 1.8823886767029763, + "epoch": 0.6034909664674715, + "grad_norm": 5.860503673553467, + "learning_rate": 3.256554930625382e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8504135593771934, + "num_tokens": 234218721.0, + "step": 194680 + }, + { + "entropy": 1.8299574121832847, + "epoch": 0.6035219655925211, + "grad_norm": 5.5976057052612305, + "learning_rate": 3.2564712947561462e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8664263695478439, + "num_tokens": 234231910.0, + "step": 194690 + }, + { + "entropy": 1.825202153623104, + "epoch": 0.6035529647175708, + "grad_norm": 3.987738609313965, + "learning_rate": 3.256387665330469e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8607425019145012, + "num_tokens": 234244624.0, + "step": 194700 + }, + { + "entropy": 1.8675637185573577, + "epoch": 0.6035839638426206, + "grad_norm": 7.845833778381348, + "learning_rate": 3.256304042347521e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8430291518568993, + "num_tokens": 234257973.0, + "step": 194710 + }, + { + "entropy": 1.906636281311512, + "epoch": 0.6036149629676703, + "grad_norm": 3.074232578277588, + "learning_rate": 3.2562204258064755e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8502906784415245, + "num_tokens": 234269531.0, + "step": 194720 + }, + { + "entropy": 1.853766144812107, + "epoch": 0.6036459620927199, + "grad_norm": 7.9279093742370605, + "learning_rate": 3.256136815706506e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8601211562752724, + "num_tokens": 234282642.0, + "step": 194730 + }, + { + "entropy": 1.880632211267948, + "epoch": 0.6036769612177696, + "grad_norm": 8.704763412475586, + "learning_rate": 3.256053212046785e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8636703386902809, + "num_tokens": 234293805.0, + "step": 194740 + }, + { + "entropy": 1.9346161454916, + "epoch": 0.6037079603428194, + "grad_norm": 8.34154224395752, + "learning_rate": 3.2559696148264864e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.848820036649704, + "num_tokens": 234305089.0, + "step": 194750 + }, + { + "entropy": 1.8757025212049485, + "epoch": 0.603738959467869, + "grad_norm": 8.365077018737793, + "learning_rate": 3.2558860240447836e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8507075756788254, + "num_tokens": 234317804.0, + "step": 194760 + }, + { + "entropy": 1.9414838343858718, + "epoch": 0.6037699585929187, + "grad_norm": 8.162266731262207, + "learning_rate": 3.2558024397008493e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8519972428679466, + "num_tokens": 234328443.0, + "step": 194770 + }, + { + "entropy": 1.894056186079979, + "epoch": 0.6038009577179684, + "grad_norm": 8.409675598144531, + "learning_rate": 3.255718861793857e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.854424498975277, + "num_tokens": 234339704.0, + "step": 194780 + }, + { + "entropy": 1.8207903996109962, + "epoch": 0.6038319568430182, + "grad_norm": 6.82459831237793, + "learning_rate": 3.2556352903229823e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8668209493160248, + "num_tokens": 234352453.0, + "step": 194790 + }, + { + "entropy": 1.8704303100705146, + "epoch": 0.6038629559680678, + "grad_norm": 8.402528762817383, + "learning_rate": 3.255551725287397e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8615420445799827, + "num_tokens": 234364986.0, + "step": 194800 + }, + { + "entropy": 1.8487682089209556, + "epoch": 0.6038939550931175, + "grad_norm": 8.179166793823242, + "learning_rate": 3.255468166686277e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8573832005262375, + "num_tokens": 234376774.0, + "step": 194810 + }, + { + "entropy": 1.8813228532671928, + "epoch": 0.6039249542181672, + "grad_norm": 9.37818431854248, + "learning_rate": 3.2553846145187955e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8545000702142715, + "num_tokens": 234389254.0, + "step": 194820 + }, + { + "entropy": 1.8351835548877715, + "epoch": 0.6039559533432168, + "grad_norm": 8.236607551574707, + "learning_rate": 3.2553010687841272e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8637291580438614, + "num_tokens": 234401607.0, + "step": 194830 + }, + { + "entropy": 1.7457613579928875, + "epoch": 0.6039869524682666, + "grad_norm": 8.144421577453613, + "learning_rate": 3.2552175294814477e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8719058111310005, + "num_tokens": 234415866.0, + "step": 194840 + }, + { + "entropy": 1.9108372643589973, + "epoch": 0.6040179515933163, + "grad_norm": 8.523344039916992, + "learning_rate": 3.2551339966099303e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8551770314574242, + "num_tokens": 234427296.0, + "step": 194850 + }, + { + "entropy": 1.9472704499959945, + "epoch": 0.604048950718366, + "grad_norm": 8.217750549316406, + "learning_rate": 3.2550504701687506e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8496365174651146, + "num_tokens": 234438164.0, + "step": 194860 + }, + { + "entropy": 1.8766314849257468, + "epoch": 0.6040799498434156, + "grad_norm": 7.862065315246582, + "learning_rate": 3.254966950157084e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8675270214676857, + "num_tokens": 234449674.0, + "step": 194870 + }, + { + "entropy": 1.9081780746579171, + "epoch": 0.6041109489684654, + "grad_norm": 8.318652153015137, + "learning_rate": 3.2548834365741045e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8522797405719758, + "num_tokens": 234460914.0, + "step": 194880 + }, + { + "entropy": 1.9317078649997712, + "epoch": 0.6041419480935151, + "grad_norm": 7.822876930236816, + "learning_rate": 3.2547999294189885e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8557189866900444, + "num_tokens": 234471839.0, + "step": 194890 + }, + { + "entropy": 1.944938975572586, + "epoch": 0.6041729472185647, + "grad_norm": 8.961193084716797, + "learning_rate": 3.2547164286909106e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8495589464902877, + "num_tokens": 234483129.0, + "step": 194900 + }, + { + "entropy": 1.8596145376563071, + "epoch": 0.6042039463436144, + "grad_norm": 8.231999397277832, + "learning_rate": 3.2546329343890477e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8626977398991584, + "num_tokens": 234495421.0, + "step": 194910 + }, + { + "entropy": 1.9315678521990776, + "epoch": 0.6042349454686642, + "grad_norm": 8.767191886901855, + "learning_rate": 3.254549446512574e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8393030554056168, + "num_tokens": 234507370.0, + "step": 194920 + }, + { + "entropy": 1.863025739789009, + "epoch": 0.6042659445937139, + "grad_norm": 9.030223846435547, + "learning_rate": 3.2544659650606662e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8481202870607376, + "num_tokens": 234520201.0, + "step": 194930 + }, + { + "entropy": 1.9192076668143272, + "epoch": 0.6042969437187635, + "grad_norm": 8.617793083190918, + "learning_rate": 3.254382490032501e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8552293330430984, + "num_tokens": 234532032.0, + "step": 194940 + }, + { + "entropy": 1.869245907664299, + "epoch": 0.6043279428438132, + "grad_norm": 8.926547050476074, + "learning_rate": 3.2542990214272536e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8564789742231369, + "num_tokens": 234543524.0, + "step": 194950 + }, + { + "entropy": 1.8756817936897279, + "epoch": 0.604358941968863, + "grad_norm": 10.440224647521973, + "learning_rate": 3.254215559244101e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8592181280255318, + "num_tokens": 234555409.0, + "step": 194960 + }, + { + "entropy": 1.974171131849289, + "epoch": 0.6043899410939126, + "grad_norm": 6.878018856048584, + "learning_rate": 3.2541321034822192e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8529852941632271, + "num_tokens": 234566421.0, + "step": 194970 + }, + { + "entropy": 1.9051358461380006, + "epoch": 0.6044209402189623, + "grad_norm": 7.7151641845703125, + "learning_rate": 3.2540486541407855e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8591649144887924, + "num_tokens": 234577678.0, + "step": 194980 + }, + { + "entropy": 1.8991325095295906, + "epoch": 0.604451939344012, + "grad_norm": 7.806362628936768, + "learning_rate": 3.2539652112189764e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8524706467986107, + "num_tokens": 234589834.0, + "step": 194990 + }, + { + "entropy": 1.9010869175195695, + "epoch": 0.6044829384690618, + "grad_norm": 4.471253871917725, + "learning_rate": 3.2538817747159686e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8526588916778565, + "num_tokens": 234601789.0, + "step": 195000 + }, + { + "entropy": 1.9764428943395616, + "epoch": 0.6045139375941114, + "grad_norm": 9.513835906982422, + "learning_rate": 3.253798344630939e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8532751992344856, + "num_tokens": 234612475.0, + "step": 195010 + }, + { + "entropy": 1.859013244509697, + "epoch": 0.6045449367191611, + "grad_norm": 8.61437702178955, + "learning_rate": 3.2537149209630657e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8659439861774445, + "num_tokens": 234625091.0, + "step": 195020 + }, + { + "entropy": 1.8695720225572585, + "epoch": 0.6045759358442108, + "grad_norm": 8.737157821655273, + "learning_rate": 3.2536315037115256e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8510843962430954, + "num_tokens": 234637555.0, + "step": 195030 + }, + { + "entropy": 1.890501080453396, + "epoch": 0.6046069349692605, + "grad_norm": 7.709783554077148, + "learning_rate": 3.253548092875496e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8527573123574257, + "num_tokens": 234650075.0, + "step": 195040 + }, + { + "entropy": 1.9229430109262466, + "epoch": 0.6046379340943102, + "grad_norm": 9.317115783691406, + "learning_rate": 3.2534646884541554e-06, + "loss": 0.5282, + "mean_token_accuracy": 0.8381368711590766, + "num_tokens": 234661397.0, + "step": 195050 + }, + { + "entropy": 1.88003631234169, + "epoch": 0.6046689332193599, + "grad_norm": 10.253046989440918, + "learning_rate": 3.253381290446681e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8574978530406951, + "num_tokens": 234673626.0, + "step": 195060 + }, + { + "entropy": 1.8511680349707604, + "epoch": 0.6046999323444096, + "grad_norm": 7.66872501373291, + "learning_rate": 3.2532978988522507e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8553901076316833, + "num_tokens": 234685268.0, + "step": 195070 + }, + { + "entropy": 1.792649395763874, + "epoch": 0.6047309314694592, + "grad_norm": 4.690402984619141, + "learning_rate": 3.2532145136700434e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.87020503282547, + "num_tokens": 234699509.0, + "step": 195080 + }, + { + "entropy": 1.8900166735053063, + "epoch": 0.604761930594509, + "grad_norm": 7.005067825317383, + "learning_rate": 3.2531311348992368e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8626169845461845, + "num_tokens": 234711090.0, + "step": 195090 + }, + { + "entropy": 1.9425034090876578, + "epoch": 0.6047929297195587, + "grad_norm": 9.413784980773926, + "learning_rate": 3.2530477625390084e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8451692447066307, + "num_tokens": 234722059.0, + "step": 195100 + }, + { + "entropy": 1.7889280065894126, + "epoch": 0.6048239288446083, + "grad_norm": 3.961284637451172, + "learning_rate": 3.252964396588539e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8673868477344513, + "num_tokens": 234735463.0, + "step": 195110 + }, + { + "entropy": 1.7480084210634232, + "epoch": 0.604854927969658, + "grad_norm": 7.0458292961120605, + "learning_rate": 3.2528810370470048e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8772800341248512, + "num_tokens": 234748860.0, + "step": 195120 + }, + { + "entropy": 1.8654575437307357, + "epoch": 0.6048859270947078, + "grad_norm": 7.868505001068115, + "learning_rate": 3.252797683913587e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8634969860315322, + "num_tokens": 234760800.0, + "step": 195130 + }, + { + "entropy": 1.8794177323579788, + "epoch": 0.6049169262197575, + "grad_norm": 9.02531909942627, + "learning_rate": 3.2527143371874637e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8530821368098259, + "num_tokens": 234772987.0, + "step": 195140 + }, + { + "entropy": 1.925089493393898, + "epoch": 0.6049479253448071, + "grad_norm": 9.341005325317383, + "learning_rate": 3.2526309968678134e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8462122529745102, + "num_tokens": 234783221.0, + "step": 195150 + }, + { + "entropy": 1.8883005887269975, + "epoch": 0.6049789244698568, + "grad_norm": 8.575138092041016, + "learning_rate": 3.252547662953816e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8575668081641197, + "num_tokens": 234794379.0, + "step": 195160 + }, + { + "entropy": 1.862256444990635, + "epoch": 0.6050099235949066, + "grad_norm": 7.103546619415283, + "learning_rate": 3.2524643354446505e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8599424958229065, + "num_tokens": 234806398.0, + "step": 195170 + }, + { + "entropy": 1.8624321684241294, + "epoch": 0.6050409227199562, + "grad_norm": 3.818366050720215, + "learning_rate": 3.252381014339498e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8636543408036232, + "num_tokens": 234817814.0, + "step": 195180 + }, + { + "entropy": 1.8703722521662711, + "epoch": 0.6050719218450059, + "grad_norm": 4.305206298828125, + "learning_rate": 3.2522976996375367e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8602907463908196, + "num_tokens": 234829832.0, + "step": 195190 + }, + { + "entropy": 1.8468534156680108, + "epoch": 0.6051029209700556, + "grad_norm": 3.9072325229644775, + "learning_rate": 3.2522143913379473e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8623139545321464, + "num_tokens": 234843094.0, + "step": 195200 + }, + { + "entropy": 1.8561934053897857, + "epoch": 0.6051339200951054, + "grad_norm": 4.525662899017334, + "learning_rate": 3.252131089439909e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8550740092992782, + "num_tokens": 234856218.0, + "step": 195210 + }, + { + "entropy": 1.8898674458265305, + "epoch": 0.605164919220155, + "grad_norm": 7.813089370727539, + "learning_rate": 3.2520477939426032e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8595176964998246, + "num_tokens": 234867782.0, + "step": 195220 + }, + { + "entropy": 1.8157613292336463, + "epoch": 0.6051959183452047, + "grad_norm": 3.6355841159820557, + "learning_rate": 3.2519645048452092e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.866732519865036, + "num_tokens": 234880463.0, + "step": 195230 + }, + { + "entropy": 1.854991014301777, + "epoch": 0.6052269174702544, + "grad_norm": 3.633997678756714, + "learning_rate": 3.2518812221469077e-06, + "loss": 0.43, + "mean_token_accuracy": 0.858416149020195, + "num_tokens": 234892857.0, + "step": 195240 + }, + { + "entropy": 1.9053555011749268, + "epoch": 0.6052579165953041, + "grad_norm": 4.182331562042236, + "learning_rate": 3.2517979458468796e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.8488553673028946, + "num_tokens": 234904345.0, + "step": 195250 + }, + { + "entropy": 1.8818264544010161, + "epoch": 0.6052889157203538, + "grad_norm": 7.960140228271484, + "learning_rate": 3.251714675944306e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8615189298987389, + "num_tokens": 234916251.0, + "step": 195260 + }, + { + "entropy": 1.8361662790179252, + "epoch": 0.6053199148454035, + "grad_norm": 8.360210418701172, + "learning_rate": 3.251631412438367e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8698137417435646, + "num_tokens": 234928346.0, + "step": 195270 + }, + { + "entropy": 1.7871503584086894, + "epoch": 0.6053509139704532, + "grad_norm": 7.105410575866699, + "learning_rate": 3.2515481553282447e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8668757110834122, + "num_tokens": 234942366.0, + "step": 195280 + }, + { + "entropy": 1.7971447199583053, + "epoch": 0.6053819130955029, + "grad_norm": 7.089763164520264, + "learning_rate": 3.2514649046131196e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8638567492365837, + "num_tokens": 234955981.0, + "step": 195290 + }, + { + "entropy": 1.9256662502884865, + "epoch": 0.6054129122205526, + "grad_norm": 7.474786281585693, + "learning_rate": 3.251381660292173e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8565624952316284, + "num_tokens": 234967329.0, + "step": 195300 + }, + { + "entropy": 1.899828238785267, + "epoch": 0.6054439113456023, + "grad_norm": 8.254742622375488, + "learning_rate": 3.251298422364587e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8602090954780579, + "num_tokens": 234978476.0, + "step": 195310 + }, + { + "entropy": 1.7984695211052895, + "epoch": 0.6054749104706519, + "grad_norm": 7.558569431304932, + "learning_rate": 3.251215190829542e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8689236968755722, + "num_tokens": 234991750.0, + "step": 195320 + }, + { + "entropy": 1.885707500576973, + "epoch": 0.6055059095957016, + "grad_norm": 9.93189811706543, + "learning_rate": 3.2511319656862224e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8508443906903267, + "num_tokens": 235003414.0, + "step": 195330 + }, + { + "entropy": 1.954411643743515, + "epoch": 0.6055369087207514, + "grad_norm": 7.854653835296631, + "learning_rate": 3.251048746933807e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8575719073414803, + "num_tokens": 235014106.0, + "step": 195340 + }, + { + "entropy": 1.9777034103870392, + "epoch": 0.6055679078458011, + "grad_norm": 7.847527980804443, + "learning_rate": 3.2509655345714796e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.850102536380291, + "num_tokens": 235024582.0, + "step": 195350 + }, + { + "entropy": 1.868042555451393, + "epoch": 0.6055989069708507, + "grad_norm": 7.146167755126953, + "learning_rate": 3.2508823285984228e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8623890817165375, + "num_tokens": 235036898.0, + "step": 195360 + }, + { + "entropy": 1.856178944557905, + "epoch": 0.6056299060959004, + "grad_norm": 7.413034439086914, + "learning_rate": 3.250799129013818e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8633477687835693, + "num_tokens": 235048923.0, + "step": 195370 + }, + { + "entropy": 1.845955815911293, + "epoch": 0.6056609052209502, + "grad_norm": 3.1155219078063965, + "learning_rate": 3.2507159358168485e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8615516528487206, + "num_tokens": 235061678.0, + "step": 195380 + }, + { + "entropy": 1.8690281867980958, + "epoch": 0.6056919043459998, + "grad_norm": 9.380828857421875, + "learning_rate": 3.250632749006697e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8594823315739631, + "num_tokens": 235074606.0, + "step": 195390 + }, + { + "entropy": 1.8899589315056802, + "epoch": 0.6057229034710495, + "grad_norm": 3.6747963428497314, + "learning_rate": 3.2505495685825455e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8502388820052147, + "num_tokens": 235087108.0, + "step": 195400 + }, + { + "entropy": 1.8682299882173539, + "epoch": 0.6057539025960992, + "grad_norm": 4.402914524078369, + "learning_rate": 3.250466394543577e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.854416199028492, + "num_tokens": 235099187.0, + "step": 195410 + }, + { + "entropy": 1.8698025971651078, + "epoch": 0.605784901721149, + "grad_norm": 3.2012693881988525, + "learning_rate": 3.2503832268889757e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8672586590051651, + "num_tokens": 235111448.0, + "step": 195420 + }, + { + "entropy": 1.831081511080265, + "epoch": 0.6058159008461986, + "grad_norm": 9.413392066955566, + "learning_rate": 3.250300065617925e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8682454422116279, + "num_tokens": 235124547.0, + "step": 195430 + }, + { + "entropy": 1.7941778182983399, + "epoch": 0.6058468999712483, + "grad_norm": 3.7888948917388916, + "learning_rate": 3.250216910729607e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8639795690774917, + "num_tokens": 235138392.0, + "step": 195440 + }, + { + "entropy": 1.8263335436582566, + "epoch": 0.605877899096298, + "grad_norm": 6.821713924407959, + "learning_rate": 3.2501337622232058e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8669615253806114, + "num_tokens": 235151379.0, + "step": 195450 + }, + { + "entropy": 1.95903280377388, + "epoch": 0.6059088982213477, + "grad_norm": 7.372539520263672, + "learning_rate": 3.250050620097905e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8572368696331978, + "num_tokens": 235162289.0, + "step": 195460 + }, + { + "entropy": 1.9106680050492286, + "epoch": 0.6059398973463974, + "grad_norm": 8.736983299255371, + "learning_rate": 3.2499674843528896e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.86090597063303, + "num_tokens": 235174024.0, + "step": 195470 + }, + { + "entropy": 1.907187005877495, + "epoch": 0.6059708964714471, + "grad_norm": 10.137788772583008, + "learning_rate": 3.249884354987342e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8515108034014702, + "num_tokens": 235185376.0, + "step": 195480 + }, + { + "entropy": 1.8968829110264778, + "epoch": 0.6060018955964968, + "grad_norm": 4.01402473449707, + "learning_rate": 3.2498012320004473e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8576158374547959, + "num_tokens": 235197232.0, + "step": 195490 + }, + { + "entropy": 1.957116511464119, + "epoch": 0.6060328947215465, + "grad_norm": 8.424763679504395, + "learning_rate": 3.24971811539139e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8485281109809876, + "num_tokens": 235207687.0, + "step": 195500 + }, + { + "entropy": 1.7959214732050897, + "epoch": 0.6060638938465962, + "grad_norm": 3.726801633834839, + "learning_rate": 3.249635005159353e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8715131148695946, + "num_tokens": 235220308.0, + "step": 195510 + }, + { + "entropy": 1.8073456510901451, + "epoch": 0.6060948929716459, + "grad_norm": 9.020705223083496, + "learning_rate": 3.2495519013035233e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8587002739310264, + "num_tokens": 235234216.0, + "step": 195520 + }, + { + "entropy": 1.8868019104003906, + "epoch": 0.6061258920966955, + "grad_norm": 8.478205680847168, + "learning_rate": 3.249468803823083e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8596696853637695, + "num_tokens": 235245495.0, + "step": 195530 + }, + { + "entropy": 1.9176657855510713, + "epoch": 0.6061568912217453, + "grad_norm": 10.844125747680664, + "learning_rate": 3.2493857127172197e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8496246546506881, + "num_tokens": 235256714.0, + "step": 195540 + }, + { + "entropy": 1.95402589738369, + "epoch": 0.606187890346795, + "grad_norm": 8.79029369354248, + "learning_rate": 3.2493026279851157e-06, + "loss": 0.5417, + "mean_token_accuracy": 0.8361954748630523, + "num_tokens": 235267437.0, + "step": 195550 + }, + { + "entropy": 1.9639874294400215, + "epoch": 0.6062188894718447, + "grad_norm": 6.906721591949463, + "learning_rate": 3.249219549625959e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8525883480906487, + "num_tokens": 235278628.0, + "step": 195560 + }, + { + "entropy": 1.8649201110005378, + "epoch": 0.6062498885968943, + "grad_norm": 9.573200225830078, + "learning_rate": 3.2491364776389322e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8608361706137657, + "num_tokens": 235290518.0, + "step": 195570 + }, + { + "entropy": 1.8654879599809646, + "epoch": 0.606280887721944, + "grad_norm": 5.2166666984558105, + "learning_rate": 3.2490534120232226e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8498337626457214, + "num_tokens": 235303213.0, + "step": 195580 + }, + { + "entropy": 1.8535957843065263, + "epoch": 0.6063118868469938, + "grad_norm": 3.9607527256011963, + "learning_rate": 3.248970352778015e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8599080324172974, + "num_tokens": 235316471.0, + "step": 195590 + }, + { + "entropy": 1.862211149930954, + "epoch": 0.6063428859720434, + "grad_norm": 9.359057426452637, + "learning_rate": 3.2488872999024964e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8552052542567253, + "num_tokens": 235328218.0, + "step": 195600 + }, + { + "entropy": 1.9296976819634437, + "epoch": 0.6063738850970931, + "grad_norm": 4.127715587615967, + "learning_rate": 3.2488042533958505e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8469863876700401, + "num_tokens": 235339957.0, + "step": 195610 + }, + { + "entropy": 1.880443413555622, + "epoch": 0.6064048842221428, + "grad_norm": 7.147751808166504, + "learning_rate": 3.2487212132572648e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.851515157520771, + "num_tokens": 235352123.0, + "step": 195620 + }, + { + "entropy": 1.938239911198616, + "epoch": 0.6064358833471926, + "grad_norm": 7.499401092529297, + "learning_rate": 3.2486381794859252e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8516676813364029, + "num_tokens": 235363800.0, + "step": 195630 + }, + { + "entropy": 1.9163310438394547, + "epoch": 0.6064668824722422, + "grad_norm": 7.778500080108643, + "learning_rate": 3.2485551520810183e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8536335572600364, + "num_tokens": 235375270.0, + "step": 195640 + }, + { + "entropy": 1.928651387989521, + "epoch": 0.6064978815972919, + "grad_norm": 13.652063369750977, + "learning_rate": 3.24847213104173e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8396665036678315, + "num_tokens": 235386782.0, + "step": 195650 + }, + { + "entropy": 1.778220947086811, + "epoch": 0.6065288807223416, + "grad_norm": 4.027807712554932, + "learning_rate": 3.2483891163672477e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8619612753391266, + "num_tokens": 235401570.0, + "step": 195660 + }, + { + "entropy": 1.7784413129091263, + "epoch": 0.6065598798473913, + "grad_norm": 4.169196605682373, + "learning_rate": 3.248306108056758e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8609212309122085, + "num_tokens": 235414496.0, + "step": 195670 + }, + { + "entropy": 1.8775099590420723, + "epoch": 0.606590878972441, + "grad_norm": 7.162752628326416, + "learning_rate": 3.2482231061094467e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8568590208888054, + "num_tokens": 235426691.0, + "step": 195680 + }, + { + "entropy": 1.8699518121778964, + "epoch": 0.6066218780974907, + "grad_norm": 4.00959587097168, + "learning_rate": 3.248140110524503e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8694511279463768, + "num_tokens": 235439467.0, + "step": 195690 + }, + { + "entropy": 1.7986382976174355, + "epoch": 0.6066528772225404, + "grad_norm": 7.902369499206543, + "learning_rate": 3.248057121301112e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8713407471776009, + "num_tokens": 235451837.0, + "step": 195700 + }, + { + "entropy": 1.853308279812336, + "epoch": 0.6066838763475901, + "grad_norm": 4.1074628829956055, + "learning_rate": 3.247974138438463e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.87129997164011, + "num_tokens": 235464410.0, + "step": 195710 + }, + { + "entropy": 1.9251540154218674, + "epoch": 0.6067148754726398, + "grad_norm": 7.291789531707764, + "learning_rate": 3.2478911619357413e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8508058547973633, + "num_tokens": 235475058.0, + "step": 195720 + }, + { + "entropy": 1.7726059317588807, + "epoch": 0.6067458745976895, + "grad_norm": 9.575237274169922, + "learning_rate": 3.2478081917921357e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8688571244478226, + "num_tokens": 235487731.0, + "step": 195730 + }, + { + "entropy": 1.8045031249523162, + "epoch": 0.6067768737227391, + "grad_norm": 4.120028018951416, + "learning_rate": 3.247725228006835e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8642520338296891, + "num_tokens": 235500017.0, + "step": 195740 + }, + { + "entropy": 1.929566852748394, + "epoch": 0.6068078728477889, + "grad_norm": 5.6811203956604, + "learning_rate": 3.2476422705790256e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8492823615670204, + "num_tokens": 235511445.0, + "step": 195750 + }, + { + "entropy": 1.8349244624376297, + "epoch": 0.6068388719728386, + "grad_norm": 7.675958156585693, + "learning_rate": 3.2475593195078966e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8610428079962731, + "num_tokens": 235524298.0, + "step": 195760 + }, + { + "entropy": 1.8410429537296296, + "epoch": 0.6068698710978883, + "grad_norm": 6.709105014801025, + "learning_rate": 3.2474763747926347e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8631402567029, + "num_tokens": 235536357.0, + "step": 195770 + }, + { + "entropy": 1.949179396033287, + "epoch": 0.6069008702229379, + "grad_norm": 7.939975261688232, + "learning_rate": 3.2473934364324306e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8561415210366249, + "num_tokens": 235547135.0, + "step": 195780 + }, + { + "entropy": 1.8811473071575164, + "epoch": 0.6069318693479877, + "grad_norm": 8.053126335144043, + "learning_rate": 3.2473105044264703e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8551708459854126, + "num_tokens": 235558745.0, + "step": 195790 + }, + { + "entropy": 1.921061459183693, + "epoch": 0.6069628684730374, + "grad_norm": 9.278833389282227, + "learning_rate": 3.2472275787739445e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.8361942797899247, + "num_tokens": 235570191.0, + "step": 195800 + }, + { + "entropy": 1.7232392877340317, + "epoch": 0.606993867598087, + "grad_norm": 8.542473793029785, + "learning_rate": 3.2471446594740413e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8698214948177337, + "num_tokens": 235584902.0, + "step": 195810 + }, + { + "entropy": 1.923740841448307, + "epoch": 0.6070248667231367, + "grad_norm": 8.452467918395996, + "learning_rate": 3.2470617465259496e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8503957465291023, + "num_tokens": 235596504.0, + "step": 195820 + }, + { + "entropy": 1.8590814411640166, + "epoch": 0.6070558658481864, + "grad_norm": 4.637422561645508, + "learning_rate": 3.246978839928858e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8605611816048622, + "num_tokens": 235608967.0, + "step": 195830 + }, + { + "entropy": 1.8847665458917617, + "epoch": 0.6070868649732362, + "grad_norm": 7.79510498046875, + "learning_rate": 3.246895939681957e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8544412195682526, + "num_tokens": 235621390.0, + "step": 195840 + }, + { + "entropy": 1.8920132741332054, + "epoch": 0.6071178640982858, + "grad_norm": 10.064355850219727, + "learning_rate": 3.2468130457844353e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8499759629368782, + "num_tokens": 235633162.0, + "step": 195850 + }, + { + "entropy": 1.9165378838777543, + "epoch": 0.6071488632233355, + "grad_norm": 6.8753132820129395, + "learning_rate": 3.2467301582354816e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8535776749253273, + "num_tokens": 235644499.0, + "step": 195860 + }, + { + "entropy": 1.837621060013771, + "epoch": 0.6071798623483852, + "grad_norm": 4.135498046875, + "learning_rate": 3.2466472770342873e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8666768983006478, + "num_tokens": 235656397.0, + "step": 195870 + }, + { + "entropy": 1.880282236635685, + "epoch": 0.607210861473435, + "grad_norm": 7.622752666473389, + "learning_rate": 3.24656440218004e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8597177520394326, + "num_tokens": 235668423.0, + "step": 195880 + }, + { + "entropy": 1.8491656512022019, + "epoch": 0.6072418605984846, + "grad_norm": 8.067870140075684, + "learning_rate": 3.2464815336719317e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8632102563977242, + "num_tokens": 235680658.0, + "step": 195890 + }, + { + "entropy": 1.8378586277365685, + "epoch": 0.6072728597235343, + "grad_norm": 4.336076259613037, + "learning_rate": 3.2463986715091527e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.859869047999382, + "num_tokens": 235693223.0, + "step": 195900 + }, + { + "entropy": 1.7845323011279106, + "epoch": 0.607303858848584, + "grad_norm": 8.610404968261719, + "learning_rate": 3.2463158156908914e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8675568237900734, + "num_tokens": 235706654.0, + "step": 195910 + }, + { + "entropy": 1.9118803530931472, + "epoch": 0.6073348579736337, + "grad_norm": 9.936572074890137, + "learning_rate": 3.246232966216339e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8511000022292137, + "num_tokens": 235718122.0, + "step": 195920 + }, + { + "entropy": 1.886147889494896, + "epoch": 0.6073658570986834, + "grad_norm": 8.693392753601074, + "learning_rate": 3.2461501230846863e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8479161009192466, + "num_tokens": 235729645.0, + "step": 195930 + }, + { + "entropy": 1.8959844335913658, + "epoch": 0.6073968562237331, + "grad_norm": 8.2455472946167, + "learning_rate": 3.2460672862951235e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8538859143853188, + "num_tokens": 235741982.0, + "step": 195940 + }, + { + "entropy": 1.907785764336586, + "epoch": 0.6074278553487827, + "grad_norm": 7.019741058349609, + "learning_rate": 3.2459844558468436e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8617287069559098, + "num_tokens": 235753575.0, + "step": 195950 + }, + { + "entropy": 1.840754969418049, + "epoch": 0.6074588544738325, + "grad_norm": 8.111040115356445, + "learning_rate": 3.245901631739034e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8641861319541931, + "num_tokens": 235765664.0, + "step": 195960 + }, + { + "entropy": 1.8874695479869843, + "epoch": 0.6074898535988822, + "grad_norm": 8.226910591125488, + "learning_rate": 3.2458188139708885e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8520441308617592, + "num_tokens": 235777654.0, + "step": 195970 + }, + { + "entropy": 1.9285912573337556, + "epoch": 0.6075208527239319, + "grad_norm": 8.283777236938477, + "learning_rate": 3.2457360025415967e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8478949934244155, + "num_tokens": 235789176.0, + "step": 195980 + }, + { + "entropy": 1.9073958545923233, + "epoch": 0.6075518518489815, + "grad_norm": 8.261918067932129, + "learning_rate": 3.245653197450352e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8549600809812545, + "num_tokens": 235800644.0, + "step": 195990 + }, + { + "entropy": 1.8290598958730697, + "epoch": 0.6075828509740313, + "grad_norm": 8.439154624938965, + "learning_rate": 3.2455703986963444e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8555337622761726, + "num_tokens": 235813792.0, + "step": 196000 + }, + { + "entropy": 1.891948239505291, + "epoch": 0.607613850099081, + "grad_norm": 8.003715515136719, + "learning_rate": 3.2454876062787656e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8544703394174575, + "num_tokens": 235825305.0, + "step": 196010 + }, + { + "entropy": 1.8116373434662818, + "epoch": 0.6076448492241306, + "grad_norm": 7.792770862579346, + "learning_rate": 3.245404820196808e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8783703818917274, + "num_tokens": 235837596.0, + "step": 196020 + }, + { + "entropy": 1.8243522822856904, + "epoch": 0.6076758483491803, + "grad_norm": 8.489314079284668, + "learning_rate": 3.245322040449663e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8566062182188035, + "num_tokens": 235850827.0, + "step": 196030 + }, + { + "entropy": 1.823631004989147, + "epoch": 0.6077068474742301, + "grad_norm": 9.67441177368164, + "learning_rate": 3.245239267036524e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8649219900369645, + "num_tokens": 235863914.0, + "step": 196040 + }, + { + "entropy": 1.9446945399045945, + "epoch": 0.6077378465992798, + "grad_norm": 7.182814598083496, + "learning_rate": 3.245156499956582e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8489960566163063, + "num_tokens": 235874892.0, + "step": 196050 + }, + { + "entropy": 1.9042619869112969, + "epoch": 0.6077688457243294, + "grad_norm": 9.036920547485352, + "learning_rate": 3.24507373920903e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8501462116837502, + "num_tokens": 235886665.0, + "step": 196060 + }, + { + "entropy": 1.9438833177089692, + "epoch": 0.6077998448493791, + "grad_norm": 8.651317596435547, + "learning_rate": 3.2449909847930606e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.843551354110241, + "num_tokens": 235897580.0, + "step": 196070 + }, + { + "entropy": 1.8759539812803268, + "epoch": 0.6078308439744288, + "grad_norm": 3.959768533706665, + "learning_rate": 3.244908236707866e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8555333316326141, + "num_tokens": 235909609.0, + "step": 196080 + }, + { + "entropy": 1.8777537867426872, + "epoch": 0.6078618430994785, + "grad_norm": 6.433634281158447, + "learning_rate": 3.24482549495264e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8643855184316636, + "num_tokens": 235922055.0, + "step": 196090 + }, + { + "entropy": 1.9086138397455215, + "epoch": 0.6078928422245282, + "grad_norm": 8.411335945129395, + "learning_rate": 3.244742759526575e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8512304544448852, + "num_tokens": 235933703.0, + "step": 196100 + }, + { + "entropy": 1.9222751021385194, + "epoch": 0.6079238413495779, + "grad_norm": 10.310203552246094, + "learning_rate": 3.244660030428864e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8420830890536308, + "num_tokens": 235944993.0, + "step": 196110 + }, + { + "entropy": 1.90309539437294, + "epoch": 0.6079548404746276, + "grad_norm": 4.145473957061768, + "learning_rate": 3.2445773076587004e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8556212335824966, + "num_tokens": 235957125.0, + "step": 196120 + }, + { + "entropy": 1.8705694049596786, + "epoch": 0.6079858395996773, + "grad_norm": 7.968718528747559, + "learning_rate": 3.2444945912152776e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.861976896226406, + "num_tokens": 235968426.0, + "step": 196130 + }, + { + "entropy": 1.8437914915382863, + "epoch": 0.608016838724727, + "grad_norm": 7.557614803314209, + "learning_rate": 3.244411881097789e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8543598294258118, + "num_tokens": 235981059.0, + "step": 196140 + }, + { + "entropy": 1.8821563974022866, + "epoch": 0.6080478378497767, + "grad_norm": 8.101224899291992, + "learning_rate": 3.2443291773054294e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.848881970345974, + "num_tokens": 235992831.0, + "step": 196150 + }, + { + "entropy": 1.799989365041256, + "epoch": 0.6080788369748263, + "grad_norm": 6.886673450469971, + "learning_rate": 3.244246479837392e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8733502745628356, + "num_tokens": 236006483.0, + "step": 196160 + }, + { + "entropy": 1.9383499220013618, + "epoch": 0.6081098360998761, + "grad_norm": 7.37465763092041, + "learning_rate": 3.24416378869287e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8524863049387932, + "num_tokens": 236017916.0, + "step": 196170 + }, + { + "entropy": 1.8896848455071449, + "epoch": 0.6081408352249258, + "grad_norm": 6.178394794464111, + "learning_rate": 3.2440811038710583e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8497600927948952, + "num_tokens": 236030461.0, + "step": 196180 + }, + { + "entropy": 1.9355302542448043, + "epoch": 0.6081718343499755, + "grad_norm": 7.876641750335693, + "learning_rate": 3.2439984253711515e-06, + "loss": 0.5212, + "mean_token_accuracy": 0.8411651849746704, + "num_tokens": 236042188.0, + "step": 196190 + }, + { + "entropy": 1.877279168367386, + "epoch": 0.6082028334750251, + "grad_norm": 6.912283420562744, + "learning_rate": 3.2439157531923432e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8578936800360679, + "num_tokens": 236053737.0, + "step": 196200 + }, + { + "entropy": 1.9325706675648688, + "epoch": 0.6082338326000749, + "grad_norm": 7.355538845062256, + "learning_rate": 3.2438330873338297e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8536846920847893, + "num_tokens": 236065060.0, + "step": 196210 + }, + { + "entropy": 1.9052503943443297, + "epoch": 0.6082648317251246, + "grad_norm": 7.190591812133789, + "learning_rate": 3.2437504277948032e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8584253296256066, + "num_tokens": 236077520.0, + "step": 196220 + }, + { + "entropy": 1.7885384008288383, + "epoch": 0.6082958308501742, + "grad_norm": 7.963929176330566, + "learning_rate": 3.2436677745744605e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.865674552321434, + "num_tokens": 236090760.0, + "step": 196230 + }, + { + "entropy": 1.9493159145116805, + "epoch": 0.6083268299752239, + "grad_norm": 8.272683143615723, + "learning_rate": 3.243585127671996e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.844498673081398, + "num_tokens": 236101923.0, + "step": 196240 + }, + { + "entropy": 1.92689688205719, + "epoch": 0.6083578291002737, + "grad_norm": 7.235065937042236, + "learning_rate": 3.2435024870866042e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8672019839286804, + "num_tokens": 236113110.0, + "step": 196250 + }, + { + "entropy": 1.9139114201068879, + "epoch": 0.6083888282253234, + "grad_norm": 8.594062805175781, + "learning_rate": 3.2434198528174823e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8535932213068008, + "num_tokens": 236124685.0, + "step": 196260 + }, + { + "entropy": 1.938517615199089, + "epoch": 0.608419827350373, + "grad_norm": 8.178302764892578, + "learning_rate": 3.2433372248638235e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8637794941663742, + "num_tokens": 236136003.0, + "step": 196270 + }, + { + "entropy": 1.825549528002739, + "epoch": 0.6084508264754227, + "grad_norm": 8.20291805267334, + "learning_rate": 3.2432546032248247e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.866515477001667, + "num_tokens": 236149072.0, + "step": 196280 + }, + { + "entropy": 1.8921896636486053, + "epoch": 0.6084818256004725, + "grad_norm": 7.851335048675537, + "learning_rate": 3.2431719878996814e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8482361376285553, + "num_tokens": 236160335.0, + "step": 196290 + }, + { + "entropy": 1.8066128924489022, + "epoch": 0.6085128247255222, + "grad_norm": 8.3627347946167, + "learning_rate": 3.243089378887589e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8578595578670501, + "num_tokens": 236172757.0, + "step": 196300 + }, + { + "entropy": 1.8721036612987518, + "epoch": 0.6085438238505718, + "grad_norm": 8.146467208862305, + "learning_rate": 3.243006776187744e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8607116281986237, + "num_tokens": 236184544.0, + "step": 196310 + }, + { + "entropy": 1.8730005353689194, + "epoch": 0.6085748229756215, + "grad_norm": 3.757640838623047, + "learning_rate": 3.2429241797993433e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8655624583363533, + "num_tokens": 236196381.0, + "step": 196320 + }, + { + "entropy": 1.7163779377937316, + "epoch": 0.6086058221006712, + "grad_norm": 3.7942216396331787, + "learning_rate": 3.2428415897215815e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8697278842329978, + "num_tokens": 236210821.0, + "step": 196330 + }, + { + "entropy": 1.9790637254714967, + "epoch": 0.6086368212257209, + "grad_norm": 8.186054229736328, + "learning_rate": 3.242759005953657e-06, + "loss": 0.5292, + "mean_token_accuracy": 0.839950506389141, + "num_tokens": 236221650.0, + "step": 196340 + }, + { + "entropy": 1.9705463379621506, + "epoch": 0.6086678203507706, + "grad_norm": 6.929887771606445, + "learning_rate": 3.242676428494764e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8496137440204621, + "num_tokens": 236232491.0, + "step": 196350 + }, + { + "entropy": 1.8872851014137269, + "epoch": 0.6086988194758203, + "grad_norm": 8.718164443969727, + "learning_rate": 3.2425938573441014e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8533385038375855, + "num_tokens": 236244089.0, + "step": 196360 + }, + { + "entropy": 1.7990367278456687, + "epoch": 0.6087298186008699, + "grad_norm": 8.366378784179688, + "learning_rate": 3.2425112925008656e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8653234079480171, + "num_tokens": 236258434.0, + "step": 196370 + }, + { + "entropy": 1.8804176807403565, + "epoch": 0.6087608177259197, + "grad_norm": 3.7890381813049316, + "learning_rate": 3.2424287339642526e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8637651592493057, + "num_tokens": 236269932.0, + "step": 196380 + }, + { + "entropy": 1.921053881943226, + "epoch": 0.6087918168509694, + "grad_norm": 8.576593399047852, + "learning_rate": 3.2423461817334606e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8536047399044037, + "num_tokens": 236281900.0, + "step": 196390 + }, + { + "entropy": 1.8900701761245728, + "epoch": 0.6088228159760191, + "grad_norm": 7.210069179534912, + "learning_rate": 3.2422636358076863e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8583195567131042, + "num_tokens": 236292882.0, + "step": 196400 + }, + { + "entropy": 1.9627773761749268, + "epoch": 0.6088538151010687, + "grad_norm": 10.353533744812012, + "learning_rate": 3.242181096186128e-06, + "loss": 0.5226, + "mean_token_accuracy": 0.8441329196095466, + "num_tokens": 236303626.0, + "step": 196410 + }, + { + "entropy": 1.9172202914953231, + "epoch": 0.6088848142261185, + "grad_norm": 7.017533302307129, + "learning_rate": 3.2420985628679825e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8683144554495812, + "num_tokens": 236314947.0, + "step": 196420 + }, + { + "entropy": 1.7904640406370163, + "epoch": 0.6089158133511682, + "grad_norm": 8.5465087890625, + "learning_rate": 3.242016035852447e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8550734579563141, + "num_tokens": 236327944.0, + "step": 196430 + }, + { + "entropy": 1.8345270901918411, + "epoch": 0.6089468124762178, + "grad_norm": 10.23214054107666, + "learning_rate": 3.241933515138721e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8526005238294602, + "num_tokens": 236339687.0, + "step": 196440 + }, + { + "entropy": 1.871364989876747, + "epoch": 0.6089778116012675, + "grad_norm": 3.8333592414855957, + "learning_rate": 3.2418510007260012e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8566494002938271, + "num_tokens": 236351160.0, + "step": 196450 + }, + { + "entropy": 1.9011761844158173, + "epoch": 0.6090088107263173, + "grad_norm": 9.281325340270996, + "learning_rate": 3.241768492613486e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8505636110901833, + "num_tokens": 236363054.0, + "step": 196460 + }, + { + "entropy": 1.8195012554526329, + "epoch": 0.609039809851367, + "grad_norm": 8.800650596618652, + "learning_rate": 3.241685990800375e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8715409860014915, + "num_tokens": 236375258.0, + "step": 196470 + }, + { + "entropy": 1.9295989215373992, + "epoch": 0.6090708089764166, + "grad_norm": 4.64478063583374, + "learning_rate": 3.2416034952858657e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8563613042235374, + "num_tokens": 236386829.0, + "step": 196480 + }, + { + "entropy": 1.882970041036606, + "epoch": 0.6091018081014663, + "grad_norm": 8.149700164794922, + "learning_rate": 3.2415210060691556e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8507476598024368, + "num_tokens": 236398588.0, + "step": 196490 + }, + { + "entropy": 1.8938290104269981, + "epoch": 0.6091328072265161, + "grad_norm": 7.915431022644043, + "learning_rate": 3.241438523149444e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.866845078766346, + "num_tokens": 236409808.0, + "step": 196500 + }, + { + "entropy": 1.9040289625525475, + "epoch": 0.6091638063515658, + "grad_norm": 8.756927490234375, + "learning_rate": 3.241356046525931e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8510834023356437, + "num_tokens": 236421042.0, + "step": 196510 + }, + { + "entropy": 1.863030880689621, + "epoch": 0.6091948054766154, + "grad_norm": 8.007649421691895, + "learning_rate": 3.241273576197815e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8647183448076248, + "num_tokens": 236432441.0, + "step": 196520 + }, + { + "entropy": 1.7401383429765702, + "epoch": 0.6092258046016651, + "grad_norm": 7.5847272872924805, + "learning_rate": 3.241191112164295e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8612486839294433, + "num_tokens": 236446359.0, + "step": 196530 + }, + { + "entropy": 1.8843446135520936, + "epoch": 0.6092568037267149, + "grad_norm": 7.633392810821533, + "learning_rate": 3.2411086544245702e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8475699871778488, + "num_tokens": 236457568.0, + "step": 196540 + }, + { + "entropy": 1.9031406715512276, + "epoch": 0.6092878028517645, + "grad_norm": 7.588416576385498, + "learning_rate": 3.24102620297784e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8562910869717598, + "num_tokens": 236469337.0, + "step": 196550 + }, + { + "entropy": 1.854507052898407, + "epoch": 0.6093188019768142, + "grad_norm": 10.432723999023438, + "learning_rate": 3.2409437578233037e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8570180460810661, + "num_tokens": 236481076.0, + "step": 196560 + }, + { + "entropy": 1.8487398087978364, + "epoch": 0.6093498011018639, + "grad_norm": 8.431697845458984, + "learning_rate": 3.2408613189601625e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8562931612133979, + "num_tokens": 236493481.0, + "step": 196570 + }, + { + "entropy": 1.7827741295099258, + "epoch": 0.6093808002269135, + "grad_norm": 7.114185810089111, + "learning_rate": 3.2407788863876144e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.863072381913662, + "num_tokens": 236506265.0, + "step": 196580 + }, + { + "entropy": 1.8275396153330803, + "epoch": 0.6094117993519633, + "grad_norm": 8.118542671203613, + "learning_rate": 3.2406964601048606e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8519252866506577, + "num_tokens": 236518948.0, + "step": 196590 + }, + { + "entropy": 1.8712569296360015, + "epoch": 0.609442798477013, + "grad_norm": 7.7484660148620605, + "learning_rate": 3.2406140401111015e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8604287385940552, + "num_tokens": 236530875.0, + "step": 196600 + }, + { + "entropy": 1.8790629103779792, + "epoch": 0.6094737976020627, + "grad_norm": 7.387578010559082, + "learning_rate": 3.240531626405536e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8621467888355255, + "num_tokens": 236542685.0, + "step": 196610 + }, + { + "entropy": 1.8917624711990357, + "epoch": 0.6095047967271123, + "grad_norm": 8.596790313720703, + "learning_rate": 3.240449218987366e-06, + "loss": 0.5302, + "mean_token_accuracy": 0.8511245831847191, + "num_tokens": 236554217.0, + "step": 196620 + }, + { + "entropy": 1.8684320464730262, + "epoch": 0.6095357958521621, + "grad_norm": 4.486491680145264, + "learning_rate": 3.240366817855792e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8597828835248947, + "num_tokens": 236565677.0, + "step": 196630 + }, + { + "entropy": 1.8999014109373094, + "epoch": 0.6095667949772118, + "grad_norm": 8.079882621765137, + "learning_rate": 3.240284423010013e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.864780393242836, + "num_tokens": 236576691.0, + "step": 196640 + }, + { + "entropy": 1.845630045235157, + "epoch": 0.6095977941022614, + "grad_norm": 7.2956109046936035, + "learning_rate": 3.240202034449232e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8588384404778481, + "num_tokens": 236588267.0, + "step": 196650 + }, + { + "entropy": 1.8553462713956832, + "epoch": 0.6096287932273111, + "grad_norm": 9.026920318603516, + "learning_rate": 3.2401196521726493e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8575291231274604, + "num_tokens": 236600044.0, + "step": 196660 + }, + { + "entropy": 1.738269330561161, + "epoch": 0.6096597923523609, + "grad_norm": 4.796543121337891, + "learning_rate": 3.2400372761794647e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8745391935110092, + "num_tokens": 236613490.0, + "step": 196670 + }, + { + "entropy": 1.7987927034497262, + "epoch": 0.6096907914774106, + "grad_norm": 8.865738868713379, + "learning_rate": 3.239954906468882e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8437414780259133, + "num_tokens": 236626711.0, + "step": 196680 + }, + { + "entropy": 1.8533886075019836, + "epoch": 0.6097217906024602, + "grad_norm": 8.215073585510254, + "learning_rate": 3.239872543040101e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8536760002374649, + "num_tokens": 236638229.0, + "step": 196690 + }, + { + "entropy": 1.7952544182538985, + "epoch": 0.6097527897275099, + "grad_norm": 9.725830078125, + "learning_rate": 3.239790185892323e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8603607341647148, + "num_tokens": 236650934.0, + "step": 196700 + }, + { + "entropy": 1.7925215408205986, + "epoch": 0.6097837888525597, + "grad_norm": 3.792999744415283, + "learning_rate": 3.2397078350247505e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8590068429708481, + "num_tokens": 236663905.0, + "step": 196710 + }, + { + "entropy": 1.8281165212392807, + "epoch": 0.6098147879776094, + "grad_norm": 7.636138439178467, + "learning_rate": 3.239625490436586e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.863307175040245, + "num_tokens": 236675769.0, + "step": 196720 + }, + { + "entropy": 1.8852648198604585, + "epoch": 0.609845787102659, + "grad_norm": 6.831247329711914, + "learning_rate": 3.239543152127031e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8598569333553314, + "num_tokens": 236686930.0, + "step": 196730 + }, + { + "entropy": 1.71738261282444, + "epoch": 0.6098767862277087, + "grad_norm": 3.711994171142578, + "learning_rate": 3.2394608200952864e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8827052310109138, + "num_tokens": 236700123.0, + "step": 196740 + }, + { + "entropy": 1.794165775924921, + "epoch": 0.6099077853527585, + "grad_norm": 7.85221004486084, + "learning_rate": 3.239378494340556e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8599389553070068, + "num_tokens": 236713096.0, + "step": 196750 + }, + { + "entropy": 1.8303263157606124, + "epoch": 0.6099387844778081, + "grad_norm": 4.312600612640381, + "learning_rate": 3.239296174862041e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8508070096373558, + "num_tokens": 236725536.0, + "step": 196760 + }, + { + "entropy": 1.85920792222023, + "epoch": 0.6099697836028578, + "grad_norm": 7.289279937744141, + "learning_rate": 3.239213861658946e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8590033814311028, + "num_tokens": 236736840.0, + "step": 196770 + }, + { + "entropy": 1.8310221433639526, + "epoch": 0.6100007827279075, + "grad_norm": 7.4863715171813965, + "learning_rate": 3.2391315547304714e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8629860013723374, + "num_tokens": 236749212.0, + "step": 196780 + }, + { + "entropy": 1.8335285529494285, + "epoch": 0.6100317818529573, + "grad_norm": 8.005071640014648, + "learning_rate": 3.2390492540758217e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8613960042595863, + "num_tokens": 236761681.0, + "step": 196790 + }, + { + "entropy": 1.7937164142727853, + "epoch": 0.6100627809780069, + "grad_norm": 7.301197052001953, + "learning_rate": 3.2389669596941985e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8637640863656998, + "num_tokens": 236774319.0, + "step": 196800 + }, + { + "entropy": 1.8970348417758942, + "epoch": 0.6100937801030566, + "grad_norm": 7.350532531738281, + "learning_rate": 3.2388846715848064e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.848687818646431, + "num_tokens": 236785809.0, + "step": 196810 + }, + { + "entropy": 1.8532146289944649, + "epoch": 0.6101247792281063, + "grad_norm": 8.086607933044434, + "learning_rate": 3.2388023897468483e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8612830236554145, + "num_tokens": 236797749.0, + "step": 196820 + }, + { + "entropy": 1.8598564073443413, + "epoch": 0.6101557783531559, + "grad_norm": 3.8540594577789307, + "learning_rate": 3.2387201141795277e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8560948729515075, + "num_tokens": 236809315.0, + "step": 196830 + }, + { + "entropy": 1.693070538341999, + "epoch": 0.6101867774782057, + "grad_norm": 4.18641996383667, + "learning_rate": 3.238637844882047e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8701474830508232, + "num_tokens": 236823125.0, + "step": 196840 + }, + { + "entropy": 1.8074431777000428, + "epoch": 0.6102177766032554, + "grad_norm": 7.329400539398193, + "learning_rate": 3.238555581853611e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8615547999739647, + "num_tokens": 236836016.0, + "step": 196850 + }, + { + "entropy": 1.8871548235416413, + "epoch": 0.610248775728305, + "grad_norm": 8.988987922668457, + "learning_rate": 3.238473325093423e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8461986109614372, + "num_tokens": 236847417.0, + "step": 196860 + }, + { + "entropy": 1.8518931522965432, + "epoch": 0.6102797748533547, + "grad_norm": 7.389161586761475, + "learning_rate": 3.2383910746006876e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8587850123643875, + "num_tokens": 236859125.0, + "step": 196870 + }, + { + "entropy": 1.8223053202033044, + "epoch": 0.6103107739784045, + "grad_norm": 7.211876392364502, + "learning_rate": 3.2383088303746092e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8618249401450158, + "num_tokens": 236871551.0, + "step": 196880 + }, + { + "entropy": 1.8100453361868858, + "epoch": 0.6103417731034542, + "grad_norm": 4.357117176055908, + "learning_rate": 3.2382265924143915e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8459898293018341, + "num_tokens": 236884756.0, + "step": 196890 + }, + { + "entropy": 1.8431350201368333, + "epoch": 0.6103727722285038, + "grad_norm": 6.173425674438477, + "learning_rate": 3.238144360719238e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8548444300889969, + "num_tokens": 236897054.0, + "step": 196900 + }, + { + "entropy": 1.8529955729842187, + "epoch": 0.6104037713535535, + "grad_norm": 8.83094596862793, + "learning_rate": 3.2380621352883545e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8517158061265946, + "num_tokens": 236908851.0, + "step": 196910 + }, + { + "entropy": 1.8931651636958122, + "epoch": 0.6104347704786033, + "grad_norm": 9.133150100708008, + "learning_rate": 3.237979916120946e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8503081545233726, + "num_tokens": 236920724.0, + "step": 196920 + }, + { + "entropy": 1.7971744000911714, + "epoch": 0.610465769603653, + "grad_norm": 7.09853982925415, + "learning_rate": 3.237897703216217e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8749179974198341, + "num_tokens": 236933466.0, + "step": 196930 + }, + { + "entropy": 1.956950694322586, + "epoch": 0.6104967687287026, + "grad_norm": 7.854861259460449, + "learning_rate": 3.237815496573371e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.851188787817955, + "num_tokens": 236944729.0, + "step": 196940 + }, + { + "entropy": 1.8420781329274178, + "epoch": 0.6105277678537523, + "grad_norm": 4.328573226928711, + "learning_rate": 3.2377332961916154e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.851442477107048, + "num_tokens": 236958154.0, + "step": 196950 + }, + { + "entropy": 1.9192476660013198, + "epoch": 0.6105587669788021, + "grad_norm": 7.871819019317627, + "learning_rate": 3.237651102070154e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8586204499006271, + "num_tokens": 236968917.0, + "step": 196960 + }, + { + "entropy": 1.7532826662063599, + "epoch": 0.6105897661038517, + "grad_norm": 8.188135147094727, + "learning_rate": 3.237568914208193e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8674729630351067, + "num_tokens": 236982705.0, + "step": 196970 + }, + { + "entropy": 1.9599376797676087, + "epoch": 0.6106207652289014, + "grad_norm": 11.441996574401855, + "learning_rate": 3.2374867326049374e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8513784274458885, + "num_tokens": 236993636.0, + "step": 196980 + }, + { + "entropy": 1.9321315556764602, + "epoch": 0.6106517643539511, + "grad_norm": 10.580621719360352, + "learning_rate": 3.2374045572595936e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8470071226358413, + "num_tokens": 237004534.0, + "step": 196990 + }, + { + "entropy": 1.8774404481053353, + "epoch": 0.6106827634790009, + "grad_norm": 8.446964263916016, + "learning_rate": 3.237322388171367e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.859823040664196, + "num_tokens": 237016327.0, + "step": 197000 + }, + { + "entropy": 1.9051348567008972, + "epoch": 0.6107137626040505, + "grad_norm": 8.405506134033203, + "learning_rate": 3.2372402253394627e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8614391669631004, + "num_tokens": 237027383.0, + "step": 197010 + }, + { + "entropy": 1.8986682042479515, + "epoch": 0.6107447617291002, + "grad_norm": 7.353382110595703, + "learning_rate": 3.2371580687630882e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8544711455702781, + "num_tokens": 237039405.0, + "step": 197020 + }, + { + "entropy": 1.8387581080198288, + "epoch": 0.6107757608541499, + "grad_norm": 7.9818501472473145, + "learning_rate": 3.2370759184414486e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8615599378943444, + "num_tokens": 237052054.0, + "step": 197030 + }, + { + "entropy": 1.907764096558094, + "epoch": 0.6108067599791995, + "grad_norm": 7.708189010620117, + "learning_rate": 3.2369937743737518e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8530245617032051, + "num_tokens": 237064397.0, + "step": 197040 + }, + { + "entropy": 1.8867248311638831, + "epoch": 0.6108377591042493, + "grad_norm": 9.634085655212402, + "learning_rate": 3.2369116365592037e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8468973502516747, + "num_tokens": 237076147.0, + "step": 197050 + }, + { + "entropy": 1.859901525080204, + "epoch": 0.610868758229299, + "grad_norm": 8.066091537475586, + "learning_rate": 3.2368295049970094e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8572079196572304, + "num_tokens": 237087769.0, + "step": 197060 + }, + { + "entropy": 1.878857983648777, + "epoch": 0.6108997573543486, + "grad_norm": 7.823647499084473, + "learning_rate": 3.2367473796863778e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8438057050108909, + "num_tokens": 237099723.0, + "step": 197070 + }, + { + "entropy": 1.8205513462424279, + "epoch": 0.6109307564793983, + "grad_norm": 3.872516393661499, + "learning_rate": 3.236665260626515e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8547244861721992, + "num_tokens": 237111838.0, + "step": 197080 + }, + { + "entropy": 1.8410150706768036, + "epoch": 0.6109617556044481, + "grad_norm": 9.855649948120117, + "learning_rate": 3.2365831478166287e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8547208964824676, + "num_tokens": 237124565.0, + "step": 197090 + }, + { + "entropy": 1.7792586773633956, + "epoch": 0.6109927547294978, + "grad_norm": 9.24644947052002, + "learning_rate": 3.236501041255925e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8620190620422363, + "num_tokens": 237138066.0, + "step": 197100 + }, + { + "entropy": 1.893431168794632, + "epoch": 0.6110237538545474, + "grad_norm": 8.146915435791016, + "learning_rate": 3.2364189409436118e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8524102881550789, + "num_tokens": 237149822.0, + "step": 197110 + }, + { + "entropy": 1.898918354511261, + "epoch": 0.6110547529795971, + "grad_norm": 3.6508231163024902, + "learning_rate": 3.2363368468788973e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8517585694789886, + "num_tokens": 237160803.0, + "step": 197120 + }, + { + "entropy": 1.8995780304074288, + "epoch": 0.6110857521046469, + "grad_norm": 7.793694496154785, + "learning_rate": 3.2362547590609886e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8504430979490281, + "num_tokens": 237172435.0, + "step": 197130 + }, + { + "entropy": 1.7506232902407646, + "epoch": 0.6111167512296966, + "grad_norm": 8.007512092590332, + "learning_rate": 3.236172677489094e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8630235210061074, + "num_tokens": 237186683.0, + "step": 197140 + }, + { + "entropy": 1.7544875517487526, + "epoch": 0.6111477503547462, + "grad_norm": 8.81264877319336, + "learning_rate": 3.23609060216242e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8727211073040962, + "num_tokens": 237200218.0, + "step": 197150 + }, + { + "entropy": 1.9269504621624947, + "epoch": 0.6111787494797959, + "grad_norm": 7.985466957092285, + "learning_rate": 3.2360085330801767e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8687825843691825, + "num_tokens": 237211313.0, + "step": 197160 + }, + { + "entropy": 1.859518238902092, + "epoch": 0.6112097486048457, + "grad_norm": 5.540771961212158, + "learning_rate": 3.23592647024157e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.853747871518135, + "num_tokens": 237222938.0, + "step": 197170 + }, + { + "entropy": 1.9155102893710136, + "epoch": 0.6112407477298953, + "grad_norm": 8.322548866271973, + "learning_rate": 3.2358444136458106e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8527912616729736, + "num_tokens": 237233724.0, + "step": 197180 + }, + { + "entropy": 1.9483972951769828, + "epoch": 0.611271746854945, + "grad_norm": 8.302480697631836, + "learning_rate": 3.235762363292106e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8494256645441055, + "num_tokens": 237245105.0, + "step": 197190 + }, + { + "entropy": 1.8738413840532302, + "epoch": 0.6113027459799947, + "grad_norm": 3.559133529663086, + "learning_rate": 3.2356803191796642e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.855416576564312, + "num_tokens": 237257405.0, + "step": 197200 + }, + { + "entropy": 1.8959673658013343, + "epoch": 0.6113337451050445, + "grad_norm": 10.196348190307617, + "learning_rate": 3.2355982813076953e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8532440841197968, + "num_tokens": 237268409.0, + "step": 197210 + }, + { + "entropy": 1.8422715082764625, + "epoch": 0.6113647442300941, + "grad_norm": 5.707858562469482, + "learning_rate": 3.2355162496754073e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.84599878937006, + "num_tokens": 237281020.0, + "step": 197220 + }, + { + "entropy": 1.8892938852310182, + "epoch": 0.6113957433551438, + "grad_norm": 3.6076462268829346, + "learning_rate": 3.235434224282009e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8560446351766586, + "num_tokens": 237293369.0, + "step": 197230 + }, + { + "entropy": 1.9170384138822556, + "epoch": 0.6114267424801935, + "grad_norm": 5.8077921867370605, + "learning_rate": 3.235352205126711e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8565402716398239, + "num_tokens": 237304689.0, + "step": 197240 + }, + { + "entropy": 1.9104067966341973, + "epoch": 0.6114577416052432, + "grad_norm": 9.619139671325684, + "learning_rate": 3.2352701922087217e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8569847285747528, + "num_tokens": 237316300.0, + "step": 197250 + }, + { + "entropy": 1.7930829659104348, + "epoch": 0.6114887407302929, + "grad_norm": 3.60917592048645, + "learning_rate": 3.23518818552725e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8651748016476631, + "num_tokens": 237329389.0, + "step": 197260 + }, + { + "entropy": 1.8553033851087093, + "epoch": 0.6115197398553426, + "grad_norm": 2.5513739585876465, + "learning_rate": 3.2351061850815062e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8540530279278755, + "num_tokens": 237342015.0, + "step": 197270 + }, + { + "entropy": 1.8537073224782943, + "epoch": 0.6115507389803922, + "grad_norm": 7.609178066253662, + "learning_rate": 3.235024190870701e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8554342269897461, + "num_tokens": 237354500.0, + "step": 197280 + }, + { + "entropy": 1.8396665275096893, + "epoch": 0.6115817381054419, + "grad_norm": 9.246115684509277, + "learning_rate": 3.2349422028940426e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.852047860622406, + "num_tokens": 237366515.0, + "step": 197290 + }, + { + "entropy": 1.8538162797689437, + "epoch": 0.6116127372304917, + "grad_norm": 7.036844730377197, + "learning_rate": 3.2348602211507424e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8756770327687263, + "num_tokens": 237378222.0, + "step": 197300 + }, + { + "entropy": 1.9473214149475098, + "epoch": 0.6116437363555414, + "grad_norm": 7.349457263946533, + "learning_rate": 3.23477824564001e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8548978239297866, + "num_tokens": 237389710.0, + "step": 197310 + }, + { + "entropy": 1.9008982643485068, + "epoch": 0.611674735480591, + "grad_norm": 8.243997573852539, + "learning_rate": 3.2346962763610556e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8519371166825295, + "num_tokens": 237401797.0, + "step": 197320 + }, + { + "entropy": 1.831042182445526, + "epoch": 0.6117057346056407, + "grad_norm": 8.572514533996582, + "learning_rate": 3.2346143133130897e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8584729507565498, + "num_tokens": 237414474.0, + "step": 197330 + }, + { + "entropy": 1.8398546814918517, + "epoch": 0.6117367337306905, + "grad_norm": 10.016891479492188, + "learning_rate": 3.234532356495323e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8620932117104531, + "num_tokens": 237426310.0, + "step": 197340 + }, + { + "entropy": 1.8987890288233757, + "epoch": 0.6117677328557402, + "grad_norm": 4.706457614898682, + "learning_rate": 3.234450405906967e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8507528349757194, + "num_tokens": 237437770.0, + "step": 197350 + }, + { + "entropy": 1.851937872171402, + "epoch": 0.6117987319807898, + "grad_norm": 7.817611217498779, + "learning_rate": 3.2343684615472313e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8699416399002076, + "num_tokens": 237449562.0, + "step": 197360 + }, + { + "entropy": 1.8670279935002327, + "epoch": 0.6118297311058395, + "grad_norm": 7.869930744171143, + "learning_rate": 3.2342865234153282e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8561404958367348, + "num_tokens": 237461931.0, + "step": 197370 + }, + { + "entropy": 1.8751569628715514, + "epoch": 0.6118607302308893, + "grad_norm": 8.188568115234375, + "learning_rate": 3.2342045915104675e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8529067426919937, + "num_tokens": 237473861.0, + "step": 197380 + }, + { + "entropy": 1.8241604372859002, + "epoch": 0.6118917293559389, + "grad_norm": 3.8533084392547607, + "learning_rate": 3.2341226658318614e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8676508396863938, + "num_tokens": 237486393.0, + "step": 197390 + }, + { + "entropy": 1.9073212459683417, + "epoch": 0.6119227284809886, + "grad_norm": 7.623655796051025, + "learning_rate": 3.234040746378722e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8545937642455101, + "num_tokens": 237497546.0, + "step": 197400 + }, + { + "entropy": 1.8588381856679916, + "epoch": 0.6119537276060383, + "grad_norm": 7.781834602355957, + "learning_rate": 3.2339588331502598e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8564682066440582, + "num_tokens": 237510315.0, + "step": 197410 + }, + { + "entropy": 1.8095207661390305, + "epoch": 0.6119847267310881, + "grad_norm": 4.215339660644531, + "learning_rate": 3.233876926145686e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8653566733002662, + "num_tokens": 237522650.0, + "step": 197420 + }, + { + "entropy": 1.8454365268349648, + "epoch": 0.6120157258561377, + "grad_norm": 6.672299385070801, + "learning_rate": 3.2337950253642135e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8600479036569595, + "num_tokens": 237534798.0, + "step": 197430 + }, + { + "entropy": 1.7626978799700737, + "epoch": 0.6120467249811874, + "grad_norm": 9.613987922668457, + "learning_rate": 3.2337131308050545e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8646810084581376, + "num_tokens": 237548710.0, + "step": 197440 + }, + { + "entropy": 1.9045309767127037, + "epoch": 0.6120777241062371, + "grad_norm": 8.034784317016602, + "learning_rate": 3.233631242467421e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8537096366286278, + "num_tokens": 237560491.0, + "step": 197450 + }, + { + "entropy": 1.75089842826128, + "epoch": 0.6121087232312868, + "grad_norm": 8.159855842590332, + "learning_rate": 3.2335493603505246e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.869796434044838, + "num_tokens": 237573666.0, + "step": 197460 + }, + { + "entropy": 1.8842484802007675, + "epoch": 0.6121397223563365, + "grad_norm": 7.651088237762451, + "learning_rate": 3.2334674844535783e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8594219341874123, + "num_tokens": 237585141.0, + "step": 197470 + }, + { + "entropy": 1.8589984819293022, + "epoch": 0.6121707214813862, + "grad_norm": 3.8243820667266846, + "learning_rate": 3.233385614775794e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8726780936121941, + "num_tokens": 237596837.0, + "step": 197480 + }, + { + "entropy": 1.7896558150649071, + "epoch": 0.6122017206064359, + "grad_norm": 10.02405834197998, + "learning_rate": 3.2333037513163856e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8653994277119637, + "num_tokens": 237610276.0, + "step": 197490 + }, + { + "entropy": 1.7988707900047303, + "epoch": 0.6122327197314856, + "grad_norm": 7.22122049331665, + "learning_rate": 3.233221894074566e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.863201479613781, + "num_tokens": 237623129.0, + "step": 197500 + }, + { + "entropy": 1.8126246452331543, + "epoch": 0.6122637188565353, + "grad_norm": 8.908790588378906, + "learning_rate": 3.233140043049547e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8658370718359947, + "num_tokens": 237635448.0, + "step": 197510 + }, + { + "entropy": 1.8080999478697777, + "epoch": 0.612294717981585, + "grad_norm": 2.396630048751831, + "learning_rate": 3.233058198240541e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8676091253757476, + "num_tokens": 237648272.0, + "step": 197520 + }, + { + "entropy": 1.7976605847477913, + "epoch": 0.6123257171066346, + "grad_norm": 9.25484561920166, + "learning_rate": 3.232976359646764e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8769566550850868, + "num_tokens": 237660724.0, + "step": 197530 + }, + { + "entropy": 1.9517788350582124, + "epoch": 0.6123567162316843, + "grad_norm": 8.63337230682373, + "learning_rate": 3.232894527267427e-06, + "loss": 0.5639, + "mean_token_accuracy": 0.8373573496937752, + "num_tokens": 237672350.0, + "step": 197540 + }, + { + "entropy": 1.838518961519003, + "epoch": 0.6123877153567341, + "grad_norm": 8.386921882629395, + "learning_rate": 3.232812701101745e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8607329323887825, + "num_tokens": 237684833.0, + "step": 197550 + }, + { + "entropy": 1.9415549844503404, + "epoch": 0.6124187144817838, + "grad_norm": 7.513718605041504, + "learning_rate": 3.2327308811489318e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8599239453673363, + "num_tokens": 237695928.0, + "step": 197560 + }, + { + "entropy": 1.8711989670991898, + "epoch": 0.6124497136068334, + "grad_norm": 2.5814082622528076, + "learning_rate": 3.2326490674081996e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8575176253914834, + "num_tokens": 237708081.0, + "step": 197570 + }, + { + "entropy": 1.9002268463373184, + "epoch": 0.6124807127318831, + "grad_norm": 7.871204853057861, + "learning_rate": 3.2325672598787633e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8506250888109207, + "num_tokens": 237719581.0, + "step": 197580 + }, + { + "entropy": 1.858892673254013, + "epoch": 0.6125117118569329, + "grad_norm": 4.453184127807617, + "learning_rate": 3.232485458559837e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8605369016528129, + "num_tokens": 237731116.0, + "step": 197590 + }, + { + "entropy": 1.8251206517219543, + "epoch": 0.6125427109819825, + "grad_norm": 7.465099334716797, + "learning_rate": 3.232403663450635e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8610186651349068, + "num_tokens": 237743524.0, + "step": 197600 + }, + { + "entropy": 1.8249526843428612, + "epoch": 0.6125737101070322, + "grad_norm": 4.715644836425781, + "learning_rate": 3.2323218745503727e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8602005869150162, + "num_tokens": 237755768.0, + "step": 197610 + }, + { + "entropy": 1.9152856677770616, + "epoch": 0.6126047092320819, + "grad_norm": 3.7345130443573, + "learning_rate": 3.232240091858262e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8612644776701928, + "num_tokens": 237767214.0, + "step": 197620 + }, + { + "entropy": 1.812970919162035, + "epoch": 0.6126357083571317, + "grad_norm": 11.833577156066895, + "learning_rate": 3.23215831537352e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8618756428360939, + "num_tokens": 237780541.0, + "step": 197630 + }, + { + "entropy": 1.8648294284939766, + "epoch": 0.6126667074821813, + "grad_norm": 3.68322491645813, + "learning_rate": 3.2320765450953605e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8567120462656022, + "num_tokens": 237793175.0, + "step": 197640 + }, + { + "entropy": 1.8529816403985024, + "epoch": 0.612697706607231, + "grad_norm": 7.8077592849731445, + "learning_rate": 3.2319947810229992e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.866400220990181, + "num_tokens": 237805646.0, + "step": 197650 + }, + { + "entropy": 1.9075462460517882, + "epoch": 0.6127287057322807, + "grad_norm": 8.350885391235352, + "learning_rate": 3.23191302315565e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8558677047491073, + "num_tokens": 237817020.0, + "step": 197660 + }, + { + "entropy": 1.8251194074749946, + "epoch": 0.6127597048573304, + "grad_norm": 8.18161678314209, + "learning_rate": 3.2318312714925286e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8488647878170014, + "num_tokens": 237830190.0, + "step": 197670 + }, + { + "entropy": 1.910027140378952, + "epoch": 0.6127907039823801, + "grad_norm": 8.298951148986816, + "learning_rate": 3.2317495260328507e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8613663628697396, + "num_tokens": 237841186.0, + "step": 197680 + }, + { + "entropy": 1.8520949259400368, + "epoch": 0.6128217031074298, + "grad_norm": 7.493919372558594, + "learning_rate": 3.231667786775831e-06, + "loss": 0.417, + "mean_token_accuracy": 0.864731340110302, + "num_tokens": 237853366.0, + "step": 197690 + }, + { + "entropy": 1.8635297536849975, + "epoch": 0.6128527022324795, + "grad_norm": 10.218779563903809, + "learning_rate": 3.231586053720686e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8612703114748002, + "num_tokens": 237865400.0, + "step": 197700 + }, + { + "entropy": 1.8062796369194984, + "epoch": 0.6128837013575292, + "grad_norm": 3.485726833343506, + "learning_rate": 3.2315043268666313e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8678748309612274, + "num_tokens": 237877764.0, + "step": 197710 + }, + { + "entropy": 1.8527929231524467, + "epoch": 0.6129147004825789, + "grad_norm": 7.560081958770752, + "learning_rate": 3.231422606212883e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8541894242167473, + "num_tokens": 237889106.0, + "step": 197720 + }, + { + "entropy": 1.8421387121081352, + "epoch": 0.6129456996076286, + "grad_norm": 7.515761852264404, + "learning_rate": 3.2313408917586557e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8652972161769867, + "num_tokens": 237901508.0, + "step": 197730 + }, + { + "entropy": 1.8394855439662934, + "epoch": 0.6129766987326782, + "grad_norm": 4.9240546226501465, + "learning_rate": 3.231259183503168e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.8508085206151008, + "num_tokens": 237913802.0, + "step": 197740 + }, + { + "entropy": 1.884807887673378, + "epoch": 0.613007697857728, + "grad_norm": 8.731439590454102, + "learning_rate": 3.231177481445634e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8633570462465286, + "num_tokens": 237924909.0, + "step": 197750 + }, + { + "entropy": 1.8097985833883286, + "epoch": 0.6130386969827777, + "grad_norm": 8.296426773071289, + "learning_rate": 3.2310957855852715e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.862330450117588, + "num_tokens": 237937637.0, + "step": 197760 + }, + { + "entropy": 1.8103355050086976, + "epoch": 0.6130696961078274, + "grad_norm": 4.088588714599609, + "learning_rate": 3.231014095921296e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8579976573586464, + "num_tokens": 237950579.0, + "step": 197770 + }, + { + "entropy": 1.8344371646642685, + "epoch": 0.613100695232877, + "grad_norm": 7.0518388748168945, + "learning_rate": 3.2309324124529264e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8459786728024483, + "num_tokens": 237963468.0, + "step": 197780 + }, + { + "entropy": 1.8165979743003846, + "epoch": 0.6131316943579267, + "grad_norm": 8.043184280395508, + "learning_rate": 3.230850735179376e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8663666322827339, + "num_tokens": 237976313.0, + "step": 197790 + }, + { + "entropy": 1.81719990670681, + "epoch": 0.6131626934829765, + "grad_norm": 6.555117130279541, + "learning_rate": 3.2307690640998657e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8760808929800987, + "num_tokens": 237989127.0, + "step": 197800 + }, + { + "entropy": 1.8462613999843598, + "epoch": 0.6131936926080261, + "grad_norm": 7.677376747131348, + "learning_rate": 3.2306873992136107e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8611488565802574, + "num_tokens": 238001168.0, + "step": 197810 + }, + { + "entropy": 1.9142689868807792, + "epoch": 0.6132246917330758, + "grad_norm": 6.646963119506836, + "learning_rate": 3.2306057405198276e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8539297908544541, + "num_tokens": 238012767.0, + "step": 197820 + }, + { + "entropy": 1.905218243598938, + "epoch": 0.6132556908581255, + "grad_norm": 8.068191528320312, + "learning_rate": 3.230524088017735e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8539548605680466, + "num_tokens": 238024831.0, + "step": 197830 + }, + { + "entropy": 1.8698325335979462, + "epoch": 0.6132866899831753, + "grad_norm": 7.347733020782471, + "learning_rate": 3.2304424417065505e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8575117960572243, + "num_tokens": 238036523.0, + "step": 197840 + }, + { + "entropy": 1.824147316813469, + "epoch": 0.6133176891082249, + "grad_norm": 9.090344429016113, + "learning_rate": 3.2303608015854916e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8511833533644676, + "num_tokens": 238049547.0, + "step": 197850 + }, + { + "entropy": 1.851954886317253, + "epoch": 0.6133486882332746, + "grad_norm": 8.417724609375, + "learning_rate": 3.2302791676537758e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8720298394560814, + "num_tokens": 238061858.0, + "step": 197860 + }, + { + "entropy": 1.7978620529174805, + "epoch": 0.6133796873583243, + "grad_norm": 4.008333206176758, + "learning_rate": 3.2301975399106215e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8638000205159188, + "num_tokens": 238075560.0, + "step": 197870 + }, + { + "entropy": 1.8916511505842208, + "epoch": 0.613410686483374, + "grad_norm": 7.247588634490967, + "learning_rate": 3.2301159183552466e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8495002686977386, + "num_tokens": 238086563.0, + "step": 197880 + }, + { + "entropy": 1.8061557114124298, + "epoch": 0.6134416856084237, + "grad_norm": 3.697267770767212, + "learning_rate": 3.2300343029868697e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8531102150678634, + "num_tokens": 238099873.0, + "step": 197890 + }, + { + "entropy": 1.9366677463054658, + "epoch": 0.6134726847334734, + "grad_norm": 8.543197631835938, + "learning_rate": 3.229952693804708e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8499609500169754, + "num_tokens": 238111212.0, + "step": 197900 + }, + { + "entropy": 1.9563579097390176, + "epoch": 0.613503683858523, + "grad_norm": 9.433308601379395, + "learning_rate": 3.2298710908079823e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8431360751390458, + "num_tokens": 238122557.0, + "step": 197910 + }, + { + "entropy": 1.815135808289051, + "epoch": 0.6135346829835728, + "grad_norm": 7.3137969970703125, + "learning_rate": 3.2297894939959094e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8702261924743653, + "num_tokens": 238135193.0, + "step": 197920 + }, + { + "entropy": 1.8503555253148078, + "epoch": 0.6135656821086225, + "grad_norm": 4.021295070648193, + "learning_rate": 3.2297079033677094e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8535087972879409, + "num_tokens": 238147844.0, + "step": 197930 + }, + { + "entropy": 1.8699439376592637, + "epoch": 0.6135966812336722, + "grad_norm": 8.971370697021484, + "learning_rate": 3.2296263189225992e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8574245125055313, + "num_tokens": 238159624.0, + "step": 197940 + }, + { + "entropy": 1.837162759900093, + "epoch": 0.6136276803587218, + "grad_norm": 4.629751682281494, + "learning_rate": 3.2295447406598004e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8483832642436028, + "num_tokens": 238172709.0, + "step": 197950 + }, + { + "entropy": 1.7987508580088616, + "epoch": 0.6136586794837716, + "grad_norm": 8.438607215881348, + "learning_rate": 3.22946316857853e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8662834882736206, + "num_tokens": 238185176.0, + "step": 197960 + }, + { + "entropy": 1.833447128534317, + "epoch": 0.6136896786088213, + "grad_norm": 4.577413558959961, + "learning_rate": 3.22938160267801e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8562129974365235, + "num_tokens": 238197981.0, + "step": 197970 + }, + { + "entropy": 1.8880952566862106, + "epoch": 0.613720677733871, + "grad_norm": 8.018782615661621, + "learning_rate": 3.2293000429574573e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8598210200667381, + "num_tokens": 238209028.0, + "step": 197980 + }, + { + "entropy": 1.797704565525055, + "epoch": 0.6137516768589206, + "grad_norm": 4.476016998291016, + "learning_rate": 3.2292184894160926e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8691965594887734, + "num_tokens": 238222150.0, + "step": 197990 + }, + { + "entropy": 1.7164287373423577, + "epoch": 0.6137826759839704, + "grad_norm": 8.452033996582031, + "learning_rate": 3.229136942053136e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8812255963683129, + "num_tokens": 238235915.0, + "step": 198000 + }, + { + "entropy": 1.8963288113474845, + "epoch": 0.6138136751090201, + "grad_norm": 8.194510459899902, + "learning_rate": 3.2290554008678066e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.846800397336483, + "num_tokens": 238248130.0, + "step": 198010 + }, + { + "entropy": 1.9342078655958175, + "epoch": 0.6138446742340697, + "grad_norm": 8.6097993850708, + "learning_rate": 3.2289738658593254e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8513077095150947, + "num_tokens": 238259188.0, + "step": 198020 + }, + { + "entropy": 1.7547577977180482, + "epoch": 0.6138756733591194, + "grad_norm": 6.928021430969238, + "learning_rate": 3.2288923370269125e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8803062126040458, + "num_tokens": 238272312.0, + "step": 198030 + }, + { + "entropy": 1.892691618204117, + "epoch": 0.6139066724841691, + "grad_norm": 8.509113311767578, + "learning_rate": 3.228810814369788e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8577762544155121, + "num_tokens": 238284154.0, + "step": 198040 + }, + { + "entropy": 1.9040351197123528, + "epoch": 0.6139376716092189, + "grad_norm": 9.362594604492188, + "learning_rate": 3.2287292978871717e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.845809280872345, + "num_tokens": 238295201.0, + "step": 198050 + }, + { + "entropy": 1.9087980896234513, + "epoch": 0.6139686707342685, + "grad_norm": 8.504251480102539, + "learning_rate": 3.2286477875782848e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8549018561840057, + "num_tokens": 238306358.0, + "step": 198060 + }, + { + "entropy": 1.8494928494095801, + "epoch": 0.6139996698593182, + "grad_norm": 8.494181632995605, + "learning_rate": 3.2285662834423486e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.853402042388916, + "num_tokens": 238319160.0, + "step": 198070 + }, + { + "entropy": 1.9097889050841332, + "epoch": 0.6140306689843679, + "grad_norm": 9.164204597473145, + "learning_rate": 3.228484785478583e-06, + "loss": 0.5, + "mean_token_accuracy": 0.847290362417698, + "num_tokens": 238330705.0, + "step": 198080 + }, + { + "entropy": 1.8471282333135606, + "epoch": 0.6140616681094176, + "grad_norm": 9.91257381439209, + "learning_rate": 3.2284032936862096e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8610455006361007, + "num_tokens": 238342634.0, + "step": 198090 + }, + { + "entropy": 1.888751471042633, + "epoch": 0.6140926672344673, + "grad_norm": 7.841835975646973, + "learning_rate": 3.2283218080644495e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8621149614453316, + "num_tokens": 238354440.0, + "step": 198100 + }, + { + "entropy": 1.874174064397812, + "epoch": 0.614123666359517, + "grad_norm": 3.7390925884246826, + "learning_rate": 3.2282403286125243e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8606786787509918, + "num_tokens": 238365755.0, + "step": 198110 + }, + { + "entropy": 1.933007836341858, + "epoch": 0.6141546654845667, + "grad_norm": 7.941004753112793, + "learning_rate": 3.2281588553296544e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8500162839889527, + "num_tokens": 238377048.0, + "step": 198120 + }, + { + "entropy": 1.8944669008255004, + "epoch": 0.6141856646096164, + "grad_norm": 9.369904518127441, + "learning_rate": 3.2280773882150623e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8600606888532638, + "num_tokens": 238388297.0, + "step": 198130 + }, + { + "entropy": 1.8376318320631981, + "epoch": 0.6142166637346661, + "grad_norm": 7.919347763061523, + "learning_rate": 3.2279959272679695e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8605689898133277, + "num_tokens": 238399407.0, + "step": 198140 + }, + { + "entropy": 1.8755689665675164, + "epoch": 0.6142476628597158, + "grad_norm": 9.534124374389648, + "learning_rate": 3.2279144724875984e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8619197174906731, + "num_tokens": 238410783.0, + "step": 198150 + }, + { + "entropy": 1.8431221053004265, + "epoch": 0.6142786619847654, + "grad_norm": 3.9078283309936523, + "learning_rate": 3.2278330238731697e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8388959527015686, + "num_tokens": 238422622.0, + "step": 198160 + }, + { + "entropy": 1.8363049641251563, + "epoch": 0.6143096611098152, + "grad_norm": 7.458052158355713, + "learning_rate": 3.2277515814239064e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8575503110885621, + "num_tokens": 238434950.0, + "step": 198170 + }, + { + "entropy": 1.757038240134716, + "epoch": 0.6143406602348649, + "grad_norm": 7.210783004760742, + "learning_rate": 3.2276701451390308e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8728042364120483, + "num_tokens": 238446797.0, + "step": 198180 + }, + { + "entropy": 1.7734515801072122, + "epoch": 0.6143716593599146, + "grad_norm": 8.399144172668457, + "learning_rate": 3.227588715017765e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8601013854146004, + "num_tokens": 238460683.0, + "step": 198190 + }, + { + "entropy": 1.7706726059317588, + "epoch": 0.6144026584849642, + "grad_norm": 7.503983020782471, + "learning_rate": 3.2275072910593307e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8651550650596619, + "num_tokens": 238473268.0, + "step": 198200 + }, + { + "entropy": 1.9047612845897675, + "epoch": 0.614433657610014, + "grad_norm": 8.110784530639648, + "learning_rate": 3.227425873262953e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8519498959183693, + "num_tokens": 238484560.0, + "step": 198210 + }, + { + "entropy": 1.842281450331211, + "epoch": 0.6144646567350637, + "grad_norm": 2.75750470161438, + "learning_rate": 3.227344461627852e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8465745970606804, + "num_tokens": 238496894.0, + "step": 198220 + }, + { + "entropy": 1.8060670062899589, + "epoch": 0.6144956558601133, + "grad_norm": 8.336816787719727, + "learning_rate": 3.227263056153253e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8592736333608627, + "num_tokens": 238509479.0, + "step": 198230 + }, + { + "entropy": 1.8654377192258835, + "epoch": 0.614526654985163, + "grad_norm": 9.700325965881348, + "learning_rate": 3.2271816568383773e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8588502183556557, + "num_tokens": 238520909.0, + "step": 198240 + }, + { + "entropy": 1.8672282606363297, + "epoch": 0.6145576541102128, + "grad_norm": 8.815815925598145, + "learning_rate": 3.2271002636824487e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8577031642198563, + "num_tokens": 238532759.0, + "step": 198250 + }, + { + "entropy": 1.8973318859934807, + "epoch": 0.6145886532352625, + "grad_norm": 7.610883712768555, + "learning_rate": 3.2270188766846907e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.847311581671238, + "num_tokens": 238544572.0, + "step": 198260 + }, + { + "entropy": 1.8408987671136856, + "epoch": 0.6146196523603121, + "grad_norm": 7.623629570007324, + "learning_rate": 3.2269374958443272e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8681899920105934, + "num_tokens": 238557228.0, + "step": 198270 + }, + { + "entropy": 1.8358715653419495, + "epoch": 0.6146506514853618, + "grad_norm": 4.767894744873047, + "learning_rate": 3.226856121160581e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8569270819425583, + "num_tokens": 238569345.0, + "step": 198280 + }, + { + "entropy": 1.8315407454967498, + "epoch": 0.6146816506104115, + "grad_norm": 10.133764266967773, + "learning_rate": 3.2267747526326765e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.870010358095169, + "num_tokens": 238581506.0, + "step": 198290 + }, + { + "entropy": 1.861008095741272, + "epoch": 0.6147126497354612, + "grad_norm": 7.7988152503967285, + "learning_rate": 3.226693390259837e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8421275913715363, + "num_tokens": 238593036.0, + "step": 198300 + }, + { + "entropy": 1.90427585542202, + "epoch": 0.6147436488605109, + "grad_norm": 8.522828102111816, + "learning_rate": 3.226612034041287e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8471856519579888, + "num_tokens": 238604178.0, + "step": 198310 + }, + { + "entropy": 1.7154094487428666, + "epoch": 0.6147746479855606, + "grad_norm": 7.768447399139404, + "learning_rate": 3.226530683976251e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8743997603654862, + "num_tokens": 238617882.0, + "step": 198320 + }, + { + "entropy": 1.9018635407090188, + "epoch": 0.6148056471106103, + "grad_norm": 8.199962615966797, + "learning_rate": 3.2264493400639522e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8509716719388962, + "num_tokens": 238629421.0, + "step": 198330 + }, + { + "entropy": 1.8207050204277038, + "epoch": 0.61483664623566, + "grad_norm": 7.46588134765625, + "learning_rate": 3.2263680023036163e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8665485486388207, + "num_tokens": 238642304.0, + "step": 198340 + }, + { + "entropy": 1.8872560843825341, + "epoch": 0.6148676453607097, + "grad_norm": 6.688050270080566, + "learning_rate": 3.226286670694467e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8600884407758713, + "num_tokens": 238653847.0, + "step": 198350 + }, + { + "entropy": 1.8786954209208488, + "epoch": 0.6148986444857594, + "grad_norm": 8.85261344909668, + "learning_rate": 3.22620534523573e-06, + "loss": 0.5395, + "mean_token_accuracy": 0.8428824350237847, + "num_tokens": 238665976.0, + "step": 198360 + }, + { + "entropy": 1.865576508641243, + "epoch": 0.614929643610809, + "grad_norm": 7.852488994598389, + "learning_rate": 3.226124025926629e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8543688908219338, + "num_tokens": 238678533.0, + "step": 198370 + }, + { + "entropy": 1.8879351884126663, + "epoch": 0.6149606427358588, + "grad_norm": 8.607827186584473, + "learning_rate": 3.22604271276639e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8636692270636559, + "num_tokens": 238690116.0, + "step": 198380 + }, + { + "entropy": 1.874523502588272, + "epoch": 0.6149916418609085, + "grad_norm": 8.846104621887207, + "learning_rate": 3.2259614057542377e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8568495228886605, + "num_tokens": 238701949.0, + "step": 198390 + }, + { + "entropy": 1.8623564064502716, + "epoch": 0.6150226409859582, + "grad_norm": 6.648087978363037, + "learning_rate": 3.2258801048893974e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8549107179045677, + "num_tokens": 238713655.0, + "step": 198400 + }, + { + "entropy": 1.938394169509411, + "epoch": 0.6150536401110078, + "grad_norm": 8.953487396240234, + "learning_rate": 3.2257988101710937e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8423136845231056, + "num_tokens": 238724982.0, + "step": 198410 + }, + { + "entropy": 1.8558956488966942, + "epoch": 0.6150846392360576, + "grad_norm": 4.239894390106201, + "learning_rate": 3.2257175215985535e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8487705007195473, + "num_tokens": 238737393.0, + "step": 198420 + }, + { + "entropy": 1.8345892995595932, + "epoch": 0.6151156383611073, + "grad_norm": 4.324648380279541, + "learning_rate": 3.2256362391710023e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8563659995794296, + "num_tokens": 238749535.0, + "step": 198430 + }, + { + "entropy": 1.9343143850564957, + "epoch": 0.6151466374861569, + "grad_norm": 8.221383094787598, + "learning_rate": 3.2255549628876655e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.847763192653656, + "num_tokens": 238760327.0, + "step": 198440 + }, + { + "entropy": 1.8079216808080674, + "epoch": 0.6151776366112066, + "grad_norm": 8.265410423278809, + "learning_rate": 3.225473692747769e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8656876266002655, + "num_tokens": 238773252.0, + "step": 198450 + }, + { + "entropy": 1.8814214587211608, + "epoch": 0.6152086357362564, + "grad_norm": 8.192183494567871, + "learning_rate": 3.2253924287505382e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8665597438812256, + "num_tokens": 238784746.0, + "step": 198460 + }, + { + "entropy": 1.8934792011976243, + "epoch": 0.6152396348613061, + "grad_norm": 9.08458137512207, + "learning_rate": 3.2253111708952007e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8449272364377975, + "num_tokens": 238796524.0, + "step": 198470 + }, + { + "entropy": 1.8874700501561166, + "epoch": 0.6152706339863557, + "grad_norm": 8.042974472045898, + "learning_rate": 3.225229919180983e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8547151833772659, + "num_tokens": 238807564.0, + "step": 198480 + }, + { + "entropy": 1.8576602458953857, + "epoch": 0.6153016331114054, + "grad_norm": 3.5211164951324463, + "learning_rate": 3.22514867360711e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8638330847024918, + "num_tokens": 238819320.0, + "step": 198490 + }, + { + "entropy": 1.9324481308460235, + "epoch": 0.6153326322364552, + "grad_norm": 8.405794143676758, + "learning_rate": 3.22506743417281e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8566653996706008, + "num_tokens": 238830154.0, + "step": 198500 + }, + { + "entropy": 1.8496293708682061, + "epoch": 0.6153636313615048, + "grad_norm": 8.828564643859863, + "learning_rate": 3.224986200877309e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8587576478719712, + "num_tokens": 238842348.0, + "step": 198510 + }, + { + "entropy": 1.9132272452116013, + "epoch": 0.6153946304865545, + "grad_norm": 7.063823223114014, + "learning_rate": 3.2249049737198336e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8504229575395584, + "num_tokens": 238853792.0, + "step": 198520 + }, + { + "entropy": 1.8963459238409996, + "epoch": 0.6154256296116042, + "grad_norm": 3.3602142333984375, + "learning_rate": 3.224823752699611e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8591250211000443, + "num_tokens": 238865334.0, + "step": 198530 + }, + { + "entropy": 1.8504900515079499, + "epoch": 0.6154566287366539, + "grad_norm": 9.155741691589355, + "learning_rate": 3.224742537815869e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.862186798453331, + "num_tokens": 238877468.0, + "step": 198540 + }, + { + "entropy": 1.8866616129875182, + "epoch": 0.6154876278617036, + "grad_norm": 9.354654312133789, + "learning_rate": 3.2246613290678347e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.84332734644413, + "num_tokens": 238887991.0, + "step": 198550 + }, + { + "entropy": 1.8416400015354157, + "epoch": 0.6155186269867533, + "grad_norm": 3.3999783992767334, + "learning_rate": 3.224580126454735e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8651275053620339, + "num_tokens": 238900140.0, + "step": 198560 + }, + { + "entropy": 1.8191387727856636, + "epoch": 0.615549626111803, + "grad_norm": 7.950167655944824, + "learning_rate": 3.2244989299757985e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.873188602924347, + "num_tokens": 238911988.0, + "step": 198570 + }, + { + "entropy": 1.8519021973013878, + "epoch": 0.6155806252368526, + "grad_norm": 2.7390058040618896, + "learning_rate": 3.224417739630251e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8561043664813042, + "num_tokens": 238924394.0, + "step": 198580 + }, + { + "entropy": 1.8018446296453476, + "epoch": 0.6156116243619024, + "grad_norm": 4.802493095397949, + "learning_rate": 3.2243365554173233e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8607585147023201, + "num_tokens": 238937932.0, + "step": 198590 + }, + { + "entropy": 1.8335761934518815, + "epoch": 0.6156426234869521, + "grad_norm": 7.089979648590088, + "learning_rate": 3.224255377336241e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8711427718400955, + "num_tokens": 238950359.0, + "step": 198600 + }, + { + "entropy": 1.814923171699047, + "epoch": 0.6156736226120018, + "grad_norm": 9.263649940490723, + "learning_rate": 3.224174205386233e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8567982107400894, + "num_tokens": 238963129.0, + "step": 198610 + }, + { + "entropy": 1.9202240645885467, + "epoch": 0.6157046217370514, + "grad_norm": 6.889978885650635, + "learning_rate": 3.224093039566528e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8707507133483887, + "num_tokens": 238974124.0, + "step": 198620 + }, + { + "entropy": 1.9241647884249686, + "epoch": 0.6157356208621012, + "grad_norm": 8.047388076782227, + "learning_rate": 3.224011879876354e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8526466712355614, + "num_tokens": 238985580.0, + "step": 198630 + }, + { + "entropy": 1.887078620493412, + "epoch": 0.6157666199871509, + "grad_norm": 9.372336387634277, + "learning_rate": 3.2239307263149396e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8520206958055496, + "num_tokens": 238997245.0, + "step": 198640 + }, + { + "entropy": 1.8250777184963227, + "epoch": 0.6157976191122005, + "grad_norm": 4.112492084503174, + "learning_rate": 3.2238495788815127e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8616786897182465, + "num_tokens": 239010115.0, + "step": 198650 + }, + { + "entropy": 1.8797308295965194, + "epoch": 0.6158286182372502, + "grad_norm": 7.629178047180176, + "learning_rate": 3.223768437575303e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8530390352010727, + "num_tokens": 239022003.0, + "step": 198660 + }, + { + "entropy": 1.9422622606158257, + "epoch": 0.6158596173623, + "grad_norm": 8.976255416870117, + "learning_rate": 3.22368730239554e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.855775335431099, + "num_tokens": 239033410.0, + "step": 198670 + }, + { + "entropy": 1.924929141998291, + "epoch": 0.6158906164873497, + "grad_norm": 9.170016288757324, + "learning_rate": 3.2236061733414515e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8503917545080185, + "num_tokens": 239044548.0, + "step": 198680 + }, + { + "entropy": 1.8258700281381608, + "epoch": 0.6159216156123993, + "grad_norm": 7.752175331115723, + "learning_rate": 3.2235250504122685e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8530758947134018, + "num_tokens": 239058126.0, + "step": 198690 + }, + { + "entropy": 1.8341179564595222, + "epoch": 0.615952614737449, + "grad_norm": 8.030282974243164, + "learning_rate": 3.2234439336072184e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8733653351664543, + "num_tokens": 239070897.0, + "step": 198700 + }, + { + "entropy": 1.9363200575113297, + "epoch": 0.6159836138624988, + "grad_norm": 3.9513907432556152, + "learning_rate": 3.2233628229255317e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8473497003316879, + "num_tokens": 239082965.0, + "step": 198710 + }, + { + "entropy": 1.8560904487967491, + "epoch": 0.6160146129875484, + "grad_norm": 8.370074272155762, + "learning_rate": 3.223281718366438e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8634010136127472, + "num_tokens": 239094946.0, + "step": 198720 + }, + { + "entropy": 1.9300329357385635, + "epoch": 0.6160456121125981, + "grad_norm": 7.569369792938232, + "learning_rate": 3.2232006199291665e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8554398715496063, + "num_tokens": 239106368.0, + "step": 198730 + }, + { + "entropy": 1.8099862188100815, + "epoch": 0.6160766112376478, + "grad_norm": 8.18184757232666, + "learning_rate": 3.223119527612948e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8559572860598564, + "num_tokens": 239119049.0, + "step": 198740 + }, + { + "entropy": 1.869805746525526, + "epoch": 0.6161076103626976, + "grad_norm": 3.7304792404174805, + "learning_rate": 3.223038441417012e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8575934499502182, + "num_tokens": 239130718.0, + "step": 198750 + }, + { + "entropy": 1.8467522531747818, + "epoch": 0.6161386094877472, + "grad_norm": 9.405539512634277, + "learning_rate": 3.2229573613405883e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8660490214824677, + "num_tokens": 239142637.0, + "step": 198760 + }, + { + "entropy": 1.860187816619873, + "epoch": 0.6161696086127969, + "grad_norm": 9.967666625976562, + "learning_rate": 3.2228762873829078e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8545159950852395, + "num_tokens": 239154629.0, + "step": 198770 + }, + { + "entropy": 1.8846646919846535, + "epoch": 0.6162006077378466, + "grad_norm": 8.279898643493652, + "learning_rate": 3.222795219543201e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8574783384799958, + "num_tokens": 239165944.0, + "step": 198780 + }, + { + "entropy": 1.8572672247886657, + "epoch": 0.6162316068628962, + "grad_norm": 6.9327521324157715, + "learning_rate": 3.2227141578206988e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8502766817808152, + "num_tokens": 239178303.0, + "step": 198790 + }, + { + "entropy": 1.9343391239643097, + "epoch": 0.616262605987946, + "grad_norm": 9.41601848602295, + "learning_rate": 3.2226331022146312e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8539301916956902, + "num_tokens": 239188981.0, + "step": 198800 + }, + { + "entropy": 1.9151773989200591, + "epoch": 0.6162936051129957, + "grad_norm": 8.566873550415039, + "learning_rate": 3.222552052724229e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8554543927311897, + "num_tokens": 239199823.0, + "step": 198810 + }, + { + "entropy": 1.8708260178565979, + "epoch": 0.6163246042380454, + "grad_norm": 7.07456636428833, + "learning_rate": 3.2224710093487233e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8673142939805984, + "num_tokens": 239211868.0, + "step": 198820 + }, + { + "entropy": 1.8244033619761466, + "epoch": 0.616355603363095, + "grad_norm": 7.7489800453186035, + "learning_rate": 3.222389972087346e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8668134346604347, + "num_tokens": 239224272.0, + "step": 198830 + }, + { + "entropy": 1.8696729049086571, + "epoch": 0.6163866024881448, + "grad_norm": 8.857129096984863, + "learning_rate": 3.2223089409393272e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8558687403798103, + "num_tokens": 239236145.0, + "step": 198840 + }, + { + "entropy": 1.862788826227188, + "epoch": 0.6164176016131945, + "grad_norm": 8.101411819458008, + "learning_rate": 3.2222279159038993e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8538533791899681, + "num_tokens": 239248699.0, + "step": 198850 + }, + { + "entropy": 1.9156145766377448, + "epoch": 0.6164486007382441, + "grad_norm": 7.721922397613525, + "learning_rate": 3.222146896980293e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8570765167474746, + "num_tokens": 239260281.0, + "step": 198860 + }, + { + "entropy": 1.8555357545614242, + "epoch": 0.6164795998632938, + "grad_norm": 7.878803730010986, + "learning_rate": 3.222065884167741e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8711650311946869, + "num_tokens": 239272202.0, + "step": 198870 + }, + { + "entropy": 1.8976218074560165, + "epoch": 0.6165105989883436, + "grad_norm": 7.291401386260986, + "learning_rate": 3.2219848774654737e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8570414736866951, + "num_tokens": 239283994.0, + "step": 198880 + }, + { + "entropy": 1.9023361951112747, + "epoch": 0.6165415981133933, + "grad_norm": 4.05077600479126, + "learning_rate": 3.2219038768727244e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8495072141289711, + "num_tokens": 239296291.0, + "step": 198890 + }, + { + "entropy": 1.8692690685391427, + "epoch": 0.6165725972384429, + "grad_norm": 8.19445514678955, + "learning_rate": 3.221822882388725e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8572457298636437, + "num_tokens": 239308415.0, + "step": 198900 + }, + { + "entropy": 1.84363332092762, + "epoch": 0.6166035963634926, + "grad_norm": 8.040933609008789, + "learning_rate": 3.221741894012706e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8483372122049332, + "num_tokens": 239321387.0, + "step": 198910 + }, + { + "entropy": 1.9250966876745224, + "epoch": 0.6166345954885424, + "grad_norm": 8.463235855102539, + "learning_rate": 3.221660911743902e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8497035041451454, + "num_tokens": 239332518.0, + "step": 198920 + }, + { + "entropy": 1.795482875406742, + "epoch": 0.616665594613592, + "grad_norm": 9.157936096191406, + "learning_rate": 3.221579935581544e-06, + "loss": 0.387, + "mean_token_accuracy": 0.867797677218914, + "num_tokens": 239346161.0, + "step": 198930 + }, + { + "entropy": 1.9090109691023827, + "epoch": 0.6166965937386417, + "grad_norm": 7.305553436279297, + "learning_rate": 3.221498965524865e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8539376124739647, + "num_tokens": 239357090.0, + "step": 198940 + }, + { + "entropy": 1.8361437231302262, + "epoch": 0.6167275928636914, + "grad_norm": 7.487894058227539, + "learning_rate": 3.221418001573099e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8567282602190971, + "num_tokens": 239369359.0, + "step": 198950 + }, + { + "entropy": 1.8894758448004723, + "epoch": 0.6167585919887412, + "grad_norm": 5.915857315063477, + "learning_rate": 3.221337043725477e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8565126731991768, + "num_tokens": 239381084.0, + "step": 198960 + }, + { + "entropy": 1.953935231268406, + "epoch": 0.6167895911137908, + "grad_norm": 8.806802749633789, + "learning_rate": 3.2212560919812327e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8550879210233688, + "num_tokens": 239392342.0, + "step": 198970 + }, + { + "entropy": 1.848163591325283, + "epoch": 0.6168205902388405, + "grad_norm": 3.8063647747039795, + "learning_rate": 3.2211751463395992e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.862635625898838, + "num_tokens": 239404961.0, + "step": 198980 + }, + { + "entropy": 1.9147900685667991, + "epoch": 0.6168515893638902, + "grad_norm": 8.159584999084473, + "learning_rate": 3.2210942067998103e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8590657487511635, + "num_tokens": 239416335.0, + "step": 198990 + }, + { + "entropy": 1.9331546038389207, + "epoch": 0.61688258848894, + "grad_norm": 10.028764724731445, + "learning_rate": 3.221013273361099e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8451780170202255, + "num_tokens": 239427470.0, + "step": 199000 + }, + { + "entropy": 1.8391878455877304, + "epoch": 0.6169135876139896, + "grad_norm": 8.022989273071289, + "learning_rate": 3.220932346022699e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8623939231038094, + "num_tokens": 239440496.0, + "step": 199010 + }, + { + "entropy": 1.8676461964845656, + "epoch": 0.6169445867390393, + "grad_norm": 3.621432065963745, + "learning_rate": 3.220851424783843e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8650773599743843, + "num_tokens": 239452244.0, + "step": 199020 + }, + { + "entropy": 1.896689024567604, + "epoch": 0.616975585864089, + "grad_norm": 8.148860931396484, + "learning_rate": 3.2207705096437664e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8594577699899674, + "num_tokens": 239462795.0, + "step": 199030 + }, + { + "entropy": 1.82914230376482, + "epoch": 0.6170065849891386, + "grad_norm": 8.967019081115723, + "learning_rate": 3.220689600601702e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.857168261706829, + "num_tokens": 239476029.0, + "step": 199040 + }, + { + "entropy": 1.8706032626330853, + "epoch": 0.6170375841141884, + "grad_norm": 8.116171836853027, + "learning_rate": 3.2206086976568857e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8567556083202362, + "num_tokens": 239487901.0, + "step": 199050 + }, + { + "entropy": 1.8878035575151444, + "epoch": 0.6170685832392381, + "grad_norm": 3.669631004333496, + "learning_rate": 3.220527800808549e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8642433434724808, + "num_tokens": 239500013.0, + "step": 199060 + }, + { + "entropy": 1.8171610817313195, + "epoch": 0.6170995823642877, + "grad_norm": 2.781083822250366, + "learning_rate": 3.220446910055929e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8732234045863152, + "num_tokens": 239512409.0, + "step": 199070 + }, + { + "entropy": 1.8164382711052895, + "epoch": 0.6171305814893374, + "grad_norm": 7.623589038848877, + "learning_rate": 3.2203660253982575e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8559349969029426, + "num_tokens": 239525947.0, + "step": 199080 + }, + { + "entropy": 1.7972185373306275, + "epoch": 0.6171615806143872, + "grad_norm": 7.532422065734863, + "learning_rate": 3.2202851468347713e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8654538482427597, + "num_tokens": 239539242.0, + "step": 199090 + }, + { + "entropy": 1.9267156958580016, + "epoch": 0.6171925797394369, + "grad_norm": 8.321616172790527, + "learning_rate": 3.2202042743647046e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8527379527688026, + "num_tokens": 239550383.0, + "step": 199100 + }, + { + "entropy": 1.8652078002691268, + "epoch": 0.6172235788644865, + "grad_norm": 8.199543952941895, + "learning_rate": 3.220123407987291e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8678713470697403, + "num_tokens": 239562372.0, + "step": 199110 + }, + { + "entropy": 1.8801757156848908, + "epoch": 0.6172545779895362, + "grad_norm": 4.798285484313965, + "learning_rate": 3.2200425477017676e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8524357378482819, + "num_tokens": 239574140.0, + "step": 199120 + }, + { + "entropy": 1.871886597573757, + "epoch": 0.617285577114586, + "grad_norm": 9.886223793029785, + "learning_rate": 3.2199616935073684e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8601740583777427, + "num_tokens": 239586553.0, + "step": 199130 + }, + { + "entropy": 1.8011711463332176, + "epoch": 0.6173165762396357, + "grad_norm": 3.447469472885132, + "learning_rate": 3.219880845403329e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8566708326339721, + "num_tokens": 239599835.0, + "step": 199140 + }, + { + "entropy": 1.953110656142235, + "epoch": 0.6173475753646853, + "grad_norm": 7.4284796714782715, + "learning_rate": 3.2198000033888844e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8602241560816765, + "num_tokens": 239610521.0, + "step": 199150 + }, + { + "entropy": 1.7998080857098102, + "epoch": 0.617378574489735, + "grad_norm": 4.082189559936523, + "learning_rate": 3.2197191674632703e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8608902215957641, + "num_tokens": 239623309.0, + "step": 199160 + }, + { + "entropy": 1.7854876972734928, + "epoch": 0.6174095736147848, + "grad_norm": 10.280808448791504, + "learning_rate": 3.2196383376257232e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8712309464812279, + "num_tokens": 239637541.0, + "step": 199170 + }, + { + "entropy": 1.941126611828804, + "epoch": 0.6174405727398344, + "grad_norm": 7.390081882476807, + "learning_rate": 3.219557513875478e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8549496442079544, + "num_tokens": 239649091.0, + "step": 199180 + }, + { + "entropy": 1.7639221414923667, + "epoch": 0.6174715718648841, + "grad_norm": 6.774654388427734, + "learning_rate": 3.2194766962117714e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8723353713750839, + "num_tokens": 239662638.0, + "step": 199190 + }, + { + "entropy": 1.945545607805252, + "epoch": 0.6175025709899338, + "grad_norm": 8.495312690734863, + "learning_rate": 3.219395884633839e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.8469432502985, + "num_tokens": 239673886.0, + "step": 199200 + }, + { + "entropy": 1.8317839153110982, + "epoch": 0.6175335701149836, + "grad_norm": 9.136716842651367, + "learning_rate": 3.219315079140918e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8626415833830834, + "num_tokens": 239686394.0, + "step": 199210 + }, + { + "entropy": 1.856505075097084, + "epoch": 0.6175645692400332, + "grad_norm": 3.8257100582122803, + "learning_rate": 3.219234279732243e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.857763585448265, + "num_tokens": 239699160.0, + "step": 199220 + }, + { + "entropy": 1.9680979698896408, + "epoch": 0.6175955683650829, + "grad_norm": 10.98916244506836, + "learning_rate": 3.219153486407052e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8537120461463928, + "num_tokens": 239709462.0, + "step": 199230 + }, + { + "entropy": 1.8636929139494895, + "epoch": 0.6176265674901326, + "grad_norm": 9.323714256286621, + "learning_rate": 3.2190726991645806e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8667399495840072, + "num_tokens": 239721459.0, + "step": 199240 + }, + { + "entropy": 1.7128648608922958, + "epoch": 0.6176575666151823, + "grad_norm": 3.4446444511413574, + "learning_rate": 3.2189919180040667e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8705388396978379, + "num_tokens": 239735044.0, + "step": 199250 + }, + { + "entropy": 1.8738186940550805, + "epoch": 0.617688565740232, + "grad_norm": 9.5772123336792, + "learning_rate": 3.2189111429247467e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8661644160747528, + "num_tokens": 239747635.0, + "step": 199260 + }, + { + "entropy": 1.867714224755764, + "epoch": 0.6177195648652817, + "grad_norm": 8.063520431518555, + "learning_rate": 3.2188303739258574e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8557335734367371, + "num_tokens": 239759230.0, + "step": 199270 + }, + { + "entropy": 1.8949234545230866, + "epoch": 0.6177505639903313, + "grad_norm": 9.034475326538086, + "learning_rate": 3.2187496110066364e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8573550119996071, + "num_tokens": 239770185.0, + "step": 199280 + }, + { + "entropy": 1.9065477907657624, + "epoch": 0.617781563115381, + "grad_norm": 8.86186408996582, + "learning_rate": 3.218668854166321e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8593709141016006, + "num_tokens": 239781844.0, + "step": 199290 + }, + { + "entropy": 1.8973913431167602, + "epoch": 0.6178125622404308, + "grad_norm": 9.058403968811035, + "learning_rate": 3.2185881034041483e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8546336054801941, + "num_tokens": 239793680.0, + "step": 199300 + }, + { + "entropy": 1.8704157873988152, + "epoch": 0.6178435613654805, + "grad_norm": 3.7460262775421143, + "learning_rate": 3.218507358719356e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.871597295999527, + "num_tokens": 239805800.0, + "step": 199310 + }, + { + "entropy": 1.9420752674341202, + "epoch": 0.6178745604905301, + "grad_norm": 4.6435346603393555, + "learning_rate": 3.2184266201111815e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8475510746240615, + "num_tokens": 239817299.0, + "step": 199320 + }, + { + "entropy": 1.8781623139977455, + "epoch": 0.6179055596155798, + "grad_norm": 6.959751129150391, + "learning_rate": 3.218345887578864e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8654584184288978, + "num_tokens": 239829124.0, + "step": 199330 + }, + { + "entropy": 1.9558769553899764, + "epoch": 0.6179365587406296, + "grad_norm": 7.644688606262207, + "learning_rate": 3.2182651611216393e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8626610890030861, + "num_tokens": 239840235.0, + "step": 199340 + }, + { + "entropy": 1.9544329941272736, + "epoch": 0.6179675578656793, + "grad_norm": 7.619667053222656, + "learning_rate": 3.2181844407387477e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8480888634920121, + "num_tokens": 239850897.0, + "step": 199350 + }, + { + "entropy": 1.9408447206020356, + "epoch": 0.6179985569907289, + "grad_norm": 9.406037330627441, + "learning_rate": 3.218103726429427e-06, + "loss": 0.5414, + "mean_token_accuracy": 0.8272959098219872, + "num_tokens": 239862352.0, + "step": 199360 + }, + { + "entropy": 1.8339162483811378, + "epoch": 0.6180295561157786, + "grad_norm": 7.903733253479004, + "learning_rate": 3.218023018192914e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.864348916709423, + "num_tokens": 239875037.0, + "step": 199370 + }, + { + "entropy": 1.8993135869503022, + "epoch": 0.6180605552408284, + "grad_norm": 7.121351718902588, + "learning_rate": 3.2179423160284488e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8541018515825272, + "num_tokens": 239886000.0, + "step": 199380 + }, + { + "entropy": 1.9435019135475158, + "epoch": 0.618091554365878, + "grad_norm": 8.412862777709961, + "learning_rate": 3.2178616199352696e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8492094442248345, + "num_tokens": 239897551.0, + "step": 199390 + }, + { + "entropy": 1.8815071359276772, + "epoch": 0.6181225534909277, + "grad_norm": 10.336962699890137, + "learning_rate": 3.217780929912615e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8544108346104622, + "num_tokens": 239908859.0, + "step": 199400 + }, + { + "entropy": 1.789330853521824, + "epoch": 0.6181535526159774, + "grad_norm": 9.012761116027832, + "learning_rate": 3.2177002459597244e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8624624639749527, + "num_tokens": 239922173.0, + "step": 199410 + }, + { + "entropy": 1.7700959324836731, + "epoch": 0.6181845517410272, + "grad_norm": 8.16477108001709, + "learning_rate": 3.2176195680758367e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8740750655531884, + "num_tokens": 239935765.0, + "step": 199420 + }, + { + "entropy": 1.898264265060425, + "epoch": 0.6182155508660768, + "grad_norm": 7.839510917663574, + "learning_rate": 3.2175388962601905e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8386017501354217, + "num_tokens": 239947401.0, + "step": 199430 + }, + { + "entropy": 1.8775344803929328, + "epoch": 0.6182465499911265, + "grad_norm": 8.23879337310791, + "learning_rate": 3.217458230512026e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8576970815658569, + "num_tokens": 239958844.0, + "step": 199440 + }, + { + "entropy": 1.9250696122646331, + "epoch": 0.6182775491161762, + "grad_norm": 8.848472595214844, + "learning_rate": 3.217377570830582e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8477827280759811, + "num_tokens": 239969861.0, + "step": 199450 + }, + { + "entropy": 1.822139708697796, + "epoch": 0.6183085482412259, + "grad_norm": 6.093864440917969, + "learning_rate": 3.2172969172150988e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8548695519566536, + "num_tokens": 239982644.0, + "step": 199460 + }, + { + "entropy": 1.8985069781541823, + "epoch": 0.6183395473662756, + "grad_norm": 7.458169460296631, + "learning_rate": 3.2172162696648153e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8565488919615746, + "num_tokens": 239994171.0, + "step": 199470 + }, + { + "entropy": 1.8331923454999923, + "epoch": 0.6183705464913253, + "grad_norm": 7.46090841293335, + "learning_rate": 3.217135628178972e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8442671373486519, + "num_tokens": 240007032.0, + "step": 199480 + }, + { + "entropy": 1.864036823809147, + "epoch": 0.618401545616375, + "grad_norm": 3.682788848876953, + "learning_rate": 3.2170549927568086e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8551156163215637, + "num_tokens": 240019306.0, + "step": 199490 + }, + { + "entropy": 1.879542638361454, + "epoch": 0.6184325447414247, + "grad_norm": 8.343369483947754, + "learning_rate": 3.216974363397565e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.856440258026123, + "num_tokens": 240031122.0, + "step": 199500 + }, + { + "entropy": 1.8944496288895607, + "epoch": 0.6184635438664744, + "grad_norm": 8.23908519744873, + "learning_rate": 3.2168937401004816e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8619818806648254, + "num_tokens": 240043420.0, + "step": 199510 + }, + { + "entropy": 1.831877386569977, + "epoch": 0.6184945429915241, + "grad_norm": 8.815319061279297, + "learning_rate": 3.2168131228647997e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8666740179061889, + "num_tokens": 240056129.0, + "step": 199520 + }, + { + "entropy": 1.861776551604271, + "epoch": 0.6185255421165737, + "grad_norm": 10.601828575134277, + "learning_rate": 3.216732511689758e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8628039136528969, + "num_tokens": 240067794.0, + "step": 199530 + }, + { + "entropy": 1.8404061213135718, + "epoch": 0.6185565412416234, + "grad_norm": 8.904887199401855, + "learning_rate": 3.2166519065745986e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8643636539578438, + "num_tokens": 240080470.0, + "step": 199540 + }, + { + "entropy": 1.9593759417533874, + "epoch": 0.6185875403666732, + "grad_norm": 8.867597579956055, + "learning_rate": 3.216571307518563e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.837907250225544, + "num_tokens": 240091238.0, + "step": 199550 + }, + { + "entropy": 1.8562731474637986, + "epoch": 0.6186185394917229, + "grad_norm": 4.115611553192139, + "learning_rate": 3.2164907145208897e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.850125202536583, + "num_tokens": 240103551.0, + "step": 199560 + }, + { + "entropy": 1.8324606716632843, + "epoch": 0.6186495386167725, + "grad_norm": 4.200334072113037, + "learning_rate": 3.2164101275808217e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.863725571334362, + "num_tokens": 240115791.0, + "step": 199570 + }, + { + "entropy": 1.8296230912208558, + "epoch": 0.6186805377418222, + "grad_norm": 7.744045734405518, + "learning_rate": 3.2163295466975992e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8615129724144935, + "num_tokens": 240128381.0, + "step": 199580 + }, + { + "entropy": 1.860279454290867, + "epoch": 0.618711536866872, + "grad_norm": 10.000602722167969, + "learning_rate": 3.2162489718704637e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8575031846761704, + "num_tokens": 240140364.0, + "step": 199590 + }, + { + "entropy": 1.733563742041588, + "epoch": 0.6187425359919216, + "grad_norm": 1.9951728582382202, + "learning_rate": 3.2161684030986574e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8683977887034416, + "num_tokens": 240153746.0, + "step": 199600 + }, + { + "entropy": 1.7496012575924396, + "epoch": 0.6187735351169713, + "grad_norm": 10.270469665527344, + "learning_rate": 3.216087840381421e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8686364412307739, + "num_tokens": 240167779.0, + "step": 199610 + }, + { + "entropy": 1.8671685114502907, + "epoch": 0.618804534242021, + "grad_norm": 6.848788738250732, + "learning_rate": 3.2160072837179973e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.859526140987873, + "num_tokens": 240179637.0, + "step": 199620 + }, + { + "entropy": 1.851687641441822, + "epoch": 0.6188355333670708, + "grad_norm": 7.782304286956787, + "learning_rate": 3.2159267331076264e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8539917662739753, + "num_tokens": 240191476.0, + "step": 199630 + }, + { + "entropy": 1.8153897300362587, + "epoch": 0.6188665324921204, + "grad_norm": 8.59154224395752, + "learning_rate": 3.2158461885495516e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8624886780977249, + "num_tokens": 240204196.0, + "step": 199640 + }, + { + "entropy": 1.8622885420918465, + "epoch": 0.6188975316171701, + "grad_norm": 7.064817428588867, + "learning_rate": 3.215765650043015e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8695329040288925, + "num_tokens": 240216191.0, + "step": 199650 + }, + { + "entropy": 1.8840928718447685, + "epoch": 0.6189285307422198, + "grad_norm": 7.8145341873168945, + "learning_rate": 3.215685117587258e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8516005665063858, + "num_tokens": 240227499.0, + "step": 199660 + }, + { + "entropy": 1.9540878593921662, + "epoch": 0.6189595298672695, + "grad_norm": 7.2002105712890625, + "learning_rate": 3.2156045911815255e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8494768559932708, + "num_tokens": 240238394.0, + "step": 199670 + }, + { + "entropy": 1.8394357457756996, + "epoch": 0.6189905289923192, + "grad_norm": 7.4307169914245605, + "learning_rate": 3.215524070825056e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8646936297416687, + "num_tokens": 240250932.0, + "step": 199680 + }, + { + "entropy": 1.9449533462524413, + "epoch": 0.6190215281173689, + "grad_norm": 9.829404830932617, + "learning_rate": 3.2154435565170947e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8556923270225525, + "num_tokens": 240261925.0, + "step": 199690 + }, + { + "entropy": 1.845785889029503, + "epoch": 0.6190525272424185, + "grad_norm": 4.0769944190979, + "learning_rate": 3.2153630482568847e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.857506263256073, + "num_tokens": 240274462.0, + "step": 199700 + }, + { + "entropy": 1.8135719284415246, + "epoch": 0.6190835263674683, + "grad_norm": 4.39888334274292, + "learning_rate": 3.2152825460436675e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8683549582958221, + "num_tokens": 240287191.0, + "step": 199710 + }, + { + "entropy": 1.9001155897974968, + "epoch": 0.619114525492518, + "grad_norm": 7.417325019836426, + "learning_rate": 3.2152020498766874e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8583855494856835, + "num_tokens": 240298252.0, + "step": 199720 + }, + { + "entropy": 1.9175253897905349, + "epoch": 0.6191455246175677, + "grad_norm": 7.785533428192139, + "learning_rate": 3.215121559755187e-06, + "loss": 0.5209, + "mean_token_accuracy": 0.8480686709284783, + "num_tokens": 240309816.0, + "step": 199730 + }, + { + "entropy": 1.757785053551197, + "epoch": 0.6191765237426173, + "grad_norm": 3.814818859100342, + "learning_rate": 3.2150410756784088e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8653519213199615, + "num_tokens": 240322874.0, + "step": 199740 + }, + { + "entropy": 1.837675492465496, + "epoch": 0.6192075228676671, + "grad_norm": 6.441486358642578, + "learning_rate": 3.214960597645598e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8579330682754517, + "num_tokens": 240335672.0, + "step": 199750 + }, + { + "entropy": 1.9337011486291886, + "epoch": 0.6192385219927168, + "grad_norm": 8.365361213684082, + "learning_rate": 3.214880125655997e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8540092423558235, + "num_tokens": 240345914.0, + "step": 199760 + }, + { + "entropy": 1.8270794779062272, + "epoch": 0.6192695211177665, + "grad_norm": 7.633695125579834, + "learning_rate": 3.2147996597088495e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.860172837972641, + "num_tokens": 240357519.0, + "step": 199770 + }, + { + "entropy": 1.9075476467609405, + "epoch": 0.6193005202428161, + "grad_norm": 10.322786331176758, + "learning_rate": 3.2147191998034004e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8539448142051697, + "num_tokens": 240369329.0, + "step": 199780 + }, + { + "entropy": 1.915835802257061, + "epoch": 0.6193315193678658, + "grad_norm": 3.6770076751708984, + "learning_rate": 3.2146387459388915e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8576722934842109, + "num_tokens": 240380801.0, + "step": 199790 + }, + { + "entropy": 1.8189099460840226, + "epoch": 0.6193625184929156, + "grad_norm": 7.405015468597412, + "learning_rate": 3.2145582981145695e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8634320721030235, + "num_tokens": 240393144.0, + "step": 199800 + }, + { + "entropy": 1.860145990550518, + "epoch": 0.6193935176179652, + "grad_norm": 7.664626598358154, + "learning_rate": 3.214477856329677e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8565018594264984, + "num_tokens": 240404917.0, + "step": 199810 + }, + { + "entropy": 1.848432496190071, + "epoch": 0.6194245167430149, + "grad_norm": 8.589262962341309, + "learning_rate": 3.2143974205834592e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8619036614894867, + "num_tokens": 240416883.0, + "step": 199820 + }, + { + "entropy": 1.8772276252508164, + "epoch": 0.6194555158680646, + "grad_norm": 8.915504455566406, + "learning_rate": 3.2143169908751603e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8534212946891785, + "num_tokens": 240428587.0, + "step": 199830 + }, + { + "entropy": 1.86657772064209, + "epoch": 0.6194865149931144, + "grad_norm": 5.037930011749268, + "learning_rate": 3.214236567204025e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8429229214787484, + "num_tokens": 240441470.0, + "step": 199840 + }, + { + "entropy": 1.8196565955877304, + "epoch": 0.619517514118164, + "grad_norm": 7.531178951263428, + "learning_rate": 3.2141561495692973e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8649965927004815, + "num_tokens": 240453843.0, + "step": 199850 + }, + { + "entropy": 1.9064827859401703, + "epoch": 0.6195485132432137, + "grad_norm": 6.881914138793945, + "learning_rate": 3.214075737970223e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8601453080773354, + "num_tokens": 240465162.0, + "step": 199860 + }, + { + "entropy": 1.9310476690530778, + "epoch": 0.6195795123682634, + "grad_norm": 8.316636085510254, + "learning_rate": 3.2139953324060473e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8539797902107239, + "num_tokens": 240476257.0, + "step": 199870 + }, + { + "entropy": 1.723580890893936, + "epoch": 0.6196105114933131, + "grad_norm": 8.44727897644043, + "learning_rate": 3.213914932876015e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8854304000735282, + "num_tokens": 240490189.0, + "step": 199880 + }, + { + "entropy": 1.9352102607488633, + "epoch": 0.6196415106183628, + "grad_norm": 6.963158130645752, + "learning_rate": 3.2138345393793713e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8497881457209587, + "num_tokens": 240500887.0, + "step": 199890 + }, + { + "entropy": 1.7900185361504555, + "epoch": 0.6196725097434125, + "grad_norm": 3.517059326171875, + "learning_rate": 3.2137541519153613e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8654786735773087, + "num_tokens": 240513549.0, + "step": 199900 + }, + { + "entropy": 1.8556710347533225, + "epoch": 0.6197035088684621, + "grad_norm": 8.985323905944824, + "learning_rate": 3.213673770483231e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8574738964438439, + "num_tokens": 240525617.0, + "step": 199910 + }, + { + "entropy": 1.8572029948234559, + "epoch": 0.6197345079935119, + "grad_norm": 8.606582641601562, + "learning_rate": 3.213593395082226e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8530072510242462, + "num_tokens": 240537926.0, + "step": 199920 + }, + { + "entropy": 1.823797370493412, + "epoch": 0.6197655071185616, + "grad_norm": 9.115023612976074, + "learning_rate": 3.2135130257115936e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8578477054834366, + "num_tokens": 240550835.0, + "step": 199930 + }, + { + "entropy": 1.853266417235136, + "epoch": 0.6197965062436113, + "grad_norm": 7.527276992797852, + "learning_rate": 3.2134326623705775e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8626913323998451, + "num_tokens": 240563639.0, + "step": 199940 + }, + { + "entropy": 1.882079529762268, + "epoch": 0.6198275053686609, + "grad_norm": 7.617688179016113, + "learning_rate": 3.213352305058424e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8646379739046097, + "num_tokens": 240575040.0, + "step": 199950 + }, + { + "entropy": 1.9121492326259613, + "epoch": 0.6198585044937107, + "grad_norm": 7.7319817543029785, + "learning_rate": 3.2132719537743813e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8699195802211761, + "num_tokens": 240585966.0, + "step": 199960 + }, + { + "entropy": 1.820683164894581, + "epoch": 0.6198895036187604, + "grad_norm": 8.092674255371094, + "learning_rate": 3.213191608517694e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8665958195924759, + "num_tokens": 240599118.0, + "step": 199970 + }, + { + "entropy": 1.9012608036398888, + "epoch": 0.61992050274381, + "grad_norm": 8.381446838378906, + "learning_rate": 3.2131112692876087e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8655321434140205, + "num_tokens": 240611152.0, + "step": 199980 + }, + { + "entropy": 1.917787343263626, + "epoch": 0.6199515018688597, + "grad_norm": 9.232633590698242, + "learning_rate": 3.213030936083373e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8561913996934891, + "num_tokens": 240622027.0, + "step": 199990 + }, + { + "entropy": 1.8914671823382379, + "epoch": 0.6199825009939095, + "grad_norm": 8.929455757141113, + "learning_rate": 3.212950608904232e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8469156593084335, + "num_tokens": 240634070.0, + "step": 200000 + } + ], + "logging_steps": 10, + "max_steps": 258072, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.909176166603817e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}